In [48]:
# Import libraries
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
import joblib
import pandas as pd
import nltk
import sys
from time import sleep
from IPython.display import clear_output
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Load json file
df = pd.read_csv('emotion_dataset_raw.csv', encoding = 'ISO-8859-1')

# Keep only relevant columns
df = df[['Emotion', 'Text']]

# Factorize category_id
df['emotion_id'], categories = pd.factorize(df['Emotion'])
df['emotion_id'] = df['emotion_id'] + 1

# Dropping all rows containing NaN
df = df.dropna()

In [12]:
# Define a function to remove stop words and stem words
def preprocess(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenize text into words
    words = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = nltk.corpus.stopwords.words('english')
    words = [w for w in words if w not in stop_words]
    # Stem words using Porter stemmer
    stemmer = nltk.stem.PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    # Return the processed text as a string
    return ' '.join(words)

# Apply the preprocess function to the headline column
df['Text'] = df['Text'].apply(preprocess)

In [13]:
# Oversample the minority class to address class imbalance
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df[['Text']], df['emotion_id'])

# Get the category we need for testing
X = X_resampled['Text']
y = y_resampled

# Create test/train split
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size = 0.30, random_state = 90)
print(X_train.shape)
print(X_test.shape)

(61852,)
(26508,)


In [43]:
# Initiate the model
lr = Pipeline([('cv', TfidfVectorizer()),
               ('clf', LogisticRegression(max_iter = 100000)),
              ])

# Train the logistic regression model on the training set
lr.fit(X_train,y_train)

In [49]:
# Initiate the model with Random Forest
rf = Pipeline([('cv', TfidfVectorizer()),
               ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
              ])

# Train the random forest model on the training set
rf.fit(X_train, y_train)

In [44]:
# Make predictions on the test set
y_pred = lr.predict(X_test)

# Calculate the accuracy of the model
print(f"Accuracy is: {accuracy_score(y_pred,y_test)}")

Accuracy is: 0.7978346159649917


In [50]:
# Make predictions on the test set
y_pred = rf.predict(X_test)

# Calculate the accuracy of the model
print(f"Accuracy is: {accuracy_score(y_pred,y_test)}")

Accuracy is: 0.9014637090689603


In [45]:
# Save the model
joblib.dump(lr, 'lr_model.joblib')

['lr_model.joblib']

In [51]:
# Save the model
joblib.dump(rf, 'rf_model.joblib')

['rf_model.joblib']

In [54]:
# Test model on random texts
news = ["I am so sad", "bloody hell"]

predicted = lr.predict(news)

for i in range(len(news)):
    print(f"{categories[predicted[i]-1]} : {news[i]}")

sadness : I am so sad
anger : bloody hell
