In [4]:
import pandas as pd

In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE: `google.colab` is only importable when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Switch the working directory to the project folder on the mounted Drive so the
# relative paths used below (e.g. 'data.txt') resolve, then list its contents.
%cd /content/drive/MyDrive/emotion_detection
!ls

/content/drive/MyDrive/emotion_detection
data.txt  val.txt


In [7]:
# data.txt has no header row; each line is parsed as "<comment>;<emotion>".
df = pd.read_csv(
    "data.txt",
    sep=";",
    header=None,
    names=["Comment", "Emotion"],
)

In [8]:
# Class distribution of the raw emotion labels.
df["Emotion"].value_counts()

joy         6057
sadness     5247
anger       2434
fear        2161
love        1463
surprise     638
Name: Emotion, dtype: int64

In [9]:
# Encode each emotion label as an integer class id (stored in "Emotion_num").
df["Emotion_num"] = df["Emotion"].map(
    {
        "joy": 0,
        "sadness": 1,
        "anger": 2,
        "fear": 3,
        "love": 4,
        "surprise": 5,
    }
)

## 1- Without Data balancing + without data preprocessing

In [75]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the raw comments for testing, stratified on the class id,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Comment"], df["Emotion_num"],
    stratify=df["Emotion_num"], test_size=0.2, random_state=2022,
)

In [76]:
# Per-class row counts in the training split (index values are the Emotion_num codes).
y_train.value_counts()

0    4846
1    4198
2    1947
3    1729
4    1170
5     510
Name: Emotion_num, dtype: int64

### 1.1- Bag of n-grams (with only trigrams)

#### ----> Random Forest

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# 1. Build the pipeline: trigram-only bag of n-grams -> random forest.
#    The forest is seeded so that re-running the cell reproduces the same
#    report (an unseeded RandomForestClassifier gives different scores each run).
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range=(3, 3))),  # trigrams only
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.64      0.55      1211
           1       0.58      0.35      0.44      1049
           2       0.47      0.24      0.32       487
           3       0.21      0.50      0.30       432
           4       0.49      0.08      0.14       293
           5       0.67      0.12      0.21       128

    accuracy                           0.42      3600
   macro avg       0.48      0.32      0.33      3600
weighted avg       0.48      0.42      0.41      3600



### 1.2- Bag of n-grams (with unigram and bigrams)

#### ----> Multinomial Naive Bayes

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# Step 1: unigram + bigram counts feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ("vectorizer_n_grams", CountVectorizer(ngram_range=(1, 2))),
    ("multinomial_naive_bayes", MultinomialNB()),
])

# Steps 2-3: train on the training split (fit returns the pipeline itself),
# then predict the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Step 4: per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.98      0.75      1211
           1       0.69      0.90      0.78      1049
           2       0.94      0.30      0.46       487
           3       0.84      0.26      0.39       432
           4       0.92      0.08      0.15       293
           5       1.00      0.02      0.03       128

    accuracy                           0.67      3600
   macro avg       0.84      0.42      0.43      3600
weighted avg       0.75      0.67      0.60      3600



#### ----> Random Forest

In [79]:
# 1. Build the pipeline: unigram + bigram counts -> random forest.
#    Seed the forest so the reported scores are reproducible across runs.
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range=(1, 2))),
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.95      0.85      1211
           1       0.91      0.89      0.90      1049
           2       0.92      0.78      0.84       487
           3       0.86      0.74      0.79       432
           4       0.88      0.61      0.72       293
           5       0.82      0.61      0.70       128

    accuracy                           0.84      3600
   macro avg       0.86      0.76      0.80      3600
weighted avg       0.85      0.84      0.84      3600



### 1.3- TF-IDF vectorizer

#### ----> Random Forest

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# 1. Build the pipeline: TF-IDF features -> random forest.
#    Seed the forest so the reported scores are reproducible across runs.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87      1211
           1       0.91      0.89      0.90      1049
           2       0.90      0.82      0.86       487
           3       0.81      0.80      0.80       432
           4       0.84      0.65      0.73       293
           5       0.80      0.62      0.70       128

    accuracy                           0.86      3600
   macro avg       0.85      0.79      0.81      3600
weighted avg       0.86      0.86      0.85      3600



## 2- Without Data balancing + with data preprocessing

In [3]:
import spacy

# Shared spaCy English pipeline, loaded once and reused by preprocess().
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Return `text` as space-joined lemmas, skipping stop words and punctuation."""
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [11]:
# Use bracket assignment so a real "preprocessed_comment" column is created.
# `df.preprocessed_comment = ...` only sets a plain attribute on the DataFrame
# object (pandas warns about it): the data never appears in df.columns and is
# lost on any copy/slice of df. Attribute-style reads (df.preprocessed_comment)
# keep working once the column exists.
df["preprocessed_comment"] = df.Comment.apply(preprocess)

In [12]:
from sklearn.model_selection import train_test_split
# Same 80/20 stratified split as before, but on the preprocessed text.
X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_comment, df.Emotion_num,
    stratify=df.Emotion_num, test_size=0.2, random_state=2022,
)

## ---> Let's check the scores with our best model till now

### 2.1-RandomForest (with unigram and bigrams)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
# 1. Build the pipeline: unigram + bigram counts -> random forest.
#    Seed the forest so the reported scores are reproducible across runs.
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range=(1, 2))),
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the (preprocessed) training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      1211
           1       0.89      0.93      0.91      1049
           2       0.88      0.87      0.87       487
           3       0.86      0.83      0.84       432
           4       0.77      0.73      0.75       293
           5       0.84      0.66      0.74       128

    accuracy                           0.87      3600
   macro avg       0.85      0.82      0.83      3600
weighted avg       0.87      0.87      0.87      3600



### 2.2-RandomForest (with TF-IDF Vectorizer)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# 1. Build the pipeline: TF-IDF features -> random forest.
#    Seed the forest so the reported scores are reproducible across runs.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the (preprocessed) training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1211
           1       0.90      0.86      0.88      1049
           2       0.87      0.85      0.86       487
           3       0.80      0.82      0.81       432
           4       0.78      0.66      0.71       293
           5       0.77      0.62      0.68       128

    accuracy                           0.84      3600
   macro avg       0.82      0.78      0.80      3600
weighted avg       0.84      0.84      0.84      3600



## 3- With Data balancing + with data preprocessing

In [18]:
from sklearn.model_selection import train_test_split
# Re-create the 80/20 stratified split of the preprocessed text (same inputs
# and seed as the split used in section 2).
X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_comment, df.Emotion_num,
    stratify=df.Emotion_num, test_size=0.2, random_state=2022,
)

## ---> Let's check the scores with our best model till now

### 3.1-RandomForest (with unigram and bigrams)

In [19]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed; plugged into the imblearn pipelines below as the 'smote' step.
smt = SMOTE(random_state=42)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Build the imblearn pipeline: unigram + bigram counts -> SMOTE -> random forest.
#    The sampler sits after the vectorizer because it needs numeric features.
#    Seed the forest so the reported scores are reproducible across runs.
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range=(1, 2))),
    ('smote', smt),
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.76      0.83      1211
           1       0.90      0.79      0.84      1049
           2       0.69      0.81      0.74       487
           3       0.74      0.75      0.74       432
           4       0.52      0.81      0.64       293
           5       0.53      0.89      0.67       128

    accuracy                           0.78      3600
   macro avg       0.72      0.80      0.74      3600
weighted avg       0.82      0.78      0.79      3600



In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Build the imblearn pipeline: TF-IDF features -> SMOTE -> random forest.
#    Seed the forest so the reported scores are reproducible across runs.
clf = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('smote', smt),
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1211
           1       0.91      0.81      0.85      1049
           2       0.84      0.85      0.85       487
           3       0.78      0.80      0.79       432
           4       0.66      0.74      0.70       293
           5       0.69      0.65      0.67       128

    accuracy                           0.82      3600
   macro avg       0.78      0.79      0.78      3600
weighted avg       0.83      0.82      0.82      3600



## -----> Best Model = Without Balancing + With preprocessing + RandomForest(Bag of N-grams(1,2))

In [None]:
import pandas as pd
# Reload the raw data (no header row, ';'-separated) and rebuild the integer label column.
df = pd.read_csv(
    "data.txt",
    sep=";",
    header=None,
    names=["Comment", "Emotion"],
)
df["Emotion_num"] = df["Emotion"].map(
    dict(joy=0, sadness=1, anger=2, fear=3, love=4, surprise=5)
)

In [None]:
import spacy

# English spaCy model used by preprocess() below.
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Lemmatize `text` and drop stop words and punctuation.

    Returns the remaining lemmas joined by single spaces.
    """
    return " ".join(
        tok.lemma_
        for tok in nlp(text)
        if not tok.is_stop and not tok.is_punct
    )

In [None]:
# Bracket assignment creates an actual "preprocessed_comment" column; assigning
# through `df.preprocessed_comment = ...` would only attach a plain attribute to
# the DataFrame object (pandas warns) and leave df.columns unchanged.
df["preprocessed_comment"] = df.Comment.apply(preprocess)

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the preprocessed text, stratified on the class id, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_comment, df.Emotion_num,
    stratify=df.Emotion_num, test_size=0.2, random_state=2022,
)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
# 1. Build the final pipeline: unigram + bigram counts -> random forest.
#    Seed the forest so the model that gets exported afterwards (and its
#    report) can be reproduced exactly by re-running the notebook.
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range=(1, 2))),
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

# 2. Fit on the training split.
clf.fit(X_train, y_train)

# 3. Predict the held-out split and keep the result in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1211
           1       0.89      0.93      0.91      1049
           2       0.88      0.88      0.88       487
           3       0.86      0.83      0.84       432
           4       0.78      0.74      0.76       293
           5       0.84      0.66      0.74       128

    accuracy                           0.88      3600
   macro avg       0.86      0.82      0.84      3600
weighted avg       0.87      0.88      0.87      3600



In [23]:
# Export the fitted pipeline (CountVectorizer + RandomForest, not Naive Bayes)
# so it can be loaded later for prediction. The file is written to the current
# working directory without an extension.
import joblib
joblib.dump(clf, 'classifier_emotions_model')

['classifier_emotions_model']