# Analyzing the best model

## Imports

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Model Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## Useful Functions

In [2]:
def evaluate(y_test, y_pred):
    # confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # accuracy, precision, recall, f1
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1:", f1_score(y_test, y_pred, average='weighted'))

In [3]:
def show_cm(cm, classes, figsize=(10, 10)):
    # cm = np.array([[TP, FP], [FN, TN]])
    plt.figure(figsize=figsize)
    plt.imshow(cm)
    plt.suptitle('Confusion matrix')
    total = sum(sum(cm))
    plt.title('Total cases: {}'.format(total))
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    for i in range(len(classes)):
        for j in range(len(classes)):
            perc = round(cm[i, j] / total * 100, 1)
            plt.text(j, i, f"{format(cm[i, j], '.0f')} : {perc}%", horizontalalignment="center",
                     color="black" if cm[i, j] > cm.max() / 2 else "white")

    plt.show()

## Loading data

In [4]:
df = pd.read_pickle('data/data_processed.pkl')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,text,emotions
0,feel aw job get posit succeed not_happen,sadness
1,im alon feel aw,sadness
2,ive probabl mention realli feel proud actual k...,joy
3,feel littl low day back,sadness
4,beleiv much sensit peopl feel tend compassion,love


## Creating BoW Representation that keeps the Text Column

In [5]:
def model_bow(corpus, max_features = 1500):
    vectorizer = CountVectorizer(max_features = max_features)
    x = vectorizer.fit_transform(corpus).toarray()
    return x

x = model_bow(df['text'])

# Replace the text column with the new BOW representation
df_bow = df.copy()
df_bow['rep'] = x.tolist()
df = df_bow
df.head()

Unnamed: 0,text,emotions,rep
0,feel aw job get posit succeed not_happen,sadness,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,im alon feel aw,sadness,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,ive probabl mention realli feel proud actual k...,joy,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,feel littl low day back,sadness,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,beleiv much sensit peopl feel tend compassion,love,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
df_features = pd.DataFrame(df['rep'].values.tolist())
df = pd.concat([df_features, df['emotions'], df["text"]], axis=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1492,1493,1494,1495,1496,1497,1498,1499,emotions,text
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,sadness,feel aw job get posit succeed not_happen
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,sadness,im alon feel aw
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,joy,ive probabl mention realli feel proud actual k...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,sadness,feel littl low day back
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,love,beleiv much sensit peopl feel tend compassion


In [7]:
print(df['emotions'].value_counts())

emotions
joy         141067
sadness     121187
anger        57317
fear         47712
love         34554
surprise     14972
Name: count, dtype: int64


## Running the best model / representation Combination

In [8]:
model =  LogisticRegression(penalty='l2')

# Step 1: Separate 'text' column from features
features = df.drop(['text', 'emotions'], axis=1)  # Assuming 'target_column' is your target variable
text_column = df['text']

# Step 2: Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, df['emotions'], test_size=0.2, random_state=42)

# Step 3: Train your model
model.fit(x_train, y_train)

# Step 4: Make predictions
y_pred = model.predict(x_test)

# Step 5: Create a DataFrame containing 'x_test', 'y_test', 'y_pred', and 'text' column
df_results = pd.DataFrame({
    'x_test': x_test.index,
    'y_test': y_test,
    'y_pred': y_pred
})

# Adding 'text' column from original DataFrame based on index
df_results['text'] = text_column[df_results['x_test']].values


df_results.head()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,x_test,y_test,y_pred,text
36130,36130,fear,fear,feel realli helpless heavi heart
138065,138065,sadness,sadness,ive enjoy abl slouch relax unwind frankli need...
146440,146440,fear,fear,gave internship dmrg feel distraught
103337,103337,sadness,sadness,know feel lost
315528,315528,fear,sadness,kindergarten teacher thoroughli weari job take...


In [9]:
# Drop index and x_test
df_results = df_results.drop(['x_test'], axis=1)

# Print only rows where 'y_test' and 'y_pred' are different
df_results = df_results[df_results['y_test'] != df_results['y_pred']]

df_results.shape

(11203, 3)

In [17]:
# print the first 100 results and the full text
for index, row in df_results.head(100).iterrows():
    print(row['y_test'], row['y_pred'])
    print(row['text'])
    print()

fear sadness
kindergarten teacher thoroughli weari job taken univers entranc exam suffer anxieti week not_want carri work studi altern

surprise fear
im forev take time lie feel weird

joy anger
tri nice though get bitchi person phone window feel free littl fit throw pen face

anger joy


love joy
less intellig could not_real feel music not_passion littl thing im not_sur would go

sadness joy
found jason observ help restat truth learn year ago wife togeth year went time love someon trust feel aw lot like let go

anger sadness
emot never intens one

anger sadness
feel anger see parent beat punish child street recent with similar case

love joy
would want thank let heart feel love phenomenon first time

joy love
feel extrem gener last night companion ate doubl told lion stomach pipe

fear joy
not_feel threaten fact place seem almost welcom comfort even

sadness fear
feel devast someon skeptic along

anger sadness
put mask come work suppress empti feel insid pain loneli bitter jade woman 