# Analyzing the best model

## Imports

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LogisticRegression

# Model Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## Useful Functions

In [14]:
def evaluate(y_test, y_pred):
    # confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # accuracy, precision, recall, f1
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1:", f1_score(y_test, y_pred, average='weighted'))

In [15]:
def show_cm(cm, classes, figsize=(10, 10)):
    # cm = np.array([[TP, FP], [FN, TN]])
    plt.figure(figsize=figsize)
    plt.imshow(cm)
    plt.suptitle('Confusion matrix')
    total = sum(sum(cm))
    plt.title('Total cases: {}'.format(total))
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    for i in range(len(classes)):
        for j in range(len(classes)):
            perc = round(cm[i, j] / total * 100, 1)
            plt.text(j, i, f"{format(cm[i, j], '.0f')} : {perc}%", horizontalalignment="center",
                     color="black" if cm[i, j] > cm.max() / 2 else "white")

    plt.show()

## Loading data

In [16]:
df = pd.read_pickle('data/reps/2_bow.pkl')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,text,emotions,rep
0,feel irrit kinda hate feel,anger,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,id rather home feel violent lone im not_tri so...,anger,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,suggest wait discuss feel less resent,anger,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,wrong feel royal piss,anger,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,im tierd talk like there hope hell care unders...,anger,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
df_features = pd.DataFrame(df['rep'].values.tolist())
df = pd.concat([df_features, df['emotions'], df["text"]], axis=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1492,1493,1494,1495,1496,1497,1498,1499,emotions,text
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,anger,feel irrit kinda hate feel
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,anger,id rather home feel violent lone im not_tri so...
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,anger,suggest wait discuss feel less resent
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,anger,wrong feel royal piss
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,anger,im tierd talk like there hope hell care unders...


In [18]:
print(df['emotions'].value_counts())

emotions
joy         14107
sadness     12119
anger        5732
fear         4771
love         3455
surprise     1497
Name: count, dtype: int64


## Running the best model / representation Combination

In [19]:
model =  LogisticRegression(penalty='l2')

# Step 1: Separate 'text' column from features
features = df.drop(['text', 'emotions'], axis=1)  # Assuming 'target_column' is your target variable
text_column = df['text']

# Step 2: Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, df['emotions'], test_size=0.2, random_state=42)

# Step 3: Train your model
model.fit(x_train, y_train)

# Step 4: Make predictions
y_pred = model.predict(x_test)

# Step 5: Create a DataFrame containing 'x_test', 'y_test', 'y_pred', and 'text' column
df_results = pd.DataFrame({
    'x_test': x_test.index,
    'y_test': y_test,
    'y_pred': y_pred
})

# Adding 'text' column from original DataFrame based on index
df_results['text'] = text_column[df_results['x_test']].values


df_results.head()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,x_test,y_test,y_pred,text
31271,31271,sadness,sadness,ive recov flu side feel complet listless
35357,35357,sadness,sadness,id drag work feel like punish littl fun rest day
23030,23030,joy,joy,want feel like make posit impact societi found...
12570,12570,joy,joy,want feel intellig abl make decis
10709,10709,joy,joy,still feel like talent defens


In [20]:
# Drop index and x_test
df_results = df_results.drop(['x_test'], axis=1)

# Print only rows where 'y_test' and 'y_pred' are different
df_results = df_results[df_results['y_test'] != df_results['y_pred']]

df_results.shape

Unnamed: 0,y_test,y_pred,text
41164,surprise,sadness,feel like littl kid amaz anim alon
28217,sadness,anger,need feel feel boy hate appar want kiss badli
14844,joy,fear,feel pressur although im not_sur thinner tone ...
30973,sadness,joy,find feel low find realli difficult accept god...
23229,joy,sadness,build ambiti compet cautiou cun aquariu know f...
15706,joy,love,almost back track beauti day feel bless
38600,sadness,fear,rememb chaotic confus frighten day sens alway ...
27346,love,sadness,curmudgeonli crank beguil peopl think im forgi...
3792,anger,joy,share small flat friend birthday could not_se ...
4241,anger,sadness,know day die either feel cold hand death run e...


In [21]:
df_results.head(20)

(1214, 3)