# Model Training and Result Analysis

## Imports and Data Loading

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

# Model Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [4]:
# Load a pickled DataFrame from a project-relative path.
# NOTE(review): `df` is rebound to 'data/reps/1_plain_text.pkl' in the
# "Model Training" section before any visible use of this frame, so this load
# looks redundant -- confirm nothing else depends on it before removing.
# Unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('data/data_processed.pkl')

## Defining useful functions

In [5]:
def split(x, y, test_size = 0.2, log=False, random_state=0):
    """Split features and labels into stratified train and test partitions.

    Parameters
    ----------
    x : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target labels. Also used as the stratification key, so both
        partitions keep the label proportions of `y`.
    test_size : float or int, default 0.2
        Forwarded to `train_test_split`.
    log : bool, default False
        When True, print the partition shapes and per-label counts.
    random_state : int, default 0
        Seed forwarded to `train_test_split`; the default reproduces the
        split this helper has always produced.

    Returns
    -------
    tuple
        `(X_train, X_test, y_train, y_test)` as returned by `train_test_split`.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, stratify=y
    )

    if log:
        # np.shape / pd.Series keep the report working when the inputs are
        # plain lists or ndarrays, which lack `.shape` / `.value_counts()`.
        print("Train Shape:")
        print(np.shape(X_train), np.shape(y_train))
        print("Test Shape:")
        print(np.shape(X_test), np.shape(y_test))

        print("\nLabel distribution in the training set:")
        print(pd.Series(y_train).value_counts())
        print("\nLabel distribution in the test set:")
        print(pd.Series(y_test).value_counts())

    return X_train, X_test, y_train, y_test

In [6]:
def evaluate(y_test, y_pred, average=None):
    """Print the confusion matrix and accuracy, precision, recall and F1.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with `y_test`.
    average : str or None, default None
        Averaging mode forwarded to `precision_score`, `recall_score` and
        `f1_score`. When None it is chosen automatically: 'binary' if at most
        two distinct labels occur across `y_test` and `y_pred` (the previous
        behaviour), otherwise 'macro', matching the `*_macro` scorers that
        `train` requests for cross-validation.

    Returns
    -------
    None
        All results are printed.
    """
    if average is None:
        # scikit-learn's default average='binary' raises ValueError on
        # multiclass targets, so only keep it for (at most) two labels.
        seen_labels = set(np.asarray(y_test).ravel().tolist())
        seen_labels |= set(np.asarray(y_pred).ravel().tolist())
        average = 'binary' if len(seen_labels) <= 2 else 'macro'

    # confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # accuracy, precision, recall, f1
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average=average))
    print("Recall:", recall_score(y_test, y_pred, average=average))
    print("F1:", f1_score(y_test, y_pred, average=average))

In [7]:
def train(model, x, y, split_size = 0.2, cross_count = 0):
    """Fit `model` and print its evaluation, via hold-out or cross-validation.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator (`fit` / `predict`).
    x : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target labels.
    split_size : float, default 0.2
        Test fraction for the hold-out split. Only used when
        `cross_count == 0`.
    cross_count : int, default 0
        0 selects a single stratified hold-out split evaluated with
        `evaluate`. Any other value is forwarded to `cross_validate` as the
        `cv` argument (number of folds).

    Returns
    -------
    None
        Metrics are printed.
    """
    if cross_count == 0:
        X_train, X_test, y_train, y_test = split(x, y, test_size = split_size)

        # Train on the hold-out training part, score on the test part.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        evaluate(y_test, y_pred)
    else:
        # Pass the requested fold count through; previously it was ignored and
        # cross_validate always fell back to its own default number of folds.
        scores = cross_validate(
            model,
            x,
            y,
            cv=cross_count,
            scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
            return_train_score=True,
        )
        print(scores)

In [8]:
def show_cm(cm, classes=None):
    """Plot a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion-matrix counts. Rows are drawn on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence of str, optional
        Tick labels, one per row/column of `cm`. When omitted, a 2x2 matrix
        keeps the labels this helper has always used
        (['Playoff', 'Eliminated']); any other size is labelled with the
        class indices '0', '1', ...

    Raises
    ------
    ValueError
        If `cm` is not two-dimensional or `classes` does not have one label
        per row of `cm`.
    """
    cm = np.asarray(cm)
    if cm.ndim != 2:
        raise ValueError("cm must be a 2-D array, got shape {}".format(cm.shape))

    n_rows, n_cols = cm.shape
    if classes is None:
        classes = ['Playoff', 'Eliminated'] if n_rows == 2 else [str(k) for k in range(n_rows)]
    elif len(classes) != n_rows:
        raise ValueError(
            "expected {} class labels, got {}".format(n_rows, len(classes))
        )

    plt.matshow(cm)
    plt.suptitle('Confusion matrix')
    total = cm.sum()
    plt.title('Total cases: {}'.format(total))
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get black text, the rest white text.
    half_max = cm.max() / 2 if cm.size else 0
    for i in range(n_rows):
        for j in range(n_cols):
            # Guard the share computation so an all-zero matrix does not divide by zero.
            perc = round(cm[i, j] / total * 100, 1) if total else 0.0
            plt.text(j, i, f"{format(cm[i, j], '.0f')} : {perc}%", horizontalalignment="center",
                     color="black" if cm[i, j] > half_max else "white")

    plt.show()

## Defining the models to be trained

In [9]:
# Zero-argument factories rather than estimator instances: calling an entry
# builds a fresh, unfitted model, so no fitted state is shared between runs.
models = [
    lambda: MultinomialNB(), # Naive Bayes
    lambda: DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=42),
    lambda: GaussianNB(),
    lambda: KNeighborsClassifier(n_neighbors=15, weights='uniform'),
    lambda: GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42),
    lambda: RandomForestClassifier(n_estimators=100, random_state=42),
    lambda: MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42), # Neural Network
    # Seeded like the other stochastic models: probability=True shuffles data internally.
    lambda: SVC(C=0.1, kernel='linear', probability=True, random_state=42), # Support Vector Machine
    lambda: LogisticRegression(penalty='l2'), # L2-regularised linear model
    lambda: LinearSVC(penalty='l2', random_state=42), # L2-regularised linear model

    # Linear model trained with stochastic gradient descent on the logistic loss.
    # 'log_loss' is the current name of that loss; the old alias 'log' was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    lambda: SGDClassifier(loss='log_loss', penalty='l2', alpha=0.001, max_iter=100, random_state=42)
]

# Model Training

In [10]:
# Load the pickled frame used for training (presumably the plain-text
# representation, going by the file name) and renumber its rows from 0,
# discarding the stored index.
df = pd.read_pickle('data/reps/1_plain_text.pkl').reset_index(drop=True)

# Preview the first rows.
df.head()

Unnamed: 0,text,emotions
0,feel irrit kinda hate feel,anger
1,id rather home feel violent lone im not_tri so...,anger
2,suggest wait discuss feel less resent,anger
3,wrong feel royal piss,anger
4,im tierd talk like there hope hell care unders...,anger


In [12]:
# Disabled experiment: the code below is wrapped in a bare string literal, so
# none of it runs. Because the string is the cell's last expression, the
# notebook echoes it back as the cell output.
# NOTE(review): the "Initialize SVM model" comment inside the string is stale;
# the estimator it constructs is an MLPClassifier.
""" x = df.drop('emotions', axis=1).values
y = df['emotions']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Initialize SVM model
model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
evaluate(y_test, y_pred)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Show confusion matrix
show_cm(cm) """

" x = df.drop('emotions', axis=1).values\ny = df['emotions']\n\n# Split data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)\n\n# Initialize SVM model\nmodel = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)\n\n# Train the model\nmodel.fit(X_train, y_train)\n\n# Make predictions\ny_pred = model.predict(X_test)\n\n# Evaluate the model\nevaluate(y_test, y_pred)\n\n# Compute confusion matrix\ncm = confusion_matrix(y_test, y_pred)\n\n# Show confusion matrix\nshow_cm(cm) "

In [None]:
# Disabled training loop: the code is held in a bare string literal and is not
# executed. As written it would build each estimator from its factory in
# `models` and call `train` with cross_count=5, printing (not re-raising) any
# exception so one failing model does not stop the others.
# NOTE(review): `print(model)` would show the lambda object, not the estimator
# it creates -- confirm that is the intended log line before re-enabling.
""" # train all models 
for model in models:
    print(model)
    try:
        train(model(), df.drop('emotions', axis=1), df['emotions'], cross_count=5)
    except Exception as e:
        print(e)
    print("\n ---------------- \n") """