## 1. Loading the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

# Load the table in a dataframe for further data analysis
comments_df = pd.read_csv('spam_or_ham_and_polarity.csv')

# Check if the dataframe contains any empty cells. 
# If all columns have 0 empty cells than the dataframe is complete.
print(comments_df.isna().sum())

# Count the number of comments that are spam and ham
comments_df['Spam / Ham'].value_counts()

Nr.                   0
Comment in English    0
Spam / Ham            0
Polarity              0
Likes                 0
Replies               0
dtype: int64


spam    3636
ham      764
Name: Spam / Ham, dtype: int64

## 2. Creating a balanced data set

In [2]:
# All comments that are labeled as ham
ham = comments_df[comments_df['Spam / Ham'] == 'ham']

# All comments that are labeled as spam
spam = comments_df[comments_df['Spam / Ham'] == 'spam']

# Use only as many spam comments as there are ham comments to have a balanced dataset
spam = spam.sample(ham.shape[0])

# Create a new balanced dataset
balanced_comments_df = spam.append(ham, ignore_index = True)
balanced_comments_df

Unnamed: 0,Nr.,Comment in English,Spam / Ham,Polarity,Likes,Replies
0,3443.0,full stupidity,spam,negative,0,0
1,3777.0,lol 6.5 earthquake and ur dead af,spam,negative,0,0
2,2534.0,earthquake Mathafakka!!!!,spam,neutral,0,0
3,1589.0,They spent 3 years making a fucking sewer line...,spam,negative,0,0
4,2731.0,what's Elon smoking?,spam,neutral,0,0
...,...,...,...,...,...,...
1523,4457.0,Why not just hyperloop it?,ham,neutral,0,0
1524,4458.0,I don't think those platforms would be necessa...,ham,neutral,0,0
1525,4459.0,Then you'll create more traffic above with peo...,ham,negative,23,11
1526,4464.0,That looks extremely expensive and I understan...,ham,negative,285,17


## 3. Perfoming 10-fold cross validation

In [3]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, fbeta_score

# Get the comments of the balanced data set
X = balanced_comments_df['Comment in English'].astype('U').values

# Get the labels of the comments of the balanced data set
y = balanced_comments_df['Spam / Ham'].values

# Apply tf-idf to the comments
v = TfidfVectorizer()
X_vec = v.fit_transform(X)

# Define 10fold crossvalidation
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Function to create the reporting for the single algorithms consisting of the average precision, recall, f1, and accuracy
def reporting(name, model):       
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    fbs = []
    
    # Perform the 10folds
    for train_index, val_index in cv.split(X_vec):
        # Train the model
        model.fit(X_vec[train_index].toarray(), y[train_index])
        # Predict the labels of the test data
        pred = model.predict(X_vec[val_index].toarray())
        
        # Get the report for the currnt fold
        report = classification_report(y[val_index], pred, output_dict=True)
        fb = fbeta_score(y[val_index], pred, average='macro', beta=5.76)
        
        # Add the single measures of the current fold to the array for calculating the averages 
        accuracies.append(report['accuracy'])
        macro_avg = report['macro avg']
        precisions.append(macro_avg['precision']) 
        recalls.append(macro_avg['recall'])
        f1s.append(macro_avg['f1-score'])
        fbs.append(fb)
        
        # If needed, the confusion matrix of the fold can be visualized
        #confusion_matrix_rf_balanced = confusion_matrix(y[val_index], pred)
        # Plot the confusion matrix for Voting Classifier
        #display_labels = ['Ham', 'Spam']

        #disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_rf_balanced, display_labels=display_labels)
        #disp = disp.plot(cmap = 'viridis')
        #plt.grid(False)
        #plt.rcParams.update({'font.size': 15})
        #plt.tight_layout()
    
    # Print the average values of the metrics for current model
    print('Model:', name)
    print('Accuracy:', sum(accuracies) / len(accuracies))
    print('F1Score:', sum(f1s) / len(f1s))
    print('FbScore:', sum(fbs) / len(fbs))
    print('Recall:', sum(recalls) / len(recalls))
    print('Precision:', sum(precisions) / len(precisions))
    #print()
    #print(sum(accuracies) / len(accuracies))
    #print(sum(f1s) / len(f1s))
    #print(sum(fbs) / len(fbs))
    #print(sum(recalls) / len(recalls))
    #print(sum(precisions) / len(precisions))
    
# Create the models we want to test
model1 = LogisticRegression()
model2 = SVC()
model3 = GaussianNB()
model4 = RandomForestClassifier()

# Create the reportings
reporting('LR', model1)
reporting('SVM', model2)
reporting('RF', model4)
reporting('NB', model3)

Model: LR
Accuracy: 0.8036807705538355
F1Score: 0.8032899834254117
FbScore: 0.8034531340969376
Recall: 0.8035068345373428
Precision: 0.8045738858680543
Model: SVM
Accuracy: 0.8095717234262125
F1Score: 0.8092220332136183
FbScore: 0.8096090340899125
Recall: 0.8096946318550323
Precision: 0.8108691030426947
Model: RF
Accuracy: 0.7866744066047471
F1Score: 0.7858696822604828
FbScore: 0.7865057301633993
Recall: 0.786670561275721
Precision: 0.7894280923114274
Model: NB
Accuracy: 0.6793042655658755
F1Score: 0.6777648325389719
FbScore: 0.6782369667331818
Recall: 0.6783745537866642
Precision: 0.6810001153417955
