## 1. Loading the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

# Load the table in a dataframe for further data analysis
comments_df = pd.read_csv('spam_or_ham_and_polarity.csv')

# Check if the dataframe contains any empty cells. 
# If all columns have 0 empty cells than the dataframe is complete.
print(comments_df.isna().sum())

# Count the number of comments that are spam and ham
comments_df['Spam / Ham'].value_counts()

Nr.                   0
Comment in English    0
Spam / Ham            0
Polarity              0
Likes                 0
Replies               0
dtype: int64


spam    3636
ham      764
Name: Spam / Ham, dtype: int64

## 2. Creating a balanced data set

In [2]:
# All comments that are labeled as ham
ham = comments_df[comments_df['Spam / Ham'] == 'ham']

# All comments that are labeled as spam
spam = comments_df[comments_df['Spam / Ham'] == 'spam']

# Use only as many spam comments as there are ham comments to have a balanced dataset
spam = spam.sample(ham.shape[0])

# Create a new balanced dataset
balanced_comments_df = spam.append(ham, ignore_index = True)
balanced_comments_df

Unnamed: 0,Nr.,Comment in English,Spam / Ham,Polarity,Likes,Replies
0,2797.0,is it jst me or the idea is stupid nd it will ...,spam,negative,0,0
1,146.0,Is it only for Tesla owners,spam,neutral,1,0
2,2629.0,Price to make 100 billion,spam,neutral,0,0
3,4390.0,This is amazing... how does Elon come up with ...,spam,positive,0,0
4,1762.0,Anyone else find a design flaw with this video...,spam,negative,0,0
...,...,...,...,...,...,...
1523,4457.0,Why not just hyperloop it?,ham,neutral,0,0
1524,4458.0,I don't think those platforms would be necessa...,ham,neutral,0,0
1525,4459.0,Then you'll create more traffic above with peo...,ham,negative,23,11
1526,4464.0,That looks extremely expensive and I understan...,ham,negative,285,17


## 3. Loading the stopwords

In [3]:
import nltk
import re

# Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KarrasO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 4. Defining the BOW function

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# only_letters, tokenization, stemming, stopwords_removal are boolean values
# that decide how the text is going to be preprocessed
def clean_text(only_letters, tokenization, stemming, stopwords_removal, comments):
    
    # Create a new Porter Stemmer object
    porter_stemmer = PorterStemmer()

    processed_comments_df = []

    # Iterate through each comment in the balanced dataset
    for i in range(len(comments)):
        
        comment = comments[i]
        
        if(only_letters):
            # Keep only letters and spaces
            comment = re.sub('[^a-zA-Z]', ' ', comment)
            # Turn all letters to lower case
            comment = comment.lower()

        if(tokenization):
            # Turn comment to tokens
            comment = comment.split()

        if(stemming):
            # Do stemming
            comment = [porter_stemmer.stem(word) for word in comment]

        if(stopwords_removal):
            # Remove stop words
            comment = [word for word in comment if word.lower() not in stopwords.words('english')]

        if(tokenization): # Join words again to form a text
            comment = " ".join(comment)
        
        # Add processed comment 
        processed_comments_df.append(comment)
        
    return processed_comments_df

## 5. Performing 10-fold cross validation

In [5]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, fbeta_score

# Get the comments of the balanced data set
X = balanced_comments_df['Comment in English'].astype('U').values

# Get the labels of the comments of the balanced data set
y = balanced_comments_df['Spam / Ham'].values

# Apply bow to the comments
X_vec = CountVectorizer().fit_transform(clean_text(True, True, True, True, X))

# Define 10fold crossvalidation
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Function to create the reporting for the single algorithms consisting of the average precision, recall, f1, and accuracy
def reporting(name, model):       
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    fbs = []
    
    # Perform the 10folds
    for train_index, val_index in cv.split(X_vec):
        # Train the model
        model.fit(X_vec[train_index].toarray(), y[train_index])
        # Predict the labels of the test data
        pred = model.predict(X_vec[val_index].toarray())
        
        # Get the report for the currnt fold
        report = classification_report(y[val_index], pred, output_dict=True)
        fb = fb = fbeta_score(y[val_index], pred, average='macro', beta=5.76)
        
        # Add the single measures of the current fold to the array for calculating the averages 
        accuracies.append(report['accuracy'])
        macro_avg = report['macro avg']
        precisions.append(macro_avg['precision']) 
        recalls.append(macro_avg['recall'])
        f1s.append(macro_avg['f1-score'])
        fbs.append(fb)
        
        # If needed, the confusion matrix of the fold can be visualized
        #confusion_matrix_rf_balanced = confusion_matrix(y[val_index], pred)
        # Plot the confusion matrix for Voting Classifier
        #display_labels = ['Ham', 'Spam']

        #disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_rf_balanced, display_labels=display_labels)
        #disp = disp.plot(cmap = 'viridis')
        #plt.grid(False)
        #plt.rcParams.update({'font.size': 15})
        #plt.tight_layout()
    
    # Print the average values of the metrics for current model
    print('Model:', name)
    print('Accuracy:', sum(accuracies) / len(accuracies))
    print('F1Score:', sum(f1s) / len(f1s))
    print('FbScore:', sum(fbs) / len(fbs))
    print('Recall:', sum(recalls) / len(recalls))
    print('Precision:', sum(precisions) / len(precisions))
    #print()
    #print(sum(accuracies) / len(accuracies))
    #print(sum(f1s) / len(f1s))
    #print(sum(fbs) / len(fbs))
    #print(sum(recalls) / len(recalls))
    #print(sum(precisions) / len(precisions))
    
# Create the models we want to test
model1 = LogisticRegression()
model2 = SVC()
model3 = GaussianNB()
model4 = RandomForestClassifier()

# Create the reportings
reporting('LR', model1)
reporting('SVM', model2)
reporting('RF', model4)
reporting('NB', model3)

Model: LR
Accuracy: 0.8088966288269693
F1Score: 0.808538997389012
FbScore: 0.8092999118002207
Recall: 0.8094415982762155
Precision: 0.8108718967096997
Model: SVM
Accuracy: 0.801702786377709
F1Score: 0.8014027794947459
FbScore: 0.8019908532141098
Recall: 0.8020877526228356
Precision: 0.8027910403019177
Model: RF
Accuracy: 0.801685586515308
F1Score: 0.8015794502983947
FbScore: 0.8022729949542027
Recall: 0.8023604940071477
Precision: 0.8023094762887689
Model: NB
Accuracy: 0.6760534915720673
F1Score: 0.6693902341801424
FbScore: 0.6754071206412011
Recall: 0.6765916617122827
Precision: 0.6919706280859296
