## 1. Loading the data and creating balanced data sets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

# Load the table in a dataframe for further data analysis
comments_df = pd.read_csv('ham_comments_multiclass.csv')

# Replace all empty cells with no_aspect and the cells containing x with respective aspect
comments_df.fillna('no_aspect', inplace=True)
comments_df["Feature Request"].replace({'x': 'feature'}, inplace=True)
comments_df["Problem Report"].replace({'x': 'problem'}, inplace=True)
comments_df["Safety"].replace({'x': 'safety'}, inplace=True)
comments_df["Efficiency"].replace({'x': 'efficiency'}, inplace=True)

# Check if the dataframe contains any empty cells. 
# If all columns have 0 empty cells than the dataframe is complete.
print(comments_df.isna().sum())

# Extract all comments where the subject is feature request
feature = comments_df[comments_df['Feature Request'] == 'feature']

# Extract all comments where the subject is not feature request
no_feature = comments_df[comments_df['Feature Request'] == 'no_aspect']

# Extract all comments where the subject is problem report
problem = comments_df[comments_df['Problem Report'] == 'problem']

# Extract all comments where the subject is not problem report
no_problem = comments_df[comments_df['Problem Report'] == 'no_aspect']

# Extract all safety related comments
safety = comments_df[comments_df['Safety'] == 'safety']

# Extract all comments not related to safety
no_safety = comments_df[comments_df['Safety'] == 'no_aspect']

# Extract all efficiency related comments
efficiency = comments_df[comments_df['Efficiency'] == 'efficiency']

# Extract all comments not related to efficiency
no_efficiency = comments_df[comments_df['Efficiency'] == 'no_aspect']

# Use only as many no_feature comments as there are feature comments to have a balanced dataset
no_feature = no_feature.sample(feature.shape[0])
problem = problem.sample(no_problem.shape[0])
no_safety = no_safety.sample(safety.shape[0])
no_efficiency = no_efficiency.sample(efficiency.shape[0])

# Create new balanced datasets
balanced_feature_df = no_feature.append(feature, ignore_index = True)
balanced_problem_df = no_problem.append(problem, ignore_index = True)
balanced_safety_df = no_safety.append(safety, ignore_index = True)
balanced_efficiency_df = no_efficiency.append(efficiency, ignore_index = True)

Nr.                   0
Comment in English    0
Spam / Ham            0
Polarity              0
Feature Request       0
Problem Report        0
Safety                0
Efficiency            0
dtype: int64


## 2. Loading the stopwords

In [2]:
import nltk
import re

# Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KarrasO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 3. Defining the BOW function

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# only_letters, tokenization, stemming, stopwords_removal are boolean values
# that decide how the text is going to be preprocessed
def clean_text(only_letters, tokenization, stemming, stopwords_removal, comments):
    
    # Create a new Porter Stemmer object
    porter_stemmer = PorterStemmer()

    processed_comments_df = []

    # Iterate through each comment in the balanced dataset
    for i in range(len(comments)):
        
        comment = comments[i]
        
        if(only_letters):
            # Keep only letters and spaces
            comment = re.sub('[^a-zA-Z]', ' ', comment)
            # Turn all letters to lower case
            comment = comment.lower()

        if(tokenization):
            # Turn comment to tokens
            comment = comment.split()

        if(stemming):
            # Do stemming
            comment = [porter_stemmer.stem(word) for word in comment]

        if(stopwords_removal):
            # Remove stop words
            comment = [word for word in comment if word.lower() not in stopwords.words('english')]

        if(tokenization): # Join words again to form a text
            comment = " ".join(comment)
        
        # Add processed comment 
        processed_comments_df.append(comment)
        
    return processed_comments_df

## 4. Performing 10-fold cross validation

In [4]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, fbeta_score

# Get the comments of the balanced data sets for each aspect
X1 = balanced_feature_df['Comment in English'].astype('U').values
X2 = balanced_problem_df['Comment in English'].astype('U').values
X3 = balanced_safety_df['Comment in English'].astype('U').values
X4 = balanced_efficiency_df['Comment in English'].astype('U').values

# Get the labels of the comments of the balanced data set for each aspect
y1 = balanced_feature_df['Feature Request'].values
y2 = balanced_problem_df['Problem Report'].values
y3 = balanced_safety_df['Safety'].values
y4 = balanced_efficiency_df['Efficiency'].values

# Apply bow to the comments of the balanced data set for each aspect
X1_vec = CountVectorizer().fit_transform(clean_text(True, True, True, True, X1))
X2_vec = CountVectorizer().fit_transform(clean_text(True, True, True, True, X2))
X3_vec = CountVectorizer().fit_transform(clean_text(True, True, True, True, X3))
X4_vec = CountVectorizer().fit_transform(clean_text(True, True, True, True, X4))

# Define 10fold crossvalidation
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Function to create the reporting for the single algorithms and aspect consisting of the average precision, recall, f1, and accuracy
def reporting(name, model, feature, data, fb_value):       
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    fbs = []

    # Perform the 10folds
    for train_index, val_index in cv.split(data):
        # Train the model
        model.fit(data[train_index].toarray(), feature[train_index])
        # Predict the labels of the test data
        pred = model.predict(data[val_index].toarray())
        
        # Get the report for the currnt fold
        report = classification_report(feature[val_index], pred, output_dict=True)
        fb = fbeta_score(feature[val_index], pred, average='macro', beta = fb_value)
        
        # Add the single measures of the current fold to the array for calculating the averages 
        accuracies.append(report['accuracy'])
        macro_avg = report['macro avg']
        precisions.append(macro_avg['precision']) 
        recalls.append(macro_avg['recall'])
        f1s.append(macro_avg['f1-score'])
        fbs.append(fb)
        
        # If needed, the confusion matrix of the fold can be visualized
        #confusion_matrix_rf_balanced = confusion_matrix(feature[val_index], pred)
        # Plot the confusion matrix for Voting Classifier
        #display_labels = ['Aspect', 'NOT_Aspect']

        #disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_rf_balanced, display_labels=display_labels)
        #disp = disp.plot(cmap = 'viridis')
        #plt.grid(False)
        #plt.rcParams.update({'font.size': 15})
        #plt.tight_layout()
    
    # Print the average values of the metrics for current model and aspect
    print('Model:', name)
    print('Accuracy:', sum(accuracies) / len(accuracies))
    print('F1Score:', sum(f1s) / len(f1s))
    print('FbScore:', sum(fbs) / len(fbs))
    print('Recall:', sum(recalls) / len(recalls))
    print('Precision:', sum(precisions) / len(precisions))
    
    #print()
    #print(sum(accuracies) / len(accuracies))
    #print(sum(f1s) / len(f1s))
    #print(sum(fbs) / len(fbs))
    #print(sum(recalls) / len(recalls))
    #print(sum(precisions) / len(precisions))
    
# Create the models we want to test
model1 = LogisticRegression()
model2 = SVC()
model3 = GaussianNB()
model4 = RandomForestClassifier()

# Create the reportings
print('FEATURE')
reporting('LR', model1, y1, X1_vec, 5.38)
reporting('SVM', model2, y1, X1_vec, 5.38)
reporting('RF', model4, y1, X1_vec, 5.38)
reporting('NB', model3, y1, X1_vec, 5.38)
print('PROBLEM')
reporting('LR', model1, y2, X2_vec, 1.87)
reporting('SVM', model2, y2, X2_vec, 1.87)
reporting('RF', model4, y2, X2_vec, 1.87)
reporting('NB', model3, y2, X2_vec, 1.87)
print('SAFETY')
reporting('LR', model1, y3, X3_vec, 2.45)
reporting('SVM', model2, y3, X3_vec, 2.45)
reporting('RF', model4, y3, X3_vec, 2.45)
reporting('NB', model3, y3, X3_vec, 2.45)
print('EFFICIENCY')
reporting('LR', model1, y4, X4_vec, 4.78)
reporting('SVM', model2, y4, X4_vec, 4.78)
reporting('RF', model4, y4, X4_vec, 4.78)
reporting('NB', model3, y4, X4_vec, 4.78)

FEATURE
Model: LR
Accuracy: 0.7149014778325123
F1Score: 0.7104785668016431
FbScore: 0.7166946802756134
Recall: 0.7177477302109655
Precision: 0.7213898193309959
Model: SVM
Accuracy: 0.7045566502463054
F1Score: 0.7006952627112062
FbScore: 0.7063737178994354
Recall: 0.7073245912584148
Precision: 0.7103506127944982
Model: RF
Accuracy: 0.7216748768472907
F1Score: 0.7187183358648573
FbScore: 0.7251437243217346
Recall: 0.7261427869516105
Precision: 0.7266456990721696
Model: NB
Accuracy: 0.6660098522167488
F1Score: 0.6587399063976905
FbScore: 0.6626462932793279
Recall: 0.6634020432182198
Precision: 0.668319046794279
PROBLEM
Model: LR
Accuracy: 0.6647887323943662
F1Score: 0.6626354730755122
FbScore: 0.6647701078695594
Recall: 0.669967129467983
Precision: 0.6708381087439422
Model: SVM
Accuracy: 0.6464788732394366
F1Score: 0.6424755631210253
FbScore: 0.6441082806195617
Recall: 0.6490516435767002
Precision: 0.6525336971270338
Model: RF
Accuracy: 0.6563380281690142
F1Score: 0.6543174905168024
FbSco