# Ονοματεπώνυμο και ΑΜ
# Παντελεήμων Μαλέκας 1115201600268

#  

# Preprocessing and cleanup of the data

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
import os
import re

#In this part we load each CSV file and clean up its 'Comment' column.
#The three files need exactly the same treatment, so the steps live in two small helpers.

#Patterns removed from every comment, in the order they are applied:
#  1. URLs (an optional http/https scheme followed by '://' and the address),
#  2. escape sequences that appear literally in the text (backslash-n, backslash-x.., backslash-u....),
#  3. every remaining run of characters outside a-z.
#The last pattern only keeps lowercase letters, so the text must be lowercased first.
_COMMENT_NOISE_PATTERNS = (
    r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
    r'\\n',
    r'\\x..',
    r'\\u....',
    "[^a-z]+",
)


def _read_lowercased(path):
    """Read the CSV file at ``path`` and return it with every cell as a lowercase string."""
    return pd.read_csv(path, dtype=str).apply(lambda column: column.astype(str).str.lower())


def _clean_comments(comments):
    """Return the Series ``comments`` with every noise pattern replaced by a single space.

    The patterns of ``_COMMENT_NOISE_PATTERNS`` are applied one after the other,
    each on the result of the previous substitution.
    """
    for pattern in _COMMENT_NOISE_PATTERNS:
        #pattern is bound as a default argument so each lambda keeps its own pattern.
        comments = comments.apply(
            lambda text, pattern=pattern: re.sub(pattern, " ", text, flags=re.MULTILINE)
        )
    return comments


#Training set.
train_path = r'C:\Users\Pantelis\Documents\uni\tede\Third\data\train.csv'
train_data = _read_lowercased(train_path)
train_data['Comment'] = _clean_comments(train_data['Comment'])

#Same work performed on the 'impermium_verification_set' file.
test_path = r'C:\Users\Pantelis\Documents\uni\tede\Third\data\impermium_verification_set.csv'
test_data = _read_lowercased(test_path)
test_data['Comment'] = _clean_comments(test_data['Comment'])

#Same work performed on the 'impermium_verification_labels' file.
#URLs are now replaced by a space here as well, exactly like in the other two files.
test_path_labels = r'C:\Users\Pantelis\Documents\uni\tede\Third\data\impermium_verification_labels.csv'
test_labels = _read_lowercased(test_path_labels)
test_labels['Comment'] = _clean_comments(test_labels['Comment'])



#  

# Naive Bayes classification with CountVectorizer

In [32]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

from sklearn import naive_bayes

import nltk

#Now we will classify our data with the Naive Bayes classifier.


#Encoding the labels we have.
#The encoder is fitted on the training labels only and then reused for the test labels,
#so both sets are guaranteed to share the same label -> integer mapping.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(train_data["Insult"])
Test_Y = Encoder.transform(test_labels["Insult"])

#Using the CountVectorizer: the vocabulary is learned from the training comments
#and the test comments are only transformed with it.
count_vectorizer = CountVectorizer()
counts_train = count_vectorizer.fit_transform(train_data['Comment'])
counts_test = count_vectorizer.transform(test_data['Comment'])

#Initializing the NB object with alpha set to 0.5 (Lidstone smoothing).
#The default value is 1.0, which corresponds to Laplace smoothing; that setting is
#tried later on, so the two results can be compared.
Bayes = naive_bayes.MultinomialNB(alpha = 0.5)

#Fitting the training data and making predictions on the test set.
Bayes.fit(counts_train,Train_Y)
predictions_Bayes = Bayes.predict(counts_test)

#Getting the scores we need: accuracy and the weighted F1 score.
acc_score_test = accuracy_score(Test_Y, predictions_Bayes)
f1_score_test = f1_score(Test_Y, predictions_Bayes, average = 'weighted')


#Next come the enhancements to the classification method.

#The first one is lemmatization of the text. These two objects are shared by every call to lemmatize().
lemmatizer = nltk.stem.WordNetLemmatizer()
tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize(text):
    """Split ``text`` on whitespace, lemmatize every token and return the tokens joined by single spaces."""
    lemmas = (lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text))
    return ' '.join(lemmas)

#Storing a lemmatized copy of every comment in a new 'lemmatized_comment' column.
train_data['lemmatized_comment'] = train_data['Comment'].apply(lemmatize)
test_data['lemmatized_comment'] = test_data['Comment'].apply(lemmatize)

#Enhancement 1: lemmatization.
#Same pipeline as the baseline, fed with the 'lemmatized_comment' column.
count_vectorizer_1 = CountVectorizer()
counts_train_1 = count_vectorizer_1.fit_transform(train_data['lemmatized_comment'])
counts_test_1 = count_vectorizer_1.transform(test_data['lemmatized_comment'])

Bayes_1 = naive_bayes.MultinomialNB(alpha=0.5).fit(counts_train_1, Train_Y)
predictions_Bayes_1 = Bayes_1.predict(counts_test_1)

acc_score_test1 = accuracy_score(Test_Y, predictions_Bayes_1)
f1_score_test1 = f1_score(Test_Y, predictions_Bayes_1, average='weighted')


#Enhancement 2: stop word removal.
#The vectorizer drops the English stop words; the original comments are used again.
count_vectorizer_2 = CountVectorizer(stop_words='english')
counts_train_2 = count_vectorizer_2.fit_transform(train_data['Comment'])
counts_test_2 = count_vectorizer_2.transform(test_data['Comment'])

Bayes_2 = naive_bayes.MultinomialNB(alpha=0.5).fit(counts_train_2, Train_Y)
predictions_Bayes_2 = Bayes_2.predict(counts_test_2)

acc_score_test2 = accuracy_score(Test_Y, predictions_Bayes_2)
f1_score_test2 = f1_score(Test_Y, predictions_Bayes_2, average='weighted')


#Enhancement 3: bigrams instead of unigrams.
#ngram_range=(2,2) makes the vectorizer count pairs of adjacent words only.
count_vectorizer_3 = CountVectorizer(ngram_range=(2, 2))
counts_train_3 = count_vectorizer_3.fit_transform(train_data['Comment'])
counts_test_3 = count_vectorizer_3.transform(test_data['Comment'])

Bayes_3 = naive_bayes.MultinomialNB(alpha=0.5).fit(counts_train_3, Train_Y)
predictions_Bayes_3 = Bayes_3.predict(counts_test_3)

acc_score_test3 = accuracy_score(Test_Y, predictions_Bayes_3)
f1_score_test3 = f1_score(Test_Y, predictions_Bayes_3, average='weighted')


#Enhancement 4: Laplace smoothing, obtained by setting the alpha value to 1.0.
#The counts of the baseline (counts_train / counts_test) are reused unchanged.
Bayes_4 = naive_bayes.MultinomialNB(alpha=1.0).fit(counts_train, Train_Y)
predictions_Bayes_4 = Bayes_4.predict(counts_test)

acc_score_test4 = accuracy_score(Test_Y, predictions_Bayes_4)
f1_score_test4 = f1_score(Test_Y, predictions_Bayes_4, average='weighted')

#Now we will create a dataframe that will keep our scores, one row per variant.
#The frame is built in a single call from a list of rows because DataFrame.append
#was deprecated in pandas 1.4 and removed in pandas 2.0.
bayes_df = pd.DataFrame(
    [
        {'Classification Method': 'Naive Bayes', 'Classification Accuracy': acc_score_test, 'F1 Score': f1_score_test},
        {'Classification Method': 'NB with Lemmatization', 'Classification Accuracy': acc_score_test1, 'F1 Score': f1_score_test1},
        {'Classification Method': 'NB without Stop Words', 'Classification Accuracy': acc_score_test2, 'F1 Score': f1_score_test2},
        {'Classification Method': 'NB with Bigrams', 'Classification Accuracy': acc_score_test3, 'F1 Score': f1_score_test3},
        {'Classification Method': 'NB with Laplace Smoothing', 'Classification Accuracy': acc_score_test4, 'F1 Score': f1_score_test4},
    ],
    columns=['Classification Method', 'Classification Accuracy', 'F1 Score'],
)

#Printing our scores. The bare expression on the last line makes the notebook display the table.
print("Classification scores for Naive Bayes and its enhancements.")
bayes_df

#Conclusions and Notes:
#As we can see, the baseline Naive Bayes classifier gives average scores (about 0.67). Every enhancement except bigrams gave slightly better results.

#Lemmatization made a slight improvement. 
#This is expected given that lemmatization reduces the tokens we have (by for example transforming 'uses' and 'used' to 'use').
#This allows for Naive Bayes to make better predictions.

#The removal of stop words also improved our results.
#By giving the algorithm more important words to work on, this allowed for better predictions.

#Bigrams reduced our scores. Actually, this is expected. With bigrams only, every feature is a pair of adjacent words, and a specific pair occurs far less often than a single word.
#Since the counts are much sparser, the classifier has less evidence per feature, so we are expected to have reduced scores.
#I should note here that if one wishes to add unigrams along with bigrams the scores will change.
#Classifying unigrams along with bigrams gives more accurate scores (about the same results as in the ones that use lemmatization).

#The use of Laplace smoothing also increased our scores. Since Laplace smoothing eliminates cases of zero probability this allowed for higher accuracy.

Classification scores for Naive Bayes and its enhancements.


Unnamed: 0,Classification Method,Classification Accuracy,F1 Score
0,Naive Bayes,0.67472,0.674774
1,NB with Lemmatization,0.676063,0.676087
2,NB without Stop Words,0.688143,0.682593
3,NB with Bigrams,0.665324,0.65734
4,NB with Laplace Smoothing,0.682774,0.678859


#  

# Creating the TF-IDF and POS array

# Part 1: Getting the POS tags

In [17]:
#We will create the desired array, by getting the POS tags first.

#The universal POS tags we track, mapped to the column that stores their fraction.
#The order of the entries fixes the column order of the resulting dataframes.
_POS_FRACTION_COLUMNS = {
    'ADV': 'fractionAdverbs',
    'VERB': 'fractionVerbs',
    'NOUN': 'fractionNouns',
    'ADJ': 'fractionAdjectives',
}


def _pos_fractions(comment):
    """Return the fraction of adverbs, verbs, nouns and adjectives among the tokens of ``comment``.

    The comment is tokenized with nltk.word_tokenize and tagged with the
    universal tagset, which gives simplified tags. The result is a dict keyed
    by the fraction column names; every fraction is 0.0 for a comment that
    yields no tokens.
    """
    tokens = nltk.word_tokenize(comment)
    tag_list = nltk.pos_tag(tokens, tagset='universal')

    #Counting only the tags we are interested in.
    counts = dict.fromkeys(_POS_FRACTION_COLUMNS, 0)
    for _, tag in tag_list:
        if tag in counts:
            counts[tag] += 1

    #Guarding against a division by zero for comments without tokens.
    tag_len = len(tag_list)
    if tag_len == 0:
        return {column: 0.0 for column in _POS_FRACTION_COLUMNS.values()}
    return {_POS_FRACTION_COLUMNS[tag]: count / tag_len for tag, count in counts.items()}


def _fraction_frame(comments):
    """Return a dataframe with one row of POS fractions per entry of ``comments``.

    All rows are collected first and the frame is built once. DataFrame.append
    was removed in pandas 2.0, and appending row by row copies the whole frame
    on every call.
    """
    rows = [_pos_fractions(comment) for comment in comments]
    return pd.DataFrame(rows, columns=list(_POS_FRACTION_COLUMNS.values()))


#Getting only the Date and Comment columns from our previous dataframes.
train_df2 = train_data[ ['Date','Comment'] ]
test_df2 = test_data[ ['Date','Comment'] ]

#Two dataframes that keep the fraction of each POS tag, for the train and the test set.
fraction_df = _fraction_frame(train_df2['Comment'])
fraction_test = _fraction_frame(test_df2['Comment'])




#  

# Part 2: Merging the TF-IDF array and the POS tags.

In [18]:

#We continue by concatenating the original dataframes with the ones that contain the fractions.
train1 = pd.concat([train_data, fraction_df], axis = 1)
test1 = pd.concat([test_data, fraction_test], axis = 1)

#Getting the fractions in list format.
fractionAdverbs = train1['fractionAdverbs'].tolist()
fractionVerbs = train1['fractionVerbs'].tolist()
fractionNouns = train1['fractionNouns'].tolist()
fractionAdjectives = train1['fractionAdjectives'].tolist()

#Same for the test dataset.
fractionAdverbs_test = test1['fractionAdverbs'].tolist()
fractionVerbs_test = test1['fractionVerbs'].tolist()
fractionNouns_test = test1['fractionNouns'].tolist()
fractionAdjectives_test = test1['fractionAdjectives'].tolist()

#Initializing the TF-IDF vectorizer. It is fitted on the training comments only.
tfidf = TfidfVectorizer()
tfs_train = tfidf.fit_transform(train1['Comment'])
tfs_test = tfidf.transform(test1['Comment'])

#Getting the TF-IDF array in dense format so we can merge it with the fractions.
#toarray() returns a plain ndarray, whereas todense() returns an np.matrix,
#which recent scikit-learn versions refuse as estimator input.
dense = tfs_train.toarray()
dense_test = tfs_test.toarray()

#Zipping all the fractions: one (adverbs, verbs, nouns, adjectives) tuple per comment.
all_fracs = list(zip(fractionAdverbs, fractionVerbs, fractionNouns,fractionAdjectives ))
all_fracs_test = list(zip(fractionAdverbs_test, fractionVerbs_test, fractionNouns_test,fractionAdjectives_test ))

#Merging the TF-IDF array with the fraction arrays: axis 1 adds the four fractions as extra columns.
tf_pos = np.append(dense, all_fracs, 1)
tf_pos_test = np.append(dense_test, all_fracs_test, 1)

#  

# Part 3: Classification with SVM and Random Forests.

In [19]:
#Support Vector Machine on the TF-IDF/POS array.
#The classifier keeps its default parameters; it is fitted on the training array
#and then asked to predict the test array.
SVM = svm.SVC().fit(tf_pos, Train_Y)
predictions_SVM = SVM.predict(tf_pos_test)

#Accuracy and weighted F1 score of the SVM predictions.
acc_score_test = accuracy_score(Test_Y, predictions_SVM)
f1_score_test = f1_score(Test_Y, predictions_SVM, average='weighted')


#Random Forest with 100 trees on the same array.
RF = RandomForestClassifier(n_estimators=100).fit(tf_pos, Train_Y)
predictions_RF = RF.predict(tf_pos_test)

#Accuracy and weighted F1 score of the Random Forest predictions.
acc_score_test1 = accuracy_score(Test_Y, predictions_RF)
f1_score_test1 = f1_score(Test_Y, predictions_RF, average='weighted')


#Using a dataframe to print our results, one row per classifier.
#The frame is built in a single call from a list of rows because DataFrame.append
#was deprecated in pandas 1.4 and removed in pandas 2.0.
tf_pos_df = pd.DataFrame(
    [
        {'Classification Method': 'SVM', 'Classification Accuracy': acc_score_test, 'F1 Score': f1_score_test},
        {'Classification Method': 'Random Forest', 'Classification Accuracy': acc_score_test1, 'F1 Score': f1_score_test1},
    ],
    columns=['Classification Method', 'Classification Accuracy', 'F1 Score'],
)

#The bare expression on the last line makes the notebook display the table.
print("Classification scores for SVM and Random Forests on the TF-IDF/POS array.")
tf_pos_df

#Conclusions and Notes:
#The TF-IDF/POS representation proved to be an entirely different model than the one used in CountVectorizer.

#In SVM we see that the scores were about the same, if not a bit higher than the ones we got in Naive Bayes. 
#This is expected given that SVM uses binary classification. For this reason, it was able to achieve quite accurate scores (since our labels are also binary).

#In Random Forest we see that the scores are less accurate than the ones we got in Naive Bayes.
#There is also random chance taken into account in this algorithm, so the results may vary a bit between executions.

Classification scores for SVM and Random Forests on the TF-IDF/POS array.


Unnamed: 0,Classification Method,Classification Accuracy,F1 Score
0,SVM,0.680089,0.654029
1,Random Forest,0.634004,0.581631


#  

# Beating the benchmark.

In [31]:

#CountVectorizer is used here because it scored better than both the TF-IDF and the TF-IDF/POS representations.
#Two of the earlier enhancements are combined: stop word removal and the lemmatized comments.
count_vectorizer_f = CountVectorizer(stop_words='english')
counts_train_f = count_vectorizer_f.fit_transform(train_data['lemmatized_comment'])
counts_test_f = count_vectorizer_f.transform(test_data['lemmatized_comment'])

#Random Forest gave the most accurate results so far.
#RF is the classifier object created earlier; fit() trains it again on the new features.
predictions_RF = RF.fit(counts_train_f, Train_Y).predict(counts_test_f)

#Accuracy and weighted F1 score of the final predictions.
acc_score_test = accuracy_score(Test_Y, predictions_RF)
f1_score_test = f1_score(Test_Y, predictions_RF, average='weighted')

#Using a dataframe to print them.
final_df = pd.DataFrame(columns=['Classification Method', 'Classification Accuracy', 'F1 Score'])
final_df = final_df.append({'Classification Method': 'Random Forest with additions', 'Classification Accuracy':acc_score_test, 'F1 Score': f1_score_test}, ignore_index=True)

print("Classification scores for my attempt on beating the benchmark.")
final_df

#Conclusions and Notes:
#Here I decided to merge the different techniques we explored in the previous questions. 
#The lemmatized comments and the removal of stop words proved to be highly accurate. Bigrams didn't achieve higher accuracy.

#Regarding the classifiers, Random Forest gave the best results. 
#Since random chance is taken into account in this algorithm, the results may vary a bit between executions.
#All of the ones I tried however were about 0.69 to 0.70.

#Since Random Forest gives a bit random results, I also tried classifying with Naive Bayes to see more stable results.
#Naive Bayes with alpha set to 0.6 also gave some quite accurate results of about 0.6908.
#SVM gave the least accurate scores of the three algorithms, so it was discarded.

Classification scores for my attempt on beating the benchmark.


Unnamed: 0,Classification Method,Classification Accuracy,F1 Score
0,Random Forest with additions,0.702461,0.688464
