# Εργασία 2 (Τεχνικές Εξόρυξης Δεδομένων)
## Data Mining: Assignment 2
***
### Μαρία Φριτζελά 1115201400218
***

In [212]:
import re
from unicodedata import normalize

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn import metrics, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [None]:
# Download the NLTK resources this notebook relies on; presumably 'punkt' backs
# nltk.word_tokenize and 'wordnet' backs WordNetLemmatizer (both used further down).
nltk.download('punkt')
nltk.download('wordnet')

## Collection and cleaning of data
Date information is not needed so it is not added to our dataframes

In [135]:
# Read only the label and text columns; the test file additionally keeps 'id' as its index
traindf = pd.read_csv(
    "data/train.csv",
    usecols=['Insult', 'Comment'],
)
testdf = pd.read_csv(
    "data/impermium_verification_labels.csv",
    usecols=['id', 'Insult', 'Comment'],
    index_col='id',
)

Looking at traindf:

In [136]:
# Display the training frame (columns: Insult, Comment)
traindf

Unnamed: 0,Insult,Comment
0,1,"""You fuck your dad."""
1,0,"""i really don't understand your point.\xa0 It ..."
2,0,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,"""listen if you dont wanna get married to a man..."
4,0,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."
...,...,...
3942,1,"""you are both morons and that is never happening"""
3943,0,"""Many toolbars include spell check, like Yahoo..."
3944,0,"""@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux F..."
3945,0,"""How about Felix? He is sure turning into one ..."


Splitting the train and test data to X and y

In [137]:
# Features are the comment texts, the target is the Insult label
X_train = traindf["Comment"]
y_train = traindf["Insult"]

In [138]:
# Same split for the held-out test frame
X_test = testdf["Comment"]
y_test = testdf["Insult"]

### Clean up train comments' text:
- convert all letters to lowercase
- remove multiple instances of '\'<br>
For example "\\\n" becomes "\n"
- remove "\n" and "\xa0" (non-breaking space latin)
- remove usernames
- remove URLs
- remove special unicode characters (like \xe1, \xe2...)<br>
- remove punctuation
- remove all words containing digits, and any digits
- remove multiple spaces


In [203]:
# Ordered (pattern, replacement) steps applied to every lower-cased comment.
# Order matters: backslash runs are normalised first so that the later
# escape-sequence patterns always see a single, space-separated backslash.
_CLEANING_STEPS = (
    (re.compile(r"\\+"), r" \\"),                  # any run of backslashes -> " \"
    (re.compile(r"\\+n|\\+xa0"), " "),             # literal "\n" / "\xa0" text
    (re.compile(r"@\S+"), " "),                    # usernames
    (re.compile(r"(?:https?://|www\.)\S+"), " "),  # URLs, up to the next whitespace
    (re.compile(r"\\+\S+"), " "),                  # remaining escapes (\xe1, \u1ea1, ...)
    (re.compile(r"[^A-Za-z0-9 ]+"), " "),          # punctuation and any other symbol
    (re.compile(r"\w*\d\w*"), ""),                 # digits and words containing digits
    (re.compile(r"\s+"), " "),                     # collapse repeated whitespace
)


def _clean_comment(comment):
    """Lower-case one comment string and apply every cleaning step in order."""
    text = comment.lower()
    for pattern, replacement in _CLEANING_STEPS:
        text = pattern.sub(replacement, text)
    return text


def clean_comments(comments):
    """Normalise raw comment texts for vectorization.

    Each comment is lower-cased, then stripped of escaped newlines and
    non-breaking spaces, usernames, URLs, leftover escape sequences,
    punctuation, and digits (including whole words that contain a digit).
    Runs of whitespace are collapsed to one space; the result is NOT
    trimmed, so a single leading/trailing space may remain.

    Compared with the previous version, the URL pattern no longer contains
    the stray ``" [0-9]"`` alternative, which let a URL match jump over the
    following space and swallow the next token when it started with a digit.
    Patterns are raw strings compiled once instead of per-call non-raw
    strings with invalid escape sequences.

    Parameters
    ----------
    comments : pandas.Series of str
        Raw comment texts.

    Returns
    -------
    pandas.Series of str
        Cleaned comments, with the same index as ``comments``.
    """
    return comments.apply(_clean_comment)

In [204]:
# Replace the raw training comments with their cleaned versions
X_train = clean_comments(traindf["Comment"])

In [207]:
# Apply the identical cleaning to the test comments
X_test = clean_comments(testdf["Comment"])

For example this comment:

In [205]:
# Raw text of the training comment with index label 124
traindf.Comment[124]

'"Nope. Not working for me either.32-23-34www.facebook.com/annagillmodel\\\\n\\\\n \\\\n\\\\nYou have my email! :) "'

Has been transformed into this:

In [206]:
# The entry of X_train with the same index label, for comparison with the raw text
X_train[124]

' nope not working for me either you have my email '

Our cleaned up data looks like this:

In [142]:
# Display the X_train Series
X_train

0                                      you fuck your dad 
1        i really don t understand your point it seems...
2        a majority of canadians can and has been wron...
3        listen if you dont wanna get married to a man...
4        c b xu bi t c ho kh c ng d ng cu chi nh c ho ...
                              ...                        
3942     you are both morons and that is never happening 
3943     many toolbars include spell check like yahoo ...
3944     sioux falls s d i told my boy he should call ...
3945     how about felix he is sure turning into one h...
3946     you re all upset defending this hipster band ...
Name: Comment, Length: 3947, dtype: object

##  Naive Bayes

Transform the comments into word count vectors using CountVectorizer from sklearn

In [209]:
# Bag-of-words representation, with the vocabulary capped at 4000 terms
bow_vectorizer = CountVectorizer(
    max_features=4000,
)

# The vocabulary is learned from the training comments only and reused for the test comments
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

Looking at the vector of the first comment in the train data:

In [166]:
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); prefer the new accessor and fall back on older installs.
if hasattr(bow_vectorizer, "get_feature_names_out"):
    bow_terms = bow_vectorizer.get_feature_names_out()
else:
    bow_terms = bow_vectorizer.get_feature_names()

# First training comment as a one-column (term -> count) frame, largest counts first.
# toarray() yields a plain ndarray instead of the np.matrix returned by todense().
pd.DataFrame(X_train_bow[0:1].T.toarray(), index=bow_terms, columns=["counts"])\
.sort_values(by=["counts"], ascending=False)

Unnamed: 0,counts
you,1
dad,1
fuck,1
your,1
pointing,0
...,...
filthy,0
final,0
finally,0
finance,0


Trying the Naive Bayes

In [210]:
# Fit Gaussian Naive Bayes on the BoW training matrix, converted to a dense array
nb = GaussianNB().fit(X_train_bow.toarray(), y_train)

# Predicted labels for the (densified) BoW test matrix
y_pred_nb_bow = nb.predict(X_test_bow.toarray())

In [168]:
# Run the 10-fold cross-validation once and score every metric on the same fitted
# folds. The previous four cross_val_score calls refitted all 10 folds per metric
# (40 fits) and densified the BoW matrix four times.
nb_cv_scores = cross_validate(
    nb,
    X_train_bow.toarray(),
    y_train,
    cv=10,
    scoring=['precision_macro', 'recall_macro', 'f1_macro', 'accuracy'],
)

# cross_validate stores each metric's per-fold scores under 'test_<scorer name>'
print("10-fold Cross Validation Precision NB for BoW:",
      np.mean(nb_cv_scores['test_precision_macro']))
print("10-fold Cross Validation Recall NB for BoW:",
      np.mean(nb_cv_scores['test_recall_macro']))
print("10-fold Cross Validation F-Measure NB for BoW:",
      np.mean(nb_cv_scores['test_f1_macro']))
print("10-fold Cross Validation Accuracy NB for BoW:",
      np.mean(nb_cv_scores['test_accuracy']))

10-fold Cross Validation Precision NB for BoW: 0.5840240145124168
10-fold Cross Validation Recall NB for BoW: 0.6038477436535545
10-fold Cross Validation F-Measure NB for BoW: 0.578019561577787
10-fold Cross Validation Accuracy NB for BoW: 0.6214971406541155


Test Scores:

In [218]:
# Per-class precision / recall / F1 (average=None returns one value per class)
for nb_label, nb_metric in (
    ("Precision NB for BoW:", metrics.precision_score),
    ("Recall NB for BoW:", metrics.recall_score),
    ("F-Measure NB for BoW:", metrics.f1_score),
):
    print(nb_label, nb_metric(y_test, y_pred_nb_bow, average=None))

print()
print("Accuracy NB for BoW:", metrics.accuracy_score(y_test, y_pred_nb_bow))

Precision NB for BoW: [0.54244032 0.49426063]
Recall NB for BoW: [0.35319516 0.67966574]
F-Measure NB for BoW: [0.42782427 0.57232213]

Accuracy NB for BoW: 0.5105145413870246


These scores are not very good... Let's improve them!

## Improving the scores of Naive Bayes

### 1) Lemmatization

Use lemmatization of words to improve the scores from the previous question, using the WordNetLemmatizer from nltk

In [214]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_comment(comment):
    """Tokenize a comment, lemmatize each token and re-join the tokens with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(comment))


# Same transformation for both splits
X_train = X_train.apply(_lemmatize_comment)
X_test = X_test.apply(_lemmatize_comment)

### 2) Use of bigrams and Stop word filtering

Create a new improved bag-of-words vector, including bigrams and filtering out stop words

In [215]:
# Improved bag-of-words: unigrams + bigrams, English stop words removed,
# vocabulary still capped at 4000 entries
bow_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=4000,
)

# Vocabulary comes from the training comments only; the test comments reuse it
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

### 4) Laplace Smoothing

Setting `alpha=1` is called Laplace smoothing

_(https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes)_

In [220]:
# Multinomial Naive Bayes with alpha=1.0, fitted on the dense BoW training matrix
mnb = MultinomialNB(alpha=1.0).fit(X_train_bow.toarray(), y_train)

# Predicted labels for the dense BoW test matrix
y_pred_mnb_bow = mnb.predict(X_test_bow.toarray())

In [197]:
# Run the 10-fold cross-validation once and score every metric on the same fitted
# folds. The previous four cross_val_score calls refitted all 10 folds per metric
# and densified the BoW matrix four times.
mnb_cv_scores = cross_validate(
    mnb,
    X_train_bow.toarray(),
    y_train,
    cv=10,
    scoring=['precision_macro', 'recall_macro', 'f1_macro', 'accuracy'],
)

# cross_validate stores each metric's per-fold scores under 'test_<scorer name>'
print("10-fold Cross Validation Precision NB for BoW:",
      np.mean(mnb_cv_scores['test_precision_macro']))
print("10-fold Cross Validation Recall NB for BoW:",
      np.mean(mnb_cv_scores['test_recall_macro']))
print("10-fold Cross Validation F-Measure NB for BoW:",
      np.mean(mnb_cv_scores['test_f1_macro']))
print("10-fold Cross Validation Accuracy NB for BoW:",
      np.mean(mnb_cv_scores['test_accuracy']))

10-fold Cross Validation Precision NB for BoW: 0.7392375343304849
10-fold Cross Validation Recall NB for BoW: 0.7639656811390495
10-fold Cross Validation F-Measure NB for BoW: 0.7485289705475866
10-fold Cross Validation Accuracy NB for BoW: 0.7927533251943713


In [221]:
# Per-class precision / recall / F1 (average=None returns one value per class)
for mnb_label, mnb_metric in (
    ("Precision NB for BoW:", metrics.precision_score),
    ("Recall NB for BoW:", metrics.recall_score),
    ("F-Measure NB for BoW:", metrics.f1_score),
):
    print(mnb_label, mnb_metric(y_test, y_pred_mnb_bow, average=None))

print()
print("Accuracy NB for BoW:", metrics.accuracy_score(y_test, y_pred_mnb_bow))

Precision NB for BoW: [0.68045113 0.72044199]
Recall NB for BoW: [0.78151986 0.60538533]
F-Measure NB for BoW: [0.72749196 0.65792129]

Accuracy NB for BoW: 0.6966442953020134


Nice! **~20%** improvement in scores

## TF/IDF Vector

Create a TF/IDF vector including bigrams and filtering out stopwords

In [222]:
# TF/IDF features: unigrams + bigrams, English stop words removed, at most 4000 terms
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=4000,
)

# Fit on the training comments only, then project the test comments onto the same vocabulary
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [199]:
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); prefer the new accessor and fall back on older installs.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()

# First training comment as a one-column (term -> weight) frame, largest weights first.
# toarray() yields a plain ndarray instead of the np.matrix returned by todense().
pd.DataFrame(X_train_tfidf[0:1].T.toarray(), index=tfidf_terms, columns=["tfidf"])\
.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
dad,0.853429
fuck,0.521210
plain,0.000000
pill,0.000000
pin,0.000000
...,...
forest,0.000000
forever,0.000000
forget,0.000000
forgive,0.000000


## Support Vector Machines (SVM)

According to the assignment details, scoring should be calculated using classification accuracy and F1 score.

Find optimal parameters for the SVM model as shown in this example:<br>
https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html

In [180]:
# Candidate SVM hyper-parameters: an RBF grid over (gamma, C) and a linear grid over C
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000],
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    },
]

# One exhaustive search per scoring metric
scores = ['f1_macro', 'accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)

    # fit() returns the fitted search object, so construction and fitting can be chained
    clf = GridSearchCV(svm.SVC(), tuned_parameters, scoring=score).fit(X_train_tfidf, y_train)

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()

# Tuning hyper-parameters for f1_macro
Best parameters set found on development set:
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

# Tuning hyper-parameters for accuracy
Best parameters set found on development set:
{'C': 1, 'kernel': 'linear'}



In [225]:
# Linear SVM with C=1, trained on the TF/IDF training matrix
svm_clf = svm.SVC(kernel='linear', C=1).fit(X_train_tfidf, y_train)

# Predicted labels for the TF/IDF test matrix
y_pred_svm_tfidf = svm_clf.predict(X_test_tfidf)

In [193]:
# Run the 10-fold cross-validation once and score both metrics on the same fitted
# folds, instead of refitting every fold separately for each metric.
svm_cv_scores = cross_validate(
    svm_clf,
    X_train_tfidf,
    y_train,
    cv=10,
    scoring=['f1_macro', 'accuracy'],
)

# cross_validate stores each metric's per-fold scores under 'test_<scorer name>'
print("10-fold Cross Validation F-Measure SVM for TF/IDF:",
      np.mean(svm_cv_scores['test_f1_macro']))
print("10-fold Cross Validation Accuracy SVM for TF/IDF:",
      np.mean(svm_cv_scores['test_accuracy']))

10-fold Cross Validation F-Measure SVM for TF/IDF: 0.7611806053010486
10-fold Cross Validation Accuracy SVM for TF/IDF: 0.835817001863394


Test Scores:

In [226]:
# Per-class F1 (average=None) and overall accuracy on the test set
svm_test_f1 = metrics.f1_score(y_test, y_pred_svm_tfidf, average=None)
svm_test_accuracy = metrics.accuracy_score(y_test, y_pred_svm_tfidf)

print("F-Measure SVM for TF/IDF:", svm_test_f1)
print("Accuracy SVM for TF/IDF:", svm_test_accuracy)

F-Measure SVM for TF/IDF: [0.74788732 0.5607362 ]
Accuracy SVM for TF/IDF: 0.6796420581655481


## Random Forest

In [228]:
# Instantiate the model with a fixed seed: a random forest is stochastic, so
# without random_state the scores reported below change on every run.
rf = RandomForestClassifier(random_state=42)

# Train the model on the TF/IDF training set
rf.fit(X_train_tfidf, y_train)
# Predict the TF/IDF test set
y_pred_rf_tfidf = rf.predict(X_test_tfidf)

In [189]:
# Run the 10-fold cross-validation once and score both metrics on the same fitted
# forests. With separate cross_val_score calls every fold was refitted per metric,
# so (for an unseeded forest) F-measure and accuracy described different models.
rf_cv_scores = cross_validate(
    rf,
    X_train_tfidf,
    y_train,
    cv=10,
    scoring=['f1_macro', 'accuracy'],
)

# cross_validate stores each metric's per-fold scores under 'test_<scorer name>'
print("10-fold Cross Validation F-Measure RF for TF/IDF:",
      np.mean(rf_cv_scores['test_f1_macro']))
print("10-fold Cross Validation Accuracy RF for TF/IDF:",
      np.mean(rf_cv_scores['test_accuracy']))

10-fold Cross Validation F-Measure RF for TF/IDF: 0.7424000777194075
10-fold Cross Validation Accuracy RF for TF/IDF: 0.8244181713037332


Test Scores:

In [229]:
# Per-class F1 (average=None) and overall accuracy on the test set
rf_test_f1 = metrics.f1_score(y_test, y_pred_rf_tfidf, average=None)
rf_test_accuracy = metrics.accuracy_score(y_test, y_pred_rf_tfidf)

print("F-Measure RF for TF/IDF:", rf_test_f1)
print("Accuracy RF for TF/IDF:", rf_test_accuracy)

F-Measure RF for TF/IDF: [0.74796172 0.56882959]
Accuracy RF for TF/IDF: 0.6818791946308724
