# Εργασία 3 (Τεχνικές Εξόρυξης Δεδομένων)
## Data Mining: Assignment 3
***
### Μαρία Φριτζελά 1115201400218
***

In [49]:
import re
from collections import Counter
from unicodedata import normalize

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn import  svm, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [None]:
# One-off downloads of the NLTK resources this notebook relies on:
# tokenizer models for nltk.word_tokenize
nltk.download('punkt')
# WordNet corpus for WordNetLemmatizer
nltk.download('wordnet')
# tagger model for nltk.pos_tag
nltk.download('averaged_perceptron_tagger')
# mapping required by pos_tag(..., tagset='universal')
nltk.download('universal_tagset')

## Collection and cleaning of data (Pre-processing text)
Date information is not needed so it is not added to our dataframes

In [50]:
# Training set: keep only the label and the comment text (the date column is skipped)
traindf = pd.read_csv(
    "data/train.csv",
    usecols=['Insult', 'Comment'],
)

# Test set: same two columns, with the 'id' column used as the index
testdf = pd.read_csv(
    "data/impermium_verification_labels.csv",
    usecols=['id', 'Insult', 'Comment'],
    index_col='id',
)

Looking at traindf:

In [51]:
# Show the loaded training frame (columns: Insult, Comment)
traindf

Unnamed: 0,Insult,Comment
0,1,"""You fuck your dad."""
1,0,"""i really don't understand your point.\xa0 It ..."
2,0,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,"""listen if you dont wanna get married to a man..."
4,0,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."
...,...,...
3942,1,"""you are both morons and that is never happening"""
3943,0,"""Many toolbars include spell check, like Yahoo..."
3944,0,"""@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux F..."
3945,0,"""How about Felix? He is sure turning into one ..."


Splitting the train and test data to X and y

In [52]:
# Features are the raw comment texts, the target is the Insult label
X_train = traindf['Comment']
y_train = traindf['Insult']

In [53]:
# Same split for the test set: comment texts and their Insult labels
X_test = testdf['Comment']
y_test = testdf['Insult']

### Clean up train comments' text:
- convert all letters to lowercase
- remove multiple instances of '\'<br>
For example "\\\n" becomes "\n"
- remove "\n" and "\xa0" (non-breaking space latin)
- remove usernames
- remove URLs
- remove special unicode characters (like \xe1, \xe2...)<br>
- remove punctuation
- remove all words containing digits, and any digits
- remove multiple spaces


In [54]:
# Ordered (compiled pattern, replacement) pairs applied by `clean_comments`.
# The order matters: backslash runs are normalised first so that the escape
# sequences stored literally in the data can be recognised by the later steps.
_COMMENT_CLEANING_STEPS = (
    # collapse every run of backslashes into " \" (e.g. "\\\\n" -> " \n")
    (re.compile(r"\\+"), r" \\"),
    # literal "\n" and "\xa0" escape sequences
    (re.compile(r"\\+n|\\+xa0"), " "),
    # @usernames
    (re.compile(r"@\S+"), " "),
    # URLs starting with http://, https:// or www. (up to the next whitespace)
    (re.compile(r"(?:https?://|www\.)\S+"), " "),
    # any remaining escape sequence (\xe1, \u1ea1, ...) with the characters glued to it
    (re.compile(r"\\+\S+"), " "),
    # everything that is not an ASCII letter, digit or space (punctuation etc.)
    (re.compile(r"[^A-Za-z0-9 ]+"), " "),
    # bare digits and any word containing a digit
    (re.compile(r"\w*\d\w*"), ""),
    # runs of whitespace
    (re.compile(r"\s+"), " "),
)


def clean_comments(comments):
    """Normalise raw comment texts for vectorisation.

    Each comment is lower-cased and then passed through the substitutions in
    `_COMMENT_CLEANING_STEPS`: escape sequences, @usernames, URLs, punctuation
    and digit-bearing words are removed and whitespace is collapsed to single
    spaces. Leading/trailing single spaces are kept (no strip is performed).

    Compared with the previous version:
    - all patterns are raw strings (the old '\\s', '\\S', '\\/' and '\\.' inside
      normal strings are invalid escape sequences that trigger warnings);
    - the URL pattern no longer contains the " [0-9]" alternative (space + digit),
      which made a URL match also swallow a following token that starts with a
      digit (e.g. "www.foo.com 5-star" lost the word "star");
    - every comment is processed in a single pass instead of nine `apply` calls.

    Parameters
    ----------
    comments : pandas.Series of str
        Raw comment texts.

    Returns
    -------
    pandas.Series of str
        Cleaned comments, same index as `comments`.
    """
    def _clean_one(comment):
        text = comment.lower()
        for pattern, replacement in _COMMENT_CLEANING_STEPS:
            text = pattern.sub(replacement, text)
        return text

    return comments.apply(_clean_one)

In [55]:
# Replace the raw training comments with their cleaned versions
X_train = clean_comments(traindf['Comment'])

In [56]:
# Apply the identical cleaning to the test comments
X_test = clean_comments(testdf['Comment'])

For example this comment:

In [57]:
# Raw text of training comment 124, exactly as loaded from the CSV
traindf.Comment[124]

'"Nope. Not working for me either.32-23-34www.facebook.com/annagillmodel\\\\n\\\\n \\\\n\\\\nYou have my email! :) "'

Has been transformed into this:

In [58]:
# The same comment after clean_comments
X_train[124]

' nope not working for me either you have my email '

Our cleaned up data looks like this:

In [59]:
# Overview of the cleaned training comments
X_train

0                                      you fuck your dad 
1        i really don t understand your point it seems...
2        a majority of canadians can and has been wron...
3        listen if you dont wanna get married to a man...
4        c b xu bi t c ho kh c ng d ng cu chi nh c ho ...
                              ...                        
3942     you are both morons and that is never happening 
3943     many toolbars include spell check like yahoo ...
3944     sioux falls s d i told my boy he should call ...
3945     how about felix he is sure turning into one h...
3946     you re all upset defending this hipster band ...
Name: Comment, Length: 3947, dtype: object

##  Naive Bayes

Transform the comments into word count vectors using CountVectorizer from sklearn

In [62]:
# Bag-of-words vectorizer with the vocabulary capped at 4000 features
bow_vectorizer = CountVectorizer(max_features=4000)

# The vocabulary is learned from the train comments only; the test comments reuse it
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

Looking at the vector of the first comment in the train data:

In [63]:
# Term counts of the first training comment, highest counts first.
# The row labels are rebuilt from `vocabulary_` (term -> column index):
# `get_feature_names()` was removed in scikit-learn 1.2 and its replacement
# `get_feature_names_out()` does not exist before 1.0, whereas `vocabulary_`
# is available in every version.
bow_terms = sorted(bow_vectorizer.vocabulary_, key=bow_vectorizer.vocabulary_.get)

pd.DataFrame(X_train_bow[0:1].T.toarray(), index=bow_terms, columns=["counts"])\
.sort_values(by=["counts"],ascending=False)

Unnamed: 0,counts
you,1
your,1
dad,1
fuck,1
plutocrats,0
...,...
finals,0
financial,0
find,0
finding,0


Trying the Naive Bayes

In [64]:
# Gaussian Naive Bayes fitted on the densified BoW training matrix
nb = GaussianNB().fit(X_train_bow.toarray(), y_train)

# Predicted labels for the (densified) BoW test matrix
y_pred_nb_bow = nb.predict(X_test_bow.toarray())

In [168]:
# One 10-fold cross-validation pass that scores all four metrics on the same
# fitted models, instead of re-running the whole CV once per metric.
# The dense train matrix is built once and shared by every fold.
nb_bow_cv = cross_validate(
    nb, X_train_bow.toarray(), y_train, cv=10,
    scoring=['precision_macro', 'recall_macro', 'f1_macro', 'accuracy'])

# cross_validate stores the per-fold scores under 'test_<metric name>'
print("10-fold Cross Validation Precision NB for BoW:",
      np.mean(nb_bow_cv['test_precision_macro']))
print("10-fold Cross Validation Recall NB for BoW:",
      np.mean(nb_bow_cv['test_recall_macro']))
print("10-fold Cross Validation F-Measure NB for BoW:",
      np.mean(nb_bow_cv['test_f1_macro']))
print("10-fold Cross Validation Accuracy NB for BoW:",
      np.mean(nb_bow_cv['test_accuracy']))

10-fold Cross Validation Precision NB for BoW: 0.5840240145124168
10-fold Cross Validation Recall NB for BoW: 0.6038477436535545
10-fold Cross Validation F-Measure NB for BoW: 0.578019561577787
10-fold Cross Validation Accuracy NB for BoW: 0.6214971406541155


Test Scores:

In [218]:
# Per-class scores (average=None -> one value per label) plus overall accuracy
nb_bow_precision = metrics.precision_score(y_test, y_pred_nb_bow, average=None)
nb_bow_recall = metrics.recall_score(y_test, y_pred_nb_bow, average=None)
nb_bow_f1 = metrics.f1_score(y_test, y_pred_nb_bow, average=None)
nb_bow_accuracy = metrics.accuracy_score(y_test, y_pred_nb_bow)

print("Precision NB for BoW:", nb_bow_precision)
print("Recall NB for BoW:", nb_bow_recall)
print("F-Measure NB for BoW:", nb_bow_f1)
print()
print("Accuracy NB for BoW:", nb_bow_accuracy)

Precision NB for BoW: [0.54244032 0.49426063]
Recall NB for BoW: [0.35319516 0.67966574]
F-Measure NB for BoW: [0.42782427 0.57232213]

Accuracy NB for BoW: 0.5105145413870246


These scores are not very good... Let's improve them!

## Improving the scores of Naive Bayes

### 1) Lemmatization

Use lemmatization of words to improve the scores from the previous question, using the WordNetLemmatizer from nltk

In [100]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_comment(text):
    """Tokenize `text`, lemmatize every token and join the lemmas with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))


X_train_lem = X_train.apply(_lemmatize_comment)
X_test_lem = X_test.apply(_lemmatize_comment)

In [101]:
# Bag-of-words vectorizer (4000 features max), rebuilt for the lemmatized comments
bow_vectorizer = CountVectorizer(max_features=4000)

# Vocabulary learned from the lemmatized train comments, reused for the lemmatized test comments
X_train_bow = bow_vectorizer.fit_transform(X_train_lem)
X_test_bow = bow_vectorizer.transform(X_test_lem)

In [72]:
# Refit the Gaussian NB on the lemmatized BoW vectors, then label the test set
y_pred_nb_bow = nb.fit(X_train_bow.toarray(), y_train).predict(X_test_bow.toarray())

In [73]:
# Per-class F1 and overall accuracy on the test set
lem_f1 = metrics.f1_score(y_test, y_pred_nb_bow, average=None)
lem_accuracy = metrics.accuracy_score(y_test, y_pred_nb_bow)

print("F-Measure NB for Lemmatized BoW data:", lem_f1)
print()
print("Accuracy NB for Lemmatized BoW data:", lem_accuracy)

F-Measure NB for Lemmatized BoW data: [0.42752868 0.56974922]

Accuracy NB for Lemmatized BoW data: 0.508724832214765


### 2) Stop word filtering

Try a bag-of-words vector, removing stopwords

In [None]:
# Bag-of-words vectorizer (4000 features max) that drops English stop words
bow_vectorizer = CountVectorizer(max_features=4000, stop_words='english')

# Fitted on the cleaned (non-lemmatized) train comments; the test comments reuse the vocabulary
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [114]:
# Refit the Gaussian NB on the stop-word-free BoW vectors, then label the test set
y_pred_nb_bow = nb.fit(X_train_bow.toarray(), y_train).predict(X_test_bow.toarray())

In [115]:
# Per-class F1 and overall accuracy on the test set
stopword_f1 = metrics.f1_score(y_test, y_pred_nb_bow, average=None)
stopword_accuracy = metrics.accuracy_score(y_test, y_pred_nb_bow)

print("F-Measure NB for stopword free BoW data:", stopword_f1)
print()
print("Accuracy NB for stopword free BoW data:", stopword_accuracy)

F-Measure NB for stopword free BoW data: [0.42010582 0.5751938 ]

Accuracy NB for stopword free BoW data: 0.5096196868008949


### 3) Use of bigrams

Try a bag-of-words vector, including bigrams

In [129]:
# Bag-of-words vectorizer (4000 features max) over unigrams and bigrams
bow_vectorizer = CountVectorizer(ngram_range=(1,2), max_features=4000)

# Fitted on the cleaned (non-lemmatized) train comments; the test comments reuse the vocabulary
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [130]:
# Refit the Gaussian NB on the unigram+bigram BoW vectors, then label the test set
y_pred_nb_bow = nb.fit(X_train_bow.toarray(), y_train).predict(X_test_bow.toarray())

In [131]:
# These scores belong to the unigram+bigram BoW experiment; the labels previously
# said "Lemmatized BoW data" (copied from the lemmatization section), which
# mislabelled the printed results.
print("F-Measure NB for bigram BoW data:", metrics.f1_score(y_test, y_pred_nb_bow, average=None))
print()
print("Accuracy NB for bigram BoW data:",metrics.accuracy_score(y_test,y_pred_nb_bow))

F-Measure NB for Lemmatized BoW data: [0.39311494 0.59048333]

Accuracy NB for Lemmatized BoW data: 0.5109619686800895


### 4) Laplace Smoothing

In [125]:
# Back to the plain bag-of-words vectorizer (4000 features max, unigrams, no stop-word filter)
bow_vectorizer = CountVectorizer(max_features=4000)

# Fitted on the cleaned train comments; the test comments reuse the vocabulary
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

Setting `alpha=1` (the smoothing parameter of `MultinomialNB`) is called Laplace smoothing

_(https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes)_

In [126]:
# Multinomial Naive Bayes with alpha=1.0, i.e. Laplace (add-one) smoothing
mnb = MultinomialNB(alpha=1.0)

# MultinomialNB accepts scipy sparse matrices, so the BoW matrices are passed
# as-is instead of being densified with .toarray() (saves memory, same counts)
mnb.fit(X_train_bow, y_train)
# predict the BoW test set
y_pred_mnb_bow = mnb.predict(X_test_bow)

In [132]:
# Per-class F1 and overall accuracy of the multinomial NB on the test set
mnb_bow_f1 = metrics.f1_score(y_test, y_pred_mnb_bow, average=None)
mnb_bow_accuracy = metrics.accuracy_score(y_test, y_pred_mnb_bow)

print("F-Measure NB for BoW:", mnb_bow_f1)
print()
print("Accuracy NB for BoW:", mnb_bow_accuracy)

F-Measure NB for BoW: [0.62959381 0.68109908]

Accuracy NB for BoW: 0.6572706935123043


### Putting it all together:

In [148]:
# All improvements together: unigrams+bigrams, English stop words removed, 4000 features max
bow_vectorizer = CountVectorizer(ngram_range=(1,2), max_features=4000, stop_words='english')

# ... fitted on the lemmatized train comments; the lemmatized test comments reuse the vocabulary
X_train_bow = bow_vectorizer.fit_transform(X_train_lem)
X_test_bow = bow_vectorizer.transform(X_test_lem)

In [149]:
# Refit the multinomial NB on the improved BoW vectors.
# The sparse matrices are passed directly: MultinomialNB supports sparse input,
# so the previous .toarray() densification was unnecessary.
mnb.fit(X_train_bow, y_train)
# predict the BoW test set
y_pred_mnb_bow = mnb.predict(X_test_bow)

In [143]:
# One 10-fold cross-validation pass that scores all four metrics on the same
# fitted models, instead of re-running the whole CV once per metric.
# MultinomialNB supports sparse input, so no .toarray() is needed.
mnb_bow_cv = cross_validate(
    mnb, X_train_bow, y_train, cv=10,
    scoring=['precision_macro', 'recall_macro', 'f1_macro', 'accuracy'])

# cross_validate stores the per-fold scores under 'test_<metric name>'
print("10-fold Cross Validation Precision NB for BoW:",
      np.mean(mnb_bow_cv['test_precision_macro']))
print("10-fold Cross Validation Recall NB for BoW:",
      np.mean(mnb_bow_cv['test_recall_macro']))
print("10-fold Cross Validation F-Measure NB for BoW:",
      np.mean(mnb_bow_cv['test_f1_macro']))
print("10-fold Cross Validation Accuracy NB for BoW:",
      np.mean(mnb_bow_cv['test_accuracy']))

10-fold Cross Validation Precision NB for BoW: 0.7413755088411647
10-fold Cross Validation Recall NB for BoW: 0.7663864912480829
10-fold Cross Validation F-Measure NB for BoW: 0.7507234216792603
10-fold Cross Validation Accuracy NB for BoW: 0.7945229068945576


In [150]:
# Per-class scores (average=None -> one value per label) plus overall accuracy
mnb_precision = metrics.precision_score(y_test, y_pred_mnb_bow, average=None)
mnb_recall = metrics.recall_score(y_test, y_pred_mnb_bow, average=None)
mnb_f1 = metrics.f1_score(y_test, y_pred_mnb_bow, average=None)
mnb_accuracy = metrics.accuracy_score(y_test, y_pred_mnb_bow)

print("Precision NB for BoW:", mnb_precision)
print("Recall NB for BoW:", mnb_recall)
print("F-Measure NB for BoW:", mnb_f1)
print()
print("Accuracy NB for BoW:", mnb_accuracy)

Precision NB for BoW: [0.68045113 0.72044199]
Recall NB for BoW: [0.78151986 0.60538533]
F-Measure NB for BoW: [0.72749196 0.65792129]

Accuracy NB for BoW: 0.6966442953020134


Nice! **~20%** improvement in scores

## Creation of a custom feature vector: TF/IDF Vector & Part-of-Speech

### Part-of-Speech frequency features

Use nltk's pos_tag method for each word of every comment in our data. Set the tagset attribute to 'universal' in the pos_tag method.

In [86]:
def _universal_pos_tags(text):
    """Tokenize `text` and return its (token, tag) pairs using the 'universal' tagset."""
    return nltk.pos_tag(nltk.word_tokenize(text), tagset='universal')


X_train_tagged = X_train.apply(_universal_pos_tags)
X_test_tagged = X_test.apply(_universal_pos_tags)

Let's see what the result looks like for comment 1

In [87]:
# (token, universal POS tag) pairs of training comment 1
X_train_tagged[1]

[('i', 'NOUN'),
 ('really', 'ADV'),
 ('don', 'ADJ'),
 ('t', 'NOUN'),
 ('understand', 'VERB'),
 ('your', 'PRON'),
 ('point', 'NOUN'),
 ('it', 'PRON'),
 ('seems', 'VERB'),
 ('that', 'ADP'),
 ('you', 'PRON'),
 ('are', 'VERB'),
 ('mixing', 'VERB'),
 ('apples', 'NOUN'),
 ('and', 'CONJ'),
 ('oranges', 'NOUN')]

Frequency distribution (`nltk.FreqDist`) can be defined as a function mapping from each sample to the number of times that sample occurred as an outcome.<br>
It will be used to record the frequency of each part-of-speech tag in each comment

In [36]:
# Count how often each POS tag occurs in training comment 1 (the tokens themselves are ignored)
nltk.FreqDist(pos for _, pos in X_train_tagged[1])

FreqDist({'NOUN': 5, 'VERB': 4, 'PRON': 3, 'ADV': 1, 'ADJ': 1, 'ADP': 1, 'CONJ': 1})

**Function fractPOS:** Creates a list of dictionaries iterating through every comment passed in X_tagged. Each dictionary holds the fraction (=frequency_of_tag/number_of_words_in_comment) of each tag for that comment:

In [37]:
def fractPOS(X_tagged, tags=('ADV', 'VERB', 'ADJ', 'NOUN')):
    """Compute, per comment, the fraction of tokens carrying each requested POS tag.

    Parameters
    ----------
    X_tagged : iterable of list of (token, tag) tuples
        One tagged comment per element.
    tags : sequence of str, optional
        Tags to report; defaults to the four tags that were previously
        hard-coded in the function body ('ADV', 'VERB', 'ADJ', 'NOUN').

    Returns
    -------
    list of dict
        One dict per comment, in input order, mapping every tag in `tags`
        (in that order) to count_of_tag / number_of_tokens. A comment with no
        tokens maps every tag to 0.0.
    """
    fractions = []
    for tagged_comment in X_tagged:
        n_of_words = len(tagged_comment)
        # Counter returns 0 for tags that never occur in the comment
        tag_counts = Counter(tag for _, tag in tagged_comment)
        fractions.append({
            # explicit guard for empty comments instead of catching ZeroDivisionError
            tag: (tag_counts[tag] / n_of_words if n_of_words else 0.0)
            for tag in tags
        })
    return fractions

Dataframe is created using the list of dictionaries.<br>
_We chose not to fill the dataframe row by row, because iteratively appending rows to a DataFrame can be computationally intensive_ 

In [88]:
# POS tags used as custom features; they double as the column order of the frames below
tags = ['ADV', 'VERB', 'ADJ', 'NOUN']
# One row per comment (built in one go from the list of dicts), one column per tag fraction
X_train_freqdf = pd.DataFrame(fractPOS(X_train_tagged), columns=tags)
X_test_freqdf = pd.DataFrame(fractPOS(X_test_tagged), columns=tags)

Looking at our custom feature vector (dataframe) for the train data

In [89]:
# POS-tag fractions of the training comments (one row per comment)
X_train_freqdf

Unnamed: 0,ADV,VERB,ADJ,NOUN
0,0.000000,0.250000,0.000000,0.250000
1,0.062500,0.250000,0.062500,0.312500
2,0.086957,0.202899,0.057971,0.202899
3,0.033898,0.305085,0.084746,0.118644
4,0.000000,0.047619,0.063492,0.809524
...,...,...,...,...
3942,0.111111,0.333333,0.000000,0.111111
3943,0.076923,0.346154,0.076923,0.153846
3944,0.000000,0.307692,0.076923,0.269231
3945,0.111111,0.222222,0.027778,0.277778


### TF/IDF

Create a TF/IDF vector including bigrams and filtering out stopwords

In [167]:
# TF/IDF over unigrams and bigrams, English stop words removed, 4000 features max
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=4000,
    stop_words='english',
)

# Vocabulary and IDF weights come from the train comments; the test comments reuse them
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [168]:
# TF/IDF weights of the first training comment, highest weights first.
# The row labels are rebuilt from `vocabulary_` (term -> column index):
# `get_feature_names()` was removed in scikit-learn 1.2 and its replacement
# `get_feature_names_out()` does not exist before 1.0, whereas `vocabulary_`
# is available in every version.
tfidf_terms = sorted(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)

pd.DataFrame(X_train_tfidf[0:1].T.toarray(), index=tfidf_terms, columns=["tfidf"])\
.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
dad,0.855421
fuck,0.517933
pets,0.000000
ph,0.000000
philosophies,0.000000
...,...
food,0.000000
food stamps,0.000000
fool,0.000000
fooled,0.000000


### Combining the custom part-of-speech features with the TF/IDF vector

TFIDF vector is a matrix where the rows are comments and the columns are features.<br>
To combine all features, the custom features will be added as columns to the end of the TF/IDF matrix.<br>

In [169]:
# Check which container type the TF/IDF vectorizer returned
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

TF/IDF matrix is a sparse matrix (from Scipy).<br>
To save memory, do not convert to dense, rather use `scipy.sparse.hstack` to stack the matrices horizontally (column wise)

In [170]:
# Append the POS-fraction columns to the right of the TF/IDF columns without
# densifying the TF/IDF part. CSR is requested explicitly so that the result
# supports row indexing no matter which format hstack would pick by default.
# (`hstack` is imported together with the other dependencies in the import cell.)
X_train_combined = hstack([X_train_tfidf, X_train_freqdf], format="csr")
X_test_combined = hstack([X_test_tfidf, X_test_freqdf], format="csr")

The combined matrix consists of 3947 rows (same as the number of comments in the train dataset) and 4000 features from TFIDF + 4 custom part-of-speech features

In [171]:
# (rows, columns) of the combined train matrix
X_train_combined.shape

(3947, 4004)

Source:
_(https://stackoverflow.com/questions/48573174/how-to-combine-tfidf-features-with-other-features)_

## Support Vector Machines (SVM)

According to the assignment details, scoring should be calculated using classification accuracy and F1 score.

Find optimal parameters for the SVM model as shown in this example:<br>
https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html

In [23]:
# Candidate SVM settings: an RBF grid over (gamma, C) and a linear grid over C
rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
tuned_parameters = [rbf_grid, linear_grid]

# Metrics to optimise, one independent grid search each
scores = ['f1_macro', 'accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)

    clf = GridSearchCV(estimator=svm.SVC(), param_grid=tuned_parameters, scoring=score)
    clf.fit(X_train_combined, y_train)

    # Report the configuration that scored best for this metric
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()

# Tuning hyper-parameters for f1_macro
Best parameters set found on development set:
{'C': 1, 'kernel': 'linear'}

# Tuning hyper-parameters for accuracy
Best parameters set found on development set:
{'C': 1, 'kernel': 'linear'}



In [157]:
# Linear SVM with C=1 (the setting reported by the grid search), fitted on the combined features
svm_clf = svm.SVC(C=1, kernel='linear').fit(X_train_combined, y_train)

# Predicted labels for the combined test features
y_pred_svm = svm_clf.predict(X_test_combined)

In [113]:
# One 10-fold cross-validation pass that scores both metrics on the same fitted
# models, instead of re-running the whole (slow) SVM cross-validation per metric.
svm_cv = cross_validate(svm_clf, X_train_combined, y_train, cv=10,
                        scoring=['f1_macro', 'accuracy'])

# cross_validate stores the per-fold scores under 'test_<metric name>'
print("10-fold Cross Validation F-Measure SVM for custom feature vector:",
      np.mean(svm_cv['test_f1_macro']))
print("10-fold Cross Validation Accuracy SVM for custom feature vector:",
      np.mean(svm_cv['test_accuracy']))

10-fold Cross Validation F-Measure SVM for custom feature vector: 0.7716475529688832
10-fold Cross Validation Accuracy SVM for custom feature vector: 0.8386172331812635


Test Scores:

In [158]:
# Per-class F1 and overall accuracy of the SVM on the test set
svm_f1 = metrics.f1_score(y_test, y_pred_svm, average=None)
svm_accuracy = metrics.accuracy_score(y_test, y_pred_svm)

print("F-Measure SVM for custom feature vector:", svm_f1)
print("Accuracy SVM for custom feature vector:", svm_accuracy)

F-Measure SVM for custom feature vector: [0.75342948 0.56914567]
Accuracy SVM for custom feature vector: 0.6863534675615213


## Random Forest

In [159]:
# Random forests are randomized; a fixed seed makes the fitted model and every
# score derived from it reproducible across kernel restarts (any integer works).
rf = RandomForestClassifier(random_state=42)

# Train the model on the custom training set
rf.fit(X_train_combined, y_train)
# predict the custom test set
y_pred_rf = rf.predict(X_test_combined)

In [160]:
# One 10-fold cross-validation pass for both metrics. Besides halving the work,
# this guarantees that F1 and accuracy are measured on the *same* fitted forests;
# two separate cross_val_score runs train different forests when the estimator
# has no fixed seed.
rf_cv = cross_validate(rf, X_train_combined, y_train, cv=10,
                       scoring=['f1_macro', 'accuracy'])

# cross_validate stores the per-fold scores under 'test_<metric name>'
print("10-fold Cross Validation F-Measure RF for custom feature vector:",
      np.mean(rf_cv['test_f1_macro']))
print("10-fold Cross Validation Accuracy RF for custom feature vector:",
      np.mean(rf_cv['test_accuracy']))

10-fold Cross Validation F-Measure RF for custom feature vector: 0.7289609591596767
10-fold Cross Validation Accuracy RF for custom feature vector: 0.8208719398573538


Test Scores:

In [161]:
# Per-class F1 and overall accuracy of the random forest on the test set
rf_f1 = metrics.f1_score(y_test, y_pred_rf, average=None)
rf_accuracy = metrics.accuracy_score(y_test, y_pred_rf)

print("F-Measure RF for custom feature vector:", rf_f1)
print("Accuracy RF for custom feature vector:", rf_accuracy)

F-Measure RF for custom feature vector: [0.74498963 0.53172589]
Accuracy RF for custom feature vector: 0.6697986577181209


##  Naive Bayes

In [172]:
# Refit the multinomial NB on the combined TF/IDF + POS features.
# The sparse matrices are passed directly: MultinomialNB supports sparse input,
# so densifying them with .toarray() was unnecessary.
mnb.fit(X_train_combined, y_train)
# predict the custom test set (the variable keeps its historical "_bow" name
# because the scoring cell below reads it)
y_pred_mnb_bow = mnb.predict(X_test_combined)

Test Scores:

In [173]:
# Per-class F1 and overall accuracy of the multinomial NB on the combined features
nb_custom_f1 = metrics.f1_score(y_test, y_pred_mnb_bow, average=None)
nb_custom_accuracy = metrics.accuracy_score(y_test, y_pred_mnb_bow)

print("F-Measure NB for custom feature vector:", nb_custom_f1)
print()
print("Accuracy NB for custom feature vector:", nb_custom_accuracy)

F-Measure NB for custom feature vector: [0.71946986 0.31667948]

Accuracy NB for custom feature vector: 0.6022371364653244
