# Insulting Comments Detection System

The purpose of this project is to create a machine learning system that takes as input a comment, and ranks it as insulting or neutral.

## Import Libraries

In [2]:
# Ignoring unnecessory warnings
import warnings
warnings.filterwarnings("ignore")  
# Specialized container datatypes
import collections
# For Map vizualization
import folium
from nltk.corpus import genesis
# For data vizualization 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# For large and multi-dimensional arrays
import numpy as np
# For data manipulation and analysis
import pandas as pd
# Natural language processing library
import nltk
nltk.download('genesis')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.collocations import (
    BigramAssocMeasures,
    BigramCollocationFinder)
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS
# For image processing
from PIL import Image, ImageOps
# For random selection 
import random
# For basic cleaning and data preprocessing 
import re
import string 
# Communicating with operating and file system
import os
# Machine learning libary
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, normalize, label_binarize
from sklearn.svm import SVC
# For wordcloud generating 
from wordcloud import WordCloud

[nltk_data] Downloading package genesis to /home/nikos/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nikos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nikos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nikos/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Importing files and creating datasets

Given a directory with all the data, we create 2 different data frames; one for training our algorithms, and one for testing. 

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/impermium_verification_labels.csv')

In [4]:
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3947 entries, 0 to 3946
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Insult   3947 non-null   int64 
 1   Date     3229 non-null   object
 2   Comment  3947 non-null   object
dtypes: int64(1), object(2)
memory usage: 92.6+ KB


Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [5]:
test_df.info()
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2235 entries, 0 to 2234
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2235 non-null   int64 
 1   Insult   2235 non-null   int64 
 2   Date     2235 non-null   object
 3   Comment  2235 non-null   object
 4   Usage    2235 non-null   object
dtypes: int64(2), object(3)
memory usage: 87.4+ KB


Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,"""like this if you are a tribe fan""",PrivateTest
1,2,1,20120531215447Z,"""you're idiot.......................""",PrivateTest
2,3,1,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",PrivateTest
3,4,1,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",PrivateTest
4,5,1,20120602223825Z,"""haha green me red you now loser whos winning ...",PrivateTest


## Preprocessing and sanitizing the data

Firstly, we have to preprocess our training and test set. We are going to convert everything to lowercase, and remove any punctuation points, weird characters and links

In [6]:
def text_normalization(text):
    # convert text to lowercase
    text = text.lower()
    # remove all special characters, punctuation and spaces from string
    text = re.sub('\n|\r|\t', '', text)
    text = re.sub(r'[^\w\s]+', '', text)
    # first group of special chars: \u followed by a number
    text = re.sub('u\d\w+', '', text)
    # second group: \x followed by a letter
    text = re.sub('x[a-z]\d', '', text)
    # remove links
    text = re.sub(r'^http?://', ' ', text)
    text = re.sub(r'^www://', ' ', text)
    # return normalized text
    return text

In [7]:
def preprocess(input_df, is_test):
    if (is_test == False):
        # Remove rows with missing values in column col
        input_df.dropna(inplace=True)
    # Speed up code using numpy vectorization
    vfunc = np.vectorize(text_normalization)
    input_df.Comment = vfunc(input_df.Comment.values)
    # return processed input_df
    return input_df

Let's apply the above preprocessing techniques on training set

In [8]:
preprocessed_train_df = preprocess(train_df, False)
preprocessed_train_df

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,you fuck your dad
1,0,20120528192215Z,i really dont understand your point it seems t...
4,0,20120619094753Z,cc b xu bi txecnh 2011 c n ho khng ncc ng dn...
5,0,20120620171226Z,sdl ok but i would hope theyd sign him to a on...
6,0,20120503012628Z,yeah and where are you now
...,...,...,...
3942,1,20120502172717Z,you are both morons and that is never happening
3943,0,20120528164814Z,many toolbars include spell check like yahoo f...
3944,0,20120620142813Z,lambeauorwrigleykmossnsioux falls sd i told my...
3945,0,20120528205648Z,how about felix he is sure turning into one he...


Let's apply now the above preprocessing techniques on test set

In [9]:
preprocessed_test_df = preprocess(test_df, True)
preprocessed_test_df

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,like this if you are a tribe fan,PrivateTest
1,2,1,20120531215447Z,youre idiot,PrivateTest
2,3,1,20120823164228Z,i am a woman babs and the only war on women i ...,PrivateTest
3,4,1,20120826010752Z,wow you benefitted so many wins this year fro...,PrivateTest
4,5,1,20120602223825Z,haha green me red you now loser whos winning n...,PrivateTest
...,...,...,...,...,...
2230,2231,0,20120528100303Z,fuckin lame come on wtf stop fucking over my b...,PrivateTest
2231,2232,1,20120531185813Z,you shut your ignorant pie hole you little ins...,PrivateTest
2232,2233,0,20120529130822Z,sweetie pie is looking very much like her cous...,PrivateTest
2233,2234,1,20120531045826Z,ball4real where are you with your miami gayness,PrivateTest


## Classification 

### Using Naive Bayes classifier

At first, we are goint to use the classic Naive Bayes algorithm to classify our data, and try to achieve better results every time

Define a function to convert words to vectors and then call the NB algorithm and present the results

In [10]:
def NB_using_CV(train_df, test_df, bigrams, laplace):
    # We are going to use the sklearn's count vectorizer
    # Check if we want to call useing bigrams
    if (bigrams):
        cv = CountVectorizer(ngram_range=(1,2))
    else:
        cv = CountVectorizer()
    # Seperate our train data into x and y
    x_train = train_df['Processed_Comment']
    y_train = train_df.Insult
    # find the x for our test set
    x_test = test_df['Processed_Comment']
    y_test = test_df.Insult
    # use the cv in our sets to convert the words
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)
    # Call the NB algorithm
    if (laplace):
        # if we want a laplace smoothing, call multinomial nb, with the default alpha parameter of 1
        clf = MultinomialNB()
    else:
        # else, just call the gaussian nb
        clf = GaussianNB()
    clf.fit(x_train.toarray(), y_train)
    y_pred = clf.predict(x_test.toarray())
    # Print the results
    print(classification_report(y_test,y_pred))
    accuracy = accuracy_score(y_test, y_pred);
    print('Accuracy is {:.3f}'.format(accuracy))

### First try: no features

Important: All the modification of the data will be __temporary__, thus we are going to store the changes of each stage in a new column, named "processed_comment"

In [11]:
train_df['Processed_Comment'] = train_df.Comment
test_df['Processed_Comment'] = test_df.Comment

At first, without any preprocessing(apart from the one we did at the beggining), let's try and see the results of the NB algorithm

In [12]:
NB_using_CV(train_df, test_df, False, False)

              precision    recall  f1-score   support

           0       0.56      0.46      0.50      1158
           1       0.51      0.61      0.56      1077

    accuracy                           0.53      2235
   macro avg       0.54      0.53      0.53      2235
weighted avg       0.54      0.53      0.53      2235

Accuracy is 0.532


As we can see, our results are not that great. Let's try to do some preprocessing to the data. We're gonna start by lemmatization

### Second try: Lemmatization

Define a lemmatization function that is easy to use

In [13]:
def Lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    lem_sentence = []
    token_words = word_tokenize(text)
    lem_sentence = [lemmatizer.lemmatize(word) for word in token_words]
    text = " ".join(lem_sentence)
    return text

Apply the lemmatizing in our data, and check again for better results

In [14]:
# Apply the lemmatization in both our sets
vfunc = np.vectorize(Lemmatization);
train_df['Processed_Comment'] = vfunc(train_df['Processed_Comment'].values)
test_df['Processed_Comment'] = vfunc(test_df['Processed_Comment'].values)
#Re-run the NB algorithm
NB_using_CV(train_df, test_df, False, False)

              precision    recall  f1-score   support

           0       0.56      0.45      0.50      1158
           1       0.51      0.62      0.56      1077

    accuracy                           0.53      2235
   macro avg       0.53      0.53      0.53      2235
weighted avg       0.53      0.53      0.53      2235

Accuracy is 0.529


The results are just as horrible as before. Lets try another technique: remove all the stop words

### Third try: Remove stop-words

In [15]:
def Remove_stop_words(text):
    removed = remove_stopwords(text)
    text = "".join(removed)
    return text

Apply the method in our data, and check again for better results

In [16]:
# Apply the sotp words remover in both our sets
vfunc = np.vectorize(Remove_stop_words);
train_df['Processed_Comment'] = vfunc(train_df['Processed_Comment'].values)
test_df['Processed_Comment'] = vfunc(test_df['Processed_Comment'].values)
#Re-run the NB algorithm
NB_using_CV(train_df, test_df, False, False)

              precision    recall  f1-score   support

           0       0.56      0.44      0.49      1158
           1       0.51      0.63      0.56      1077

    accuracy                           0.53      2235
   macro avg       0.53      0.53      0.53      2235
weighted avg       0.53      0.53      0.53      2235

Accuracy is 0.529


### Fourth try: Use bigrams

Let's now check the results if we give our algorithm bigrams, instead of words. This techinque is known for producing better results

In [17]:
# Run the algorithm by passing "true" as a parameter, so the cv runs with a ngram_range as an argument
NB_using_CV(train_df, test_df, True, False)

              precision    recall  f1-score   support

           0       0.57      0.49      0.53      1158
           1       0.52      0.60      0.56      1077

    accuracy                           0.54      2235
   macro avg       0.55      0.55      0.54      2235
weighted avg       0.55      0.54      0.54      2235

Accuracy is 0.544


### Last try: Use LaPlace smoothing

Finally, we will try a technique called LaPlace smoothing, just by passing a different alpha parameter to the NB algorithm

In [18]:
# as a matter of fact, the correct parameter for alpha is 1
alpha = 1
# RUn the algorithm again
NB_using_CV(train_df, test_df, False, True)

              precision    recall  f1-score   support

           0       0.65      0.88      0.75      1158
           1       0.79      0.49      0.60      1077

    accuracy                           0.69      2235
   macro avg       0.72      0.68      0.67      2235
weighted avg       0.72      0.69      0.68      2235

Accuracy is 0.690


## Feature Engineering

Next up, we are going to add more features in order to achieve better results. 

### Part Of Speech Features

Oh, by the way, drop the processed comment, we're not gonna need it anymore

In [19]:
train_df.drop('Processed_Comment', 1)
test_df.drop('Processed_Comment', 1)

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,like this if you are a tribe fan,PrivateTest
1,2,1,20120531215447Z,youre idiot,PrivateTest
2,3,1,20120823164228Z,i am a woman babs and the only war on women i ...,PrivateTest
3,4,1,20120826010752Z,wow you benefitted so many wins this year fro...,PrivateTest
4,5,1,20120602223825Z,haha green me red you now loser whos winning n...,PrivateTest
...,...,...,...,...,...
2230,2231,0,20120528100303Z,fuckin lame come on wtf stop fucking over my b...,PrivateTest
2231,2232,1,20120531185813Z,you shut your ignorant pie hole you little ins...,PrivateTest
2232,2233,0,20120529130822Z,sweetie pie is looking very much like her cous...,PrivateTest
2233,2234,1,20120531045826Z,ball4real where are you with your miami gayness,PrivateTest


We are going to create 4 new columns in our dataframe, each one containing the fraction of a part of speech from the following: noun, verb, adverb, adjective.

In [20]:
# define 4 fumctions to extract the pos percentages
def count_nouns(text):
    tokens = nltk.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    count = sum ([1 for char in pos if (char[1] == "NN" or char[1] == "NNS" or char[1] == "NNP" or char[1] == "NNPS")])
    if (len(pos) != 0):
        return round(count/(len(pos)), 3)*100
    else:
        return 0

def count_verbs(text):
    tokens = nltk.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    count = sum ([1 for char in pos if (char[1] == "VB" or char[1] == "VBD" or char[1] == "VBG" or char[1] == "VBN" or char[1] == "VBP" or char[1] == "VBZ")])
    if (len(pos) != 0):
        return round(count/(len(pos)), 3)*100
    else:
        return 0


def count_adverbs(text):
    tokens = nltk.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    count = sum ([1 for char in pos if (char[1] == "RB" or char[1] == "RBR" or char[1] == "RBS")])
    if (len(pos) != 0):
        return round(count/(len(pos)), 3)*100
    else:
        return 0

def count_adjectives(text):
    tokens = nltk.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    count = sum ([1 for char in pos if (char[1] == "JJ" or char[1] == "JJR" or char[1] == "JJS")])
    if (len(pos) != 0):
        return round(count/(len(pos)), 3)*100
    else:
        return 0

Create 4 new columns in each set to store the new info that we gathered

In [21]:
train_df['noun%'] = train_df.Comment.apply(lambda row : count_nouns(row))
test_df['noun%'] = train_df.Comment.apply(lambda row : count_nouns(row))

train_df['verb%'] = train_df.Comment.apply(lambda row : count_verbs(row))
test_df['verb%'] = train_df.Comment.apply(lambda row : count_verbs(row))

train_df['adverb%'] = train_df.Comment.apply(lambda row : count_adverbs(row))
test_df['adverb%'] = train_df.Comment.apply(lambda row : count_adverbs(row))

train_df['adjectives%'] = train_df.Comment.apply(lambda row : count_adjectives(row))
test_df['adjectives%'] = train_df.Comment.apply(lambda row : count_adjectives(row))

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)


Let's take a look at the new columns of our dataframes

In [22]:
test_df

Unnamed: 0,id,Insult,Date,Comment,Usage,Processed_Comment,noun%,verb%,adverb%,adjectives%
0,1,0,20120603163526Z,like this if you are a tribe fan,PrivateTest,like tribe fan,25.0,25.0,0.0,0.0
1,2,1,20120531215447Z,youre idiot,PrivateTest,youre idiot,26.7,26.7,6.7,6.7
2,5,1,20120602223825Z,haha green me red you now loser whos winning n...,PrivateTest,haha green red loser winning moron,71.2,4.5,0.0,19.7
3,6,0,20120603202442Z,nme and god both hatefaggotsnnwhats the differ...,PrivateTest,nme god hatefaggotsnnwhats difference afagand ...,16.7,14.6,2.1,16.7
4,7,1,20120603163604Z,oh go kiss the ass of a goatand you dummycraps...,PrivateTest,oh kiss goatand dummycraps insult veteranseven...,16.7,16.7,16.7,0.0
...,...,...,...,...,...,...,...,...,...,...
1826,2230,1,20120825233538Z,you must have crack in your bong as it aint weed,PrivateTest,crack bong aint weed,28.1,21.1,3.5,3.5
1827,2231,0,20120528100303Z,fuckin lame come on wtf stop fucking over my b...,PrivateTest,fuckin lame come wtf stop fucking bity shit,0.0,66.7,0.0,0.0
1828,2232,1,20120531185813Z,you shut your ignorant pie hole you little ins...,PrivateTest,shut ignorant pie hole little insignificunt in...,33.3,13.3,6.7,0.0
1829,2233,0,20120529130822Z,sweetie pie is looking very much like her cous...,PrivateTest,sweetie pie looking like cousin maggie ewhat b...,26.0,20.0,8.0,8.0


### TF-IDF Features

Next up, we are going to transform our data to a feature vector. In order to do that, we're gonna use TfidfVectorizer

First for our train dataset

In [23]:
tfidf_vec = TfidfVectorizer()
tfidf_train_vec = tfidf_vec.fit_transform(train_df.Comment)
tfidf_train_data = pd.DataFrame(tfidf_train_vec.toarray())
tfidf_train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13378,13379,13380,13381,13382,13383,13384,13385,13386,13387
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Then, for our test set

In [24]:
tfidf_vec_test = tfidf_vec.transform(test_df.Comment)
tfidf_test_data = pd.DataFrame(tfidf_vec_test.toarray())
tfidf_test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13378,13379,13380,13381,13382,13383,13384,13385,13386,13387
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Add all the features that we extracted in our final data

In [25]:
final_train_data = pd.concat([train_df['noun%'], train_df['verb%'], train_df['adverb%'], train_df['adjectives%'], tfidf_train_data], axis=1)
final_test_data = pd.concat([test_df['noun%'], test_df['verb%'], test_df['adverb%'], test_df['adjectives%'], tfidf_test_data], axis=1)

## Classification using SVM and RandomForests

Now that we've extracted those features, we are going to check them with 2 different classifiers

### SVM

Our first classifier is SVM

In [34]:
clf = SVC(probability=True, kernel='rbf')
y_train = train_df.Insult
clf.fit(final_train_data, y_train)
y_pred = clf.predict(final_test_data)

Let's take a look at our scores

In [37]:
y_test = test_df.Insult
# Print the results
print(classification_report(y_test,y_pred))
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy is {:.3f}'.format(accuracy))

              precision    recall  f1-score   support

           0       0.52      1.00      0.69       955
           1       0.00      0.00      0.00       876

    accuracy                           0.52      1831
   macro avg       0.26      0.50      0.34      1831
weighted avg       0.27      0.52      0.36      1831

Accuracy is 0.522


Next, we're gonna try another classifier

### RandomForests

In [38]:
clf = RandomForestClassifier()
y_train = train_df.Insult
clf.fit(final_train_data, y_train)
y_pred = clf.predict(final_test_data)

Let's take a look at our scores

In [39]:
y_test = test_df.Insult
# Print the results
print(classification_report(y_test,y_pred))
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy is {:.3f}'.format(accuracy))

              precision    recall  f1-score   support

           0       0.59      0.97      0.74       955
           1       0.89      0.28      0.42       876

    accuracy                           0.64      1831
   macro avg       0.74      0.62      0.58      1831
weighted avg       0.73      0.64      0.59      1831

Accuracy is 0.637
