# Insulting Comments Detection System

The purpose of this project is to create a machine learning system that takes as input a comment, and ranks it as insulting or neutral.

## Import Libraries

In [32]:
# Ignoring unnecessory warnings
import warnings
warnings.filterwarnings("ignore")  
# Specialized container datatypes
import collections
# For Map vizualization
import folium
from nltk.corpus import genesis
# For data vizualization 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# For large and multi-dimensional arrays
import numpy as np
# For data manipulation and analysis
import pandas as pd
# Natural language processing library
import nltk
nltk.download('genesis')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.collocations import (
    BigramAssocMeasures,
    BigramCollocationFinder)
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS
# For image processing
from PIL import Image, ImageOps
# For random selection 
import random
# For basic cleaning and data preprocessing 
import re
import string 
# Communicating with operating and file system
import os
# Machine learning libary
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, normalize, label_binarize
from sklearn.svm import SVC
# For wordcloud generating 
from wordcloud import WordCloud

[nltk_data] Downloading package genesis to /home/nikos/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nikos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nikos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing files and creating datasets

Given a directory with all the data, we create 2 different data frames; one for training our algorithms, and one for testing. 

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/impermium_verification_labels.csv')

In [3]:
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3947 entries, 0 to 3946
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Insult   3947 non-null   int64 
 1   Date     3229 non-null   object
 2   Comment  3947 non-null   object
dtypes: int64(1), object(2)
memory usage: 92.6+ KB


Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [4]:
test_df.info()
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2235 entries, 0 to 2234
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2235 non-null   int64 
 1   Insult   2235 non-null   int64 
 2   Date     2235 non-null   object
 3   Comment  2235 non-null   object
 4   Usage    2235 non-null   object
dtypes: int64(2), object(3)
memory usage: 87.4+ KB


Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,"""like this if you are a tribe fan""",PrivateTest
1,2,1,20120531215447Z,"""you're idiot.......................""",PrivateTest
2,3,1,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",PrivateTest
3,4,1,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",PrivateTest
4,5,1,20120602223825Z,"""haha green me red you now loser whos winning ...",PrivateTest


## Preprocessing and sanitizing the data

Firstly, we have to preprocess our training and test set. We are going to convert everything to lowercase, and remove any punctuation points, weird characters and links

In [5]:
def text_normalization(text):
    # convert text to lowercase
    text = text.lower()
    # remove all special characters, punctuation and spaces from string
    text = re.sub(r'\*',' ', text)
    # remove links
    text = re.sub(r'^https?://', ' ', text)
    # return normalized text
    return text

In [6]:
def preprocess(input_df, is_test):
    if (is_test == False):
        # Remove rows with missing values in column col
        input_df.dropna(inplace=True)
    # Speed up code using numpy vectorization
    vfunc = np.vectorize(text_normalization)
    input_df.Comment = vfunc(input_df.Comment.values)
    # return processed input_df
    return input_df

Let's apply the above preprocessing techniques on training set

In [7]:
preprocessed_train_df = preprocess(train_df, False)
preprocessed_train_df

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""you fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 it ..."
4,0,20120619094753Z,"""c\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."
5,0,20120620171226Z,"""@sdl ok, but i would hope they'd sign him to ..."
6,0,20120503012628Z,"""yeah and where are you now?"""
...,...,...,...
3942,1,20120502172717Z,"""you are both morons and that is never happening"""
3943,0,20120528164814Z,"""many toolbars include spell check, like yahoo..."
3944,0,20120620142813Z,"""@lambeauorwrigley\xa0\xa0@k.moss\xa0\nsioux f..."
3945,0,20120528205648Z,"""how about felix? he is sure turning into one ..."


Let's apply now the above preprocessing techniques on test set

In [8]:
preprocessed_test_df = preprocess(test_df, True)
preprocessed_test_df

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,"""like this if you are a tribe fan""",PrivateTest
1,2,1,20120531215447Z,"""you're idiot.......................""",PrivateTest
2,3,1,20120823164228Z,"""i am a woman babs, and the only ""war on women...",PrivateTest
3,4,1,20120826010752Z,"""wow & you benefitted so many wins this year f...",PrivateTest
4,5,1,20120602223825Z,"""haha green me red you now loser whos winning ...",PrivateTest
...,...,...,...,...,...
2230,2231,0,20120528100303Z,"""fuckin lame come on wtf stop fucking over my ...",PrivateTest
2231,2232,1,20120531185813Z,"""you shut your ignorant pie hole you little in...",PrivateTest
2232,2233,0,20120529130822Z,"""sweetie pie is looking very much like her cou...",PrivateTest
2233,2234,1,20120531045826Z,"""ball4real where are you with your miami g-ayn...",PrivateTest


## Classification 

At first, we are goint to use the classic Naive Bayes algorithm to classify our data, and try to achieve better results every time

Define a function to convert words to vectors and then call the NB algorithm and present the results

In [12]:
def NB_using_CV(train_df, test_df, column):
    # We are going to use the sklearn's count vectorizer
    cv = CountVectorizer() 
    # Seperate our train data into x and y
    x_train = train_df[column]
    y_train = train_df.Insult
    # find the x for our test set
    x_test = test_df[column]
    y_test = test_df.Insult
    # use the cv in our sets to convert the words
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)
    # Call the NB algorithm
    clf = MultinomialNB()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Print the results
    print(classification_report(y_test,y_pred))
    accuracy = accuracy_score(y_test, y_pred);
    print('Accuracy is {:.3f}'.format(accuracy))

At first, without any preprocessing(apart from the one we did at the beggining), let's try and see the results of the NB algorithm

In [13]:
NB_using_CV(train_df, test_df, 'Comment')

              precision    recall  f1-score   support

           0       0.67      0.78      0.72      1158
           1       0.72      0.59      0.65      1077

    accuracy                           0.69      2235
   macro avg       0.70      0.69      0.69      2235
weighted avg       0.69      0.69      0.69      2235

Accuracy is 0.691


As we can see, our results are not that great. Let's try to do some preprocessing to the data. We're gonna start by lemmatization

Define a lemmatization function that is easy to use

In [14]:
def Lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    lem_sentence = []
    token_words = word_tokenize(text)
    lem_sentence = [lemmatizer.lemmatize(word) for word in token_words]
    text = " ".join(lem_sentence)
    return text

Apply the lemmatizing in our data, and check again for better results

In [16]:
# Apply the lemmatization in both our sets
vfunc = np.vectorize(Lemmatization);
train_df.Comment = vfunc(train_df.Comment.values)
test_df.Comment = vfunc(test_df.Comment.values)
#Re-run the NB algorithm
NB_using_CV(train_df, test_df, 'Comment')

              precision    recall  f1-score   support

           0       0.68      0.78      0.72      1158
           1       0.72      0.60      0.65      1077

    accuracy                           0.69      2235
   macro avg       0.70      0.69      0.69      2235
weighted avg       0.69      0.69      0.69      2235

Accuracy is 0.692


The results are just as horrible as before. Lets try another technique: remove all the stop words

In [17]:
def Remove_stop_words(text):
    removed = remove_stopwords(text)
    text = "".join(removed)
    return text

Apply the method in our data, and check again for better results

In [40]:
# Apply the sotp words remover in both our sets
vfunc = np.vectorize(Remove_stop_words);
train_df.Comment = vfunc(train_df.Comment.values)
test_df.Comment = vfunc(test_df.Comment.values)
#Re-run the NB algorithm
NB_using_CV(train_df, test_df, 'Comment')

              precision    recall  f1-score   support

           0       0.67      0.85      0.75      1158
           1       0.77      0.55      0.64      1077

    accuracy                           0.70      2235
   macro avg       0.72      0.70      0.69      2235
weighted avg       0.72      0.70      0.70      2235

Accuracy is 0.703


In [48]:
def bigrams(text):
#     nltk_tokens = nltk.word_tokenize(text)
#     return list(nltk.bigrams(nltk_tokens))
    bgrams = list()
    for line in text:
        token = nltk.word_tokenize(line)
        bigramss[line] = list(ngrams(token, 2)) 
    return bigrams

In [49]:
# Apply the sotp words remover in both our sets
vfunc = np.vectorize(bigrams);
train_df.Bigram = vfunc(train_df.Comment.values)
print(train_df.Bigram)
test_df.Bigram = vfunc(test_df.Comment.values)
#Re-run the NB algorithm
NB_using_CV(train_df, test_df, 'Bigram')

NameError: name 'bigramss' is not defined