In [1]:
# import necessary tools
import time
import pandas as pd

# train_test_split splits the data into training and testing sets
from sklearn.model_selection import train_test_split

# CountVectorizer turns the text into bag-of-words vectors;
# each token acts as a feature for the ML classification problem
from sklearn.feature_extraction.text import CountVectorizer

# importing the naive bayes model class MultinomialNB (NaiveBayes),
# which works well with count_vectorizers as it expects integer inputs
from sklearn.naive_bayes import MultinomialNB

# metrics module used to evaluate the model's performance
from sklearn import metrics

In [3]:
# read the per-annotator judgements and the comment texts from their TSV files;
# the first column of the comments file becomes the DataFrame index
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')
comments = pd.read_csv(
    'attack_annotated_comments.tsv',
    sep='\t',
    index_col=0,
)

In [4]:
# report how many distinct rev_id values appear in the annotations
# (dropna=False keeps the count identical to len(Series.unique()))
print('There are', annotations['rev_id'].nunique(dropna=False), 'unique rev_id')

There are 115864 unique rev_id


In [5]:
# majority vote: a comment is labelled an attack when the mean of its
# annotators' 'attack' values is strictly greater than 0.5
labels = (
    annotations
    .groupby('rev_id')['attack']
    .mean()
    .gt(0.5)
)

In [6]:
# add the majority-vote result as column 'attack' of DataFrame 'comments',
# aligning the labels to the comments' index explicitly
comments['attack'] = labels.reindex(comments.index)

In [8]:
# Parsing: replace the "NEWLINE_TOKEN" and "TAB_TOKEN" placeholders in 'comment'
# with a single space each.
# The vectorized .str accessor is used instead of .apply(lambda ...): it leaves
# missing values untouched rather than raising AttributeError on them.
# Neither pattern contains regex metacharacters, so the result is the same
# whether pandas interprets the patterns as regexes or as literal strings.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ")
    .str.replace("TAB_TOKEN", " ")
)

In [9]:
# preview the first rows; the heading already ends with a newline, so joining
# the two arguments with '\n' leaves one blank line between heading and table
print('This is your DataFrame:\n', comments.head(), sep='\n')

This is your DataFrame:

                                                  comment  year  logged_in  \
rev_id                                                                       
37675   `- This is not ``creative``.  Those are the di...  2002      False   
44816   `  :: the term ``standard model`` is itself le...  2002      False   
49851     True or false, the situation as of March 200...  2002      False   
89320    Next, maybe you could work on being less cond...  2002       True   
93890                This page will need disambiguation.   2002       True   

             ns  sample  split  attack  
rev_id                                  
37675   article  random  train   False  
44816   article  random  train   False  
49851   article  random  train   False  
89320   article  random    dev   False  
93890   article  random  train   False  


In [10]:
print('These is the head of the comments classified as an attack\n')
# boolean-mask selection: keep the text of the rows whose 'attack' label is True
print(comments.loc[comments['attack'], 'comment'].head())

These is the head of the comments classified as an attack

rev_id
801279             Iraq is not good  ===  ===  USA is bad   
2702703      ____ fuck off you little asshole. If you wan...
4632658         i have a dick, its bigger than yours! hahaha
6545332      == renault ==  you sad little bpy for drivin...
6545351      == renault ==  you sad little bo for driving...
Name: comment, dtype: object


In [11]:
# y is the target: the 'attack' label the model has to learn to predict
y = comments.loc[:, 'attack']

Now we split the DataFrame into training and testing data.
We split the features (the 'comment' column) and the label y (the 'attack' column) based on a given test size, here 0.33 (33%).

The function marks 33% of the rows as test data and removes them from the training data.
The test data is later used to see how well the model has learned.

The resulting data from train_test_split() are:

training data as X_train, 
training labels as y_train,
testing data as X_test and
testing labels as y_test

(We use random state, so we can have repeatable results when we run the code again)

In [12]:
# hold out 33% of the comments for testing; the fixed random_state makes the
# split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    comments['comment'],
    y,
    test_size=0.33,
    random_state=53,
)

We create a count vectorizer that turns the text into bag-of-words vectors.
Each token acts as a feature for the machine learning classification problem.

In [13]:
# bag-of-words vectorizer, created with no arguments (library defaults)
count_vectorizer = CountVectorizer()

Calling fit_transform on the training data creates the bag-of-words vectors.
It generates a mapping from words to IDs, and vectors
representing how many times each word appears in a comment.

In [14]:
# fit the vectorizer on the training comments and turn them into count vectors
count_train = count_vectorizer.fit_transform(X_train.values)
# transform only (no fitting): the test comments are encoded with the
# vectorizer that was already fitted on the training comments
count_test = count_vectorizer.transform(X_test.values)

Now it is time to build the Naive Bayes classifier

In [15]:
# class initialization and fitting on the training data
nb_classifier = MultinomialNB()
# fit() is the last expression of the cell, so the notebook displays its return value
nb_classifier.fit(count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

We call predict on the count-vectorized test data.
predict uses the trained model to predict the label of each comment from its test data vector.

In [16]:
# predict a label for every test comment and keep the predictions in `pred`
# so they can be compared against y_test to measure accuracy
pred = nb_classifier.predict(count_test)

The results:

In [17]:
# accuracy of the predictions on the held-out test labels
print(metrics.accuracy_score(y_true=y_test, y_pred=pred))

0.934302751334


In [22]:
# further evaluation of the model: the confusion matrix (label order False, True)
# shows correct/incorrect labels, followed by the per-class report.
# The '\n' argument joined with sep='\n' reproduces the blank lines that a
# separate print('\n') would emit between the two outputs.
print(
    metrics.confusion_matrix(y_test, pred, labels=[False, True]),
    '\n',
    metrics.classification_report(y_test, pred),
    sep='\n',
)

[[32920   751]
 [ 1761  2804]]


             precision    recall  f1-score   support

      False       0.95      0.98      0.96     33671
       True       0.79      0.61      0.69      4565

avg / total       0.93      0.93      0.93     38236

