In [1]:
# import necessary tools
import time
import pandas as pd

# train_test_split splits the data into training and testing sets
from sklearn.model_selection import train_test_split

# CountVectorizer turns the text into bag-of-words vectors;
# each token acts as a feature for the ML classification problem
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.neural_network import MLPClassifier

# metrics module used to evaluate the model performance
from sklearn import metrics

In [2]:
# Read the two tab-separated input files into DataFrames.
# The comments table uses its first column as the index.
comments = pd.read_csv(
    'attack_annotated_comments.tsv',
    sep='\t',
    index_col=0,
)
annotations = pd.read_csv(
    'attack_annotations.tsv',
    sep='\t',
)

In [3]:
# Report how many distinct rev_id values occur in the annotations
unique_rev_ids = annotations['rev_id'].unique()
print('There are', len(unique_rev_ids), 'unique rev_id')

There are 115864 unique rev_id


In [4]:
# A comment is labelled an attack when the mean of its annotators' 'attack'
# values is strictly greater than 0.5 (a majority, assuming 0/1 votes).
mean_attack_vote = annotations.groupby('rev_id')['attack'].mean()
labels = mean_attack_vote > 0.5

In [5]:
# Store the per-rev_id labels as a new 'attack' column of DataFrame 'comments'.
# pandas aligns the assigned Series with the DataFrame by index label, so each
# comment receives the label computed for its own rev_id.
comments['attack'] = labels

In [6]:
# Parsing: replace the "NEWLINE_TOKEN" and "TAB_TOKEN" placeholders in
# 'comment' with a single space each (they are substituted, not deleted, so
# the neighbouring words stay separated).
# The vectorised .str accessor replaces the per-row Python lambda and lets
# missing values pass through instead of raising AttributeError.
# regex=False treats both patterns as literal substrings.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

In [7]:
# Preview the first rows of the labelled comments table.
# sep='\n' reproduces the line break that two separate print calls would emit.
print('This is your DataFrame:\n', comments.head(), sep='\n')

This is your DataFrame:

                                                  comment  year  logged_in  \
rev_id                                                                       
37675   `- This is not ``creative``.  Those are the di...  2002      False   
44816   `  :: the term ``standard model`` is itself le...  2002      False   
49851     True or false, the situation as of March 200...  2002      False   
89320    Next, maybe you could work on being less cond...  2002       True   
93890                This page will need disambiguation.   2002       True   

             ns  sample  split  attack  
rev_id                                  
37675   article  random  train   False  
44816   article  random  train   False  
49851   article  random  train   False  
89320   article  random    dev   False  
93890   article  random  train   False  


In [8]:
# Show the first few comment texts among the rows whose 'attack' value is True
attack_rows = comments.query('attack')
print('These is the head of the comments classified as an attack\n')
print(attack_rows['comment'].head())


These is the head of the comments classified as an attack

rev_id
801279             Iraq is not good  ===  ===  USA is bad   
2702703      ____ fuck off you little asshole. If you wan...
4632658         i have a dick, its bigger than yours! hahaha
6545332      == renault ==  you sad little bpy for drivin...
6545351      == renault ==  you sad little bo for driving...
Name: comment, dtype: object


In [9]:
# create y, the outcome label the model has to learn:
# the 'attack' column of 'comments' (one value per comment)
y = comments['attack']

In [10]:
# Hold out 33% of the comments for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(
    comments['comment'],
    y,
    test_size=0.33,
    random_state=53,
)
X_train, X_test, y_train, y_test = split_parts


In [11]:
# Bag-of-words vectorizer, constructed with its default settings
count_vectorizer = CountVectorizer()

In [12]:
# Fit the vectorizer on the training comments only, then encode the test
# comments with that already-fitted vectorizer.
train_texts = X_train.values
test_texts = X_test.values
count_train = count_vectorizer.fit_transform(train_texts)
count_test = count_vectorizer.transform(test_texts)

In [14]:
# Multi-layer perceptron with two hidden layers (5 and 2 units), trained with
# the 'lbfgs' solver and a fixed random_state.
mlp_settings = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)
clf = MLPClassifier(**mlp_settings)
# Kept as the cell's last expression so the fitted estimator's repr is displayed
clf.fit(count_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [15]:
# Predict a label for every vectorized test comment
pred = clf.predict(count_test)

In [16]:
# Share of test comments whose predicted label equals the true label
test_accuracy = metrics.accuracy_score(y_test, pred)
print(test_accuracy)

0.939794957632


In [17]:
# Further evaluation of the model: a confusion matrix of correct/incorrect
# labels (classes ordered False, True), followed by the classification report.
class_order = [False, True]
confusion = metrics.confusion_matrix(y_test, pred, labels=class_order)
report = metrics.classification_report(y_test, pred)
print(confusion)
print('\n')
print(report)

[[32792   879]
 [ 1423  3142]]


             precision    recall  f1-score   support

      False       0.96      0.97      0.97     33671
       True       0.78      0.69      0.73      4565

avg / total       0.94      0.94      0.94     38236

