In [1]:
import time
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB



In [2]:
# Load the two tab-separated files into DataFrames.
# `comments` uses the file's first column as its index (displayed as `rev_id`
# when the frame is printed); `annotations` keeps a default integer index and
# exposes `rev_id` as a regular column.
# Both paths are relative, so the files must be in the working directory.
comments = pd.read_csv('attack_annotated_comments.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')

In [3]:
# Report how many distinct rev_id values appear in the annotations.
# nunique(dropna=False) counts a missing value as its own entry, which is
# exactly what len(unique()) does.
print('There are', annotations['rev_id'].nunique(dropna=False), 'unique rev_id')

There are 115864 unique rev_id


In [4]:
# Majority vote per revision: a rev_id is labelled an attack when the mean of
# its `attack` annotations is strictly above 0.5 (an exact tie stays False).
# Assumes `attack` holds a 0/1 flag per annotation - TODO confirm.
labels = (
    annotations
    .groupby('rev_id')['attack']
    .mean()
    .gt(0.5)
)


In [5]:
# Attach the majority-vote label to each comment.
# Assigning a Series to a DataFrame column aligns on the index, so every row
# receives the label computed for its own rev_id regardless of row order; a
# comment whose rev_id is absent from `labels` would receive NaN.
comments['attack'] = labels

In [6]:
# Parsing: replace the NEWLINE_TOKEN and TAB_TOKEN placeholders with a single
# space each (newline token first, then tab token), in one pass over the column.
comments['comment'] = comments['comment'].apply(
    lambda text: text.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ")
)

In [7]:
# Sanity check: preview the first rows of the labelled frame, then the first
# comments whose `attack` label is True.
print('This is your DataFrame:\n')
print(comments.head())
print('These are the head of the comments classified as an attack\n')
# query('attack') keeps only the rows where the `attack` column evaluates to True
print(comments.query('attack')['comment'].head())

This is your DataFrame:

                                                  comment  year  logged_in  \
rev_id                                                                       
37675   `- This is not ``creative``.  Those are the di...  2002      False   
44816   `  :: the term ``standard model`` is itself le...  2002      False   
49851     True or false, the situation as of March 200...  2002      False   
89320    Next, maybe you could work on being less cond...  2002       True   
93890                This page will need disambiguation.   2002       True   

             ns  sample  split  attack  
rev_id                                  
37675   article  random  train   False  
44816   article  random  train   False  
49851   article  random  train   False  
89320   article  random    dev   False  
93890   article  random  train   False  
These are the head of the comments classified as an attack

rev_id
801279             Iraq is not good  ===  ===  USA is bad   
2702703      

In [8]:
# Features are the cleaned comment strings; the target is the majority-vote label.
X, y = comments['comment'], comments['attack']

In [9]:
# Hold out 33% of the comments for evaluation; the fixed random_state makes the
# split reproducible.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20, so use the `train_test_split` already imported from
# `sklearn.model_selection` in the first cell. The `from sklearn import
# cross_validation` line there is now unused and can be dropped.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=53)

In [10]:
# Setup the pipeline: character n-gram counts -> TF-IDF weighting -> Naive Bayes
steps = [
    # counts of character 1- to 5-grams, vocabulary capped at 10000 features
    ('vec', CountVectorizer(analyzer='char', ngram_range=(1, 5), max_features=10000)),
    # sublinear term-frequency scaling, rows normalised to unit L2 norm
    ('tfidf', TfidfTransformer(norm='l2', sublinear_tf=True)),
    # multinomial Naive Bayes with a small additive smoothing term
    ('clf', MultinomialNB(alpha=0.01)),
]

pipeline = Pipeline(steps=steps)

In [11]:
# Fit to the training set: each transformer is fitted on X_train in turn and
# the final classifier is trained on the transformed output.
# The call is left as the cell's last expression so the fitted pipeline's repr
# is displayed.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vec', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
        strip...inear_tf=True, use_idf=True)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [12]:
# Predict the labels of the test set: y_pred
# (kept in a variable so the classification report further down can reuse it)
y_pred = pipeline.predict(X_test)

In [13]:
# Compute and print the test-set accuracy.
# The predictions already stored in y_pred are reused instead of calling
# pipeline.score(), which would vectorise and classify the whole test set a
# second time; for a classifier, score() is the accuracy of predict(X) against
# y, so the printed value is the same.
print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_pred)))

Accuracy: 0.9200753216863689


In [14]:
# Per-class precision, recall, F1 and support for the test-set predictions.
# The True class has far fewer examples than the False class, so these
# per-class figures say more about attack detection than overall accuracy.
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      False       0.95      0.96      0.95     33671
       True       0.68      0.63      0.65      4565

avg / total       0.92      0.92      0.92     38236

