In [1]:
# import necessary tools
import time
import pandas as pd

# train_test_split split the dataframe into training and testing data
from sklearn.model_selection import train_test_split

# CountVectorizer turns the text into a bag-of-words vectors
# each tokens acts as a feature for the ML classification problem
from sklearn.feature_extraction.text import CountVectorizer

# The goal of using tf-idf instead of the raw frequencies 
# of occurrence of a token in a given document is to scale down
# the impact of tokens that occur very frequently in a given corpus
# and that are hence empirically less informative than features 
# that occur in a small fraction of the training corpus
from sklearn.feature_extraction.text import TfidfTransformer

# LogisticRegression
from sklearn.linear_model import LogisticRegression

# metric model to evaluate the model performance
from sklearn import metrics

In [2]:
# load data into DataFrame
comments = pd.read_csv('attack_annotated_comments.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')

In [3]:
# print the # of unique rev_id
print('There are', len(annotations['rev_id'].unique()), 'unique rev_id')

There are 115864 unique rev_id


In [4]:
# labels a comment as an attack if the majority of annotators did so
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

In [5]:
# insert column 'labels' in DataFrame 'comments'
comments['attack'] = labels

In [6]:
# Parsing: remove "NEWLINE_TOKEN" and "TAB_TOKEN" tokens from 'comment'
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))

In [7]:
print('This is your DataFrame:\n')
print(comments.head())

This is your DataFrame:

                                                  comment  year  logged_in  \
rev_id                                                                       
37675   `- This is not ``creative``.  Those are the di...  2002      False   
44816   `  :: the term ``standard model`` is itself le...  2002      False   
49851     True or false, the situation as of March 200...  2002      False   
89320    Next, maybe you could work on being less cond...  2002       True   
93890                This page will need disambiguation.   2002       True   

             ns  sample  split  attack  
rev_id                                  
37675   article  random  train   False  
44816   article  random  train   False  
49851   article  random  train   False  
89320   article  random    dev   False  
93890   article  random  train   False  


In [8]:
print('These is the head of the comments classified as an attack\n')
print(comments.query('attack')['comment'].head())

These is the head of the comments classified as an attack

rev_id
801279             Iraq is not good  ===  ===  USA is bad   
2702703      ____ fuck off you little asshole. If you wan...
4632658         i have a dick, its bigger than yours! hahaha
6545332      == renault ==  you sad little bpy for drivin...
6545351      == renault ==  you sad little bo for driving...
Name: comment, dtype: object


In [9]:
# create y which is the outcome label the model has to learn
y = comments['attack']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(comments['comment'], y, test_size=0.33, random_state=53)

In [11]:
# ngram_range(1, 2) means unigrams and bigrams
vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 2))

In [12]:
#  Tf-idf Transformer
count_train = vectorizer.fit_transform(X_train)
tf_transformer = TfidfTransformer(norm = 'l2')
count_train = tf_transformer.fit_transform(count_train)

count_test = vectorizer.transform(X_test.values)

In [13]:
lr_classifier = LogisticRegression()
lr_classifier.fit(count_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
pred = lr_classifier.predict(count_test)

In [15]:
# testing accuracy
print(metrics.accuracy_score(y_test, pred))


0.786457788472


In [16]:
print(metrics.confusion_matrix(y_test, pred, labels=[False, True]))
print(metrics.classification_report(y_test, pred))

[[25914  7757]
 [  408  4157]]
             precision    recall  f1-score   support

      False       0.98      0.77      0.86     33671
       True       0.35      0.91      0.50      4565

avg / total       0.91      0.79      0.82     38236

