# Machine Learning Sentiment Analysis

---

## Classifying Wikipedia Comments


Nathaniel Haddad - 2019

In [1]:
import pandas as pd
import urllib
import nltk
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier



## Dataset


In [2]:
# download annotated comments and annotations
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' 

def download_file(url, fname):
    urllib.request.urlretrieve(url, fname)

download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

Normal File Retrieval

In [3]:
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

In [4]:
len(annotations['rev_id'].unique())

115864

In [5]:
# labels a comment as an atack if the majority of annoatators did so
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

In [6]:
# join labels and comments
comments['attack'] = labels

In [7]:
# remove newline and tab tokens
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))

## Text Preprocessing

In [8]:
comments.sample(5)

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
108333063,:My response: Wikipedia:Suspected sock puppet...,2007,True,user,blocked,train,False
48533215,== Rusty Harding == The last version before...,2006,True,user,random,train,False
37004221,I removed NOTHING from the Poincare page or...,2006,False,article,random,train,False
395701986,` ::Fetchcomms has stated that you should gain...,2010,True,user,random,train,False
80463997,== List moved from article due to disputed n...,2006,True,article,random,train,False


In [9]:
nltk.download("stopwords")
STOPWORDS = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nathanielhaddad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
"""
function: parse_text
param(s): text, a string
returns: a string
"""
def parse_text(text):
  new_text = []
  # split text into list of items
  words_and_symbols = str(text).split()
  # iterate through each item and create a new string of alphabet characters
  for item in words_and_symbols:
    # make item lower case
    item = item.lower()
    # remove non-alpha characters and stopwords
    if item.isalpha() and item not in STOPWORDS:
        new_text.append(word)
  return " ".join(new_text)

In [11]:
# After testing text cleaning results above, no need to use this
# comments["comment"] = comments["comment"].apply(parse_text)

In [12]:
comments.query('attack')['comment'].head(10)

rev_id
801279             Iraq is not good  ===  ===  USA is bad   
2702703      ____ fuck off you little asshole. If you wan...
4632658         i have a dick, its bigger than yours! hahaha
6545332      == renault ==  you sad little bpy for drivin...
6545351      == renault ==  you sad little bo for driving...
7977970    34, 30 Nov 2004 (UTC)  ::Because you like to a...
8359431    `  ::You are not worth the effort. You are arg...
8724028    Yes, complain to your rabbi and then go shoot ...
8845700                     i am using the sandbox, ass wipe
8845736      == GOD DAMN ==  GOD DAMN it fuckers, i am us...
Name: comment, dtype: object

## Metric Functions

In [13]:
"""
function: build_confusion_matrix
params: model, a function
returns: nothing
does: builds and prints a confusion matrix
"""
def build_confusion_matrix(model, y_pred):
  cm = confusion_matrix(y_pred, test_comments['attack'])
  print(cm)

In [14]:
"""
function: precision_recall_fscore
params: clf, a function
returns: nothing
does: calculates precision, recall, and f-score of given function
"""
def precision_recall_fscore(clf, y_pred):
  metrics = precision_recall_fscore_support(
      y_true=test_comments['attack'], y_pred=y_pred,
      average='weighted')
  print('Test Precision: %.5f' %metrics[0])
  print('Test Recall: %.5f' %metrics[1])
  print('Test F-Score: : %.5f' %metrics[2])

In [15]:
"""
function: get_metrics
params: clf, a function
returns: nothing
does: prints out confusion matrix, precision, recall, f-score, and ROC AUC
"""
def get_metrics(clf):
  y_pred = clf.predict(test_comments['comment'])
  build_confusion_matrix(clf, y_pred)
  precision_recall_fscore(clf, y_pred)
  auc = roc_auc_score(test_comments['attack'], clf.predict_proba(test_comments['comment'])[:,1])
  print('Test ROC AUC: %.5f' %auc)

## Models

---

### Logistic Regression (strawman)

In [16]:
train_comments = comments.query("split=='train'")
test_comments = comments.query("split=='test'")

In [17]:
clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LogisticRegression(solver='lbfgs')),
])

clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)

[[20280  1236]
 [  142  1520]]
Test Precision: 0.93923
Test Recall: 0.94055
Test F-Score: : 0.93396
Test ROC AUC: 0.95699


In [18]:
# correctly classify nice comment
clf.predict(['Thanks for you contribution, you did a great job!'])

array([False])

In [19]:
# correctly classify nasty comment
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([ True])

In [20]:
# save the model
joblib.dump(clf, 'models/lr_base_model.pkl')
print('Model Saved')

Model Saved


### Logistic Regression (\#2)

In [21]:
# create a new training set made up of validation set and previous training set
train_comments = comments.query("split=='train'")
val_comments = comments.query("split=='dev'")
test_comments = comments.query("split=='test'")
train_comments = pd.concat([val_comments, train_comments])

In [22]:
clf = Pipeline([
    # replace CountVectorizer with TfidfVectorizer
    ('vect', TfidfVectorizer(max_df=1.0, min_df=1, max_features=None, norm = 'l2')),
    ('clf', LogisticRegression(solver='lbfgs')),
])

parameters = {'vect__analyzer': ('word', 'char', 'char_wb'),
              'vect__ngram_range': [(1,1), (1,2)]}

clf = GridSearchCV(clf, parameters, cv=3, n_jobs=-1)
clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)



[[20270  1194]
 [  152  1562]]
Test Precision: 0.94044
Test Recall: 0.94193
Test F-Score: : 0.93588
Test ROC AUC: 0.96020


In [23]:
# correctly classify nice comment
clf.predict(['Thanks for you contribution, you did a great job!'])

array([False])

In [24]:
# correctly classify nasty comment
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([ True])

In [25]:
# save the model
joblib.dump(clf, 'models/lr_2_model.pkl')
print('Model Saved')

Model Saved


### Logisitic Regression (\#3)

In [26]:
# create feature union of character and word TFIDF vectorizers
vectorizerW = TfidfVectorizer(lowercase=True, analyzer='word', stop_words=None, ngram_range = (1,1), max_df=1.0, min_df=1, max_features=None, norm = 'l2')
vectorizerC = TfidfVectorizer(lowercase=True, analyzer='char', stop_words=None, ngram_range = (1,1), max_df=1.0, min_df=1, max_features=None, norm = 'l2')
combined_features = FeatureUnion([('word', vectorizerW), ('char', vectorizerC)])

In [27]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', LogisticRegression(n_jobs=-1)),
])

parameters = {'clf__solver': ('newton-cg', 'lbfgs')}

clf = GridSearchCV(clf, parameters, cv=3, n_jobs=-1)
clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)

[[20262  1153]
 [  160  1603]]
Test Precision: 0.94177
Test Recall: 0.94335
Test F-Score: : 0.93780
Test ROC AUC: 0.96259


In [28]:
# correctly classify nice comment
clf.predict(['Thanks for you contribution, you did a great job!'])

array([False])

In [29]:
# correctly classify nasty comment
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([ True])

In [30]:
# save the model
joblib.dump(clf, 'models/lr_3_model.pkl')
print('Model Saved')

Model Saved


### Logistic Regression (\#4)

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html

In [31]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', LogisticRegressionCV(cv=3,max_iter=100, solver='lbfgs', random_state=12345)),
])

parameters = {'clf__fit_intercept': (True, False),
              'clf__refit': (True, False)}

clf = GridSearchCV(clf, parameters, cv=3, n_jobs=-1)
clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)



[[20248  1095]
 [  174  1661]]
Test Precision: 0.94352
Test Recall: 0.94525
Test F-Score: : 0.94036
Test ROC AUC: 0.96411


In [32]:
# correctly classify nice comment
clf.predict(['Thanks for you contribution, you did a great job!'])

array([False])

In [33]:
# correctly classify nasty comment
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([ True])

In [34]:
# save the model
joblib.dump(clf, 'models/lr_4_model.pkl')
print('Model Saved')

Model Saved


### Multi-Layer Perceptron

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [35]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', MLPClassifier(hidden_layer_sizes=(150), max_iter=50, 
                          activation='relu', random_state=12345, 
                          validation_fraction=0.2, verbose=True, early_stopping=True)),
])

clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)

Iteration 1, loss = 0.23772074
Validation score: 0.937588
Iteration 2, loss = 0.12249908
Validation score: 0.942658
Iteration 3, loss = 0.08261204
Validation score: 0.942173
Iteration 4, loss = 0.06003344
Validation score: 0.941472
Iteration 5, loss = 0.04581809
Validation score: 0.938990
Iteration 6, loss = 0.03649753
Validation score: 0.937318
Iteration 7, loss = 0.03010518
Validation score: 0.935861
Iteration 8, loss = 0.02558240
Validation score: 0.935538
Iteration 9, loss = 0.02213808
Validation score: 0.932247
Iteration 10, loss = 0.01996782
Validation score: 0.932571




[[20180  1090]
 [  242  1666]]
Test Precision: 0.93977
Test Recall: 0.94253
Test F-Score: : 0.93789
Test ROC AUC: 0.95430


In [36]:
# correctly classify nice comment
clf.predict(['Thanks for you contribution, you did a great job!'])

array([False])

In [37]:
# correctly classify nasty comment
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([ True])

In [38]:
# save the model
joblib.dump(clf, 'models/mlp_model.pkl')
print('Model Saved')

Model Saved


### Bernoulli Naive Bayes

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB

In [41]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', BernoulliNB()),
])

parameters = {'clf__alpha': (0.2,0.4,0.6,0.8,1)}

clf = GridSearchCV(clf, parameters, cv=3)
clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)

[[16522   822]
 [ 3900  1934]]
Test Precision: 0.87875
Test Recall: 0.79627
Test F-Score: : 0.82447
Test ROC AUC: 0.83327


In [42]:
# correctly classify nice comment
clf.predict(['Thanks for you contribution, you did a great job!'])

array([ True])

In [43]:
# correctly classify nasty comment
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([ True])

In [44]:
# save the model
joblib.dump(clf, 'models/nb_model.pkl')
print('Model Saved')

Model Saved


### Random Forest Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [45]:
clf = Pipeline([
    ('features', combined_features),
    ('clf', RandomForestClassifier(n_estimators=50)),
])

clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)

[[20398  1968]
 [   24   788]]
Test Precision: 0.91896
Test Recall: 0.91406
Test F-Score: : 0.89260
Test ROC AUC: 0.93215


In [46]:
# correctly classify nice comment
clf.predict(['Thanks for you contribution, you did a great job!'])

array([False])

In [47]:
# correctly classify nasty comment
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([ True])

In [48]:
# save the model
joblib.dump(clf, 'models/forest_model.pkl')
print('Model Saved')

Model Saved
