# Understanding User Comments via Sentiment Analysis

---

*Scikit-Learn*

Nathaniel Haddad - 2019

In [None]:
import os
import urllib

import nltk
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier

## Functions

In [None]:
def download_file(url: str, fname: str) -> None:
    """
    function: download_file
    param(s): url (str): url of the file to fetch; fname (str): local path to write to
    returns: nothing
    does: downloads the resource at url and stores it as fname
    raises: whatever urllib.request.urlretrieve raises for an unreachable
            url or an unwritable fname
    """
    # urlretrieve lives in the urllib.request submodule, which a bare
    # `import urllib` does not load, so import the submodule explicitly
    import urllib.request

    urllib.request.urlretrieve(url, fname)

In [None]:
def parse_text(text: str, stopwords=None) -> str:
    """
    function: parse_text
    param(s): text, a string (any other value is converted with str());
              stopwords, optional collection of lower-case words to drop
              (defaults to the module-level STOPWORDS)
    returns: a string of the kept words, lower-cased and joined by single spaces
    does: splits text on whitespace, lower-cases each token, and keeps only
          purely alphabetic tokens that are not stopwords
    """
    if stopwords is None:
        # looked up at call time, so STOPWORDS may be defined after this function
        stopwords = STOPWORDS
    lowered = (token.lower() for token in str(text).split())
    # tokens carrying digits or punctuation (e.g. "hello,") fail isalpha()
    # and are dropped entirely rather than stripped
    return " ".join(
        token for token in lowered
        if token.isalpha() and token not in stopwords
    )

In [None]:
def build_confusion_matrix(model, y_pred) -> None:
    """
    function: build_confusion_matrix
    params: model, not used by this function (kept so existing callers keep working);
            y_pred, predicted labels for the rows of the global test_comments
    returns: nothing
    does: builds and prints a confusion matrix of the true 'attack' labels
          in test_comments against y_pred
    """
    # confusion_matrix expects (y_true, y_pred): rows are the true classes and
    # columns the predicted ones. Passing them the other way round transposes
    # the matrix, swapping false positives with false negatives.
    cm = confusion_matrix(test_comments['attack'], y_pred)
    print(cm)

In [None]:
def precision_recall_fscore(clf, y_pred) -> None:
    """
    function: precision_recall_fscore
    params: clf, not used by this function (kept so existing callers keep working);
            y_pred, predicted labels for the rows of the global test_comments
    returns: nothing
    does: prints weighted-average precision, recall, and f-score of y_pred
          against the true 'attack' labels in test_comments
    """
    # the helper returns (precision, recall, fscore, support); support is not printed
    precision, recall, fscore, _support = precision_recall_fscore_support(
        y_true=test_comments['attack'], y_pred=y_pred, average='weighted')
    print('Test Precision: {precision:.5f}'.format(precision=precision))
    print('Test Recall: {recall:.5f}'.format(recall=recall))
    # label previously carried a doubled colon ("F-Score: : ")
    print('Test F-Score: {fscore:.5f}'.format(fscore=fscore))

In [None]:
def get_metrics(clf) -> None:
    """
    function: get_metrics
    params: clf, a fitted estimator exposing predict and predict_proba on raw comment text
    returns: nothing
    does: prints out confusion matrix, precision, recall, f-score, and ROC AUC
          computed on the global test_comments
    """
    test_text = test_comments['comment']
    y_pred = clf.predict(test_text)
    build_confusion_matrix(clf, y_pred)
    precision_recall_fscore(clf, y_pred)
    # column 1 of predict_proba is the probability of the second class in
    # clf.classes_ (True, i.e. attack, for the boolean labels used here)
    attack_scores = clf.predict_proba(test_text)[:, 1]
    auc = roc_auc_score(test_comments['attack'], attack_scores)
    print('Test ROC AUC: {aucscore:.5f}'.format(aucscore=auc))

## Dataset


In [None]:
# figshare download links for the annotated comments and the raw annotations
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634'
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637'

# fetch each TSV into the working directory, comments first
for source_url, local_name in (
    (ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv'),
    (ANNOTATIONS_URL, 'attack_annotations.tsv'),
):
    download_file(source_url, local_name)

In [None]:
# load both tab-separated files; the comments table uses its first column as the index
comments = pd.read_csv(
    'attack_annotated_comments.tsv',
    index_col=0,
    sep='\t',
)
annotations = pd.read_csv(
    'attack_annotations.tsv',
    sep='\t',
)

In [None]:
# count of distinct rev_id values in the annotations table
len(annotations['rev_id'].unique())

In [None]:
# label a comment as an attack if the majority of annotators did so:
# the mean of the 'attack' values for a rev_id must be strictly above 0.5,
# so a mean of exactly 0.5 is labelled False
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

In [None]:
# join labels and comments
# the assignment aligns on index: labels is indexed by rev_id, and comments is
# indexed by its first column (presumably also rev_id — confirm against the TSV)
comments['attack'] = labels

In [None]:
# replace the newline and tab placeholder tokens with a plain space.
# The vectorised .str accessor leaves missing comments as NaN instead of
# raising AttributeError the way calling .replace on a NaN float would;
# regex=False keeps the tokens as literal text.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

## Text Preprocessing

In [None]:
# display five randomly chosen rows to eyeball the text and labels
comments.sample(5)

In [None]:
# fetch the NLTK stopword corpus, then keep the English list as a set
# so membership checks are cheap
nltk.download("stopwords")
english_stopwords = nltk.corpus.stopwords.words("english")
STOPWORDS = set(english_stopwords)

In [None]:
# After testing text cleaning results above, no need to use this
# comments["comment"] = comments["comment"].apply(parse_text)

In [None]:
# comments.query('attack')['comment'].head(10) # uncomment this to read some horrible stuff :(

## Models
---

### Logistic Regression (baseline)

In [None]:
# carve the train and test partitions out of the 'split' column
train_comments, test_comments = (
    comments.query(split_filter)
    for split_filter in ("split=='train'", "split=='test'")
)

In [None]:
# baseline: word uni/bigram counts (capped at 10,000 features) -> TF-IDF -> logistic regression
baseline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', LogisticRegression(solver='lbfgs')),
]
clf = Pipeline(steps=baseline_steps).fit(
    train_comments['comment'], train_comments['attack'])

# report metrics on the held-out test split
get_metrics(clf)

In [None]:
# sanity check: a polite comment should be predicted as a non-attack (False)
clf.predict(['Thanks for you contribution, you did a great job!'])

In [None]:
# sanity check: an insulting comment should be predicted as an attack (True)
clf.predict(['People as stupid as you should not edit Wikipedia!'])

In [None]:
# save the model; create the target directory first because joblib.dump
# does not create missing parent directories
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/lr_base_model.pkl')
print('Model Saved')

### Logistic Regression (\#2)

In [None]:
# rebuild the splits, folding the dev (validation) rows into the training set;
# the dev rows come first in the concatenated frame
test_comments = comments.query("split=='test'")
val_comments = comments.query("split=='dev'")
train_comments = pd.concat([val_comments, comments.query("split=='train'")])

In [None]:
# TfidfVectorizer takes the place of the CountVectorizer + TfidfTransformer pair
tfidf_lr = Pipeline(steps=[
    ('vect', TfidfVectorizer(norm='l2', max_features=None, min_df=1, max_df=1.0)),
    ('clf', LogisticRegression(solver='lbfgs')),
])

# search over the analyzer type and the n-gram range
parameters = {
    'vect__analyzer': ('word', 'char', 'char_wb'),
    'vect__ngram_range': [(1, 1), (1, 2)],
}

# 3-fold cross-validated grid search using all available cores
clf = GridSearchCV(estimator=tfidf_lr, param_grid=parameters, n_jobs=-1, cv=3).fit(
    train_comments['comment'], train_comments['attack'])
get_metrics(clf)

In [None]:
# sanity check: a polite comment should be predicted as a non-attack (False)
clf.predict(['Thanks for you contribution, you did a great job!'])

In [None]:
# sanity check: an insulting comment should be predicted as an attack (True)
clf.predict(['People as stupid as you should not edit Wikipedia!'])

In [None]:
# save the model; create the target directory first because joblib.dump
# does not create missing parent directories
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/lr_2_model.pkl')
print('Model Saved')

### Logistic Regression (\#3)

In [None]:
# settings shared by the word-level and the character-level TF-IDF vectorizers
shared_tfidf_kwargs = dict(
    lowercase=True,
    stop_words=None,
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
    norm='l2',
)
vectorizerW = TfidfVectorizer(analyzer='word', **shared_tfidf_kwargs)
vectorizerC = TfidfVectorizer(analyzer='char', **shared_tfidf_kwargs)

# feature union: word features followed by character features
combined_features = FeatureUnion([('word', vectorizerW), ('char', vectorizerC)])

In [None]:
# logistic regression on top of the combined word + character TF-IDF features
union_lr = Pipeline(steps=[
    ('features', combined_features),
    ('clf', LogisticRegression(n_jobs=-1)),
])

# compare the two solvers
parameters = {'clf__solver': ('newton-cg', 'lbfgs')}

# 3-fold cross-validated grid search using all available cores
clf = GridSearchCV(estimator=union_lr, param_grid=parameters, n_jobs=-1, cv=3).fit(
    train_comments['comment'], train_comments['attack'])
get_metrics(clf)

In [None]:
# sanity check: a polite comment should be predicted as a non-attack (False)
clf.predict(['Thanks for you contribution, you did a great job!'])

In [None]:
# sanity check: an insulting comment should be predicted as an attack (True)
clf.predict(['People as stupid as you should not edit Wikipedia!'])

In [None]:
# save the model; create the target directory first because joblib.dump
# does not create missing parent directories
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/lr_3_model.pkl')
print('Model Saved')

### Logistic Regression (\#4)

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html

In [None]:
# LogisticRegressionCV runs its own 3-fold cross-validation inside the pipeline
inner_cv_lr = LogisticRegressionCV(
    solver='lbfgs', max_iter=100, cv=3, random_state=12345)
union_lr_cv = Pipeline(steps=[
    ('features', combined_features),
    ('clf', inner_cv_lr),
])

# outer grid: with/without intercept, with/without refit
parameters = {
    'clf__fit_intercept': (True, False),
    'clf__refit': (True, False),
}

# 3-fold cross-validated grid search using all available cores
clf = GridSearchCV(estimator=union_lr_cv, param_grid=parameters, n_jobs=-1, cv=3).fit(
    train_comments['comment'], train_comments['attack'])
get_metrics(clf)

In [None]:
# sanity check: a polite comment should be predicted as a non-attack (False)
clf.predict(['Thanks for you contribution, you did a great job!'])

In [None]:
# sanity check: an insulting comment should be predicted as an attack (True)
clf.predict(['People as stupid as you should not edit Wikipedia!'])

In [None]:
# save the model; create the target directory first because joblib.dump
# does not create missing parent directories
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/lr_4_model.pkl')
print('Model Saved')

### Multi-Layer Perceptron

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [None]:
# multi-layer perceptron on the combined word + character TF-IDF features
clf = Pipeline([
    ('features', combined_features),
    # hidden_layer_sizes takes one entry per hidden layer; `(150)` is just the
    # int 150, so spell the single 150-unit layer as a real one-element tuple
    ('clf', MLPClassifier(hidden_layer_sizes=(150,), max_iter=50,
                          activation='relu', random_state=12345,
                          validation_fraction=0.2, verbose=True, early_stopping=True)),
])

clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)

In [None]:
# sanity check: a polite comment should be predicted as a non-attack (False)
clf.predict(['Thanks for you contribution, you did a great job!'])

In [None]:
# sanity check: an insulting comment should be predicted as an attack (True)
clf.predict(['People as stupid as you should not edit Wikipedia!'])

In [None]:
# save the model; create the target directory first because joblib.dump
# does not create missing parent directories
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/mlp_model.pkl')
print('Model Saved')

### Bernoulli Naive Bayes

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB

In [None]:
# Bernoulli naive Bayes on the combined word + character TF-IDF features
union_nb = Pipeline(steps=[
    ('features', combined_features),
    ('clf', BernoulliNB()),
])

# candidate values for the smoothing parameter alpha
parameters = {'clf__alpha': (0.2, 0.4, 0.6, 0.8, 1)}

# 3-fold cross-validated grid search (n_jobs left at its default)
clf = GridSearchCV(estimator=union_nb, param_grid=parameters, cv=3).fit(
    train_comments['comment'], train_comments['attack'])
get_metrics(clf)

In [None]:
# sanity check: a polite comment should be predicted as a non-attack (False)
clf.predict(['Thanks for you contribution, you did a great job!'])

In [None]:
# sanity check: an insulting comment should be predicted as an attack (True)
clf.predict(['People as stupid as you should not edit Wikipedia!'])

In [None]:
# save the model; create the target directory first because joblib.dump
# does not create missing parent directories
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/nb_model.pkl')
print('Model Saved')

### Random Forest Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# random forest (50 trees) on the combined word + character TF-IDF features
clf = Pipeline([
    ('features', combined_features),
    # fixed seed so the reported metrics are reproducible, matching the
    # random_state used for the other stochastic models in this notebook
    ('clf', RandomForestClassifier(n_estimators=50, random_state=12345)),
])

clf = clf.fit(train_comments['comment'], train_comments['attack'])
get_metrics(clf)

In [None]:
# sanity check: a polite comment should be predicted as a non-attack (False)
clf.predict(['Thanks for you contribution, you did a great job!'])

In [None]:
# sanity check: an insulting comment should be predicted as an attack (True)
clf.predict(['People as stupid as you should not edit Wikipedia!'])

In [None]:
# save the model; create the target directory first because joblib.dump
# does not create missing parent directories
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/forest_model.pkl')
print('Model Saved')