# NATURAL LANGUAGE PROCESSING 
## Topic: Toxic Comment Classification
### In this notebook, we use different classifier models to classify the toxic comments.

1. Logistic Regression

2. Naive Bayes

3. Support Vector Machine (SVM)

### Then we use Cross Validation Score to try to optimize the model performance.


## Introduction

##### This is an analysis of Wikipedia comments to create models that identify various types of toxic comments. There is a lot of racist content and swear words in the dataset and some of it will pop up in the analysis. 

In [2]:
import pandas as pd
import pickle
import numpy as np
import nltk
from nltk.corpus import stopwords
import keras
import time
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from collections import namedtuple
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.pipeline import make_union
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [35]:
# Shared reproducibility settings: one seed and one stratified k-fold splitter
# that the cross-validation cells below reuse.
seed = 42
k = 5
cv = StratifiedKFold(
    n_splits=k,
    shuffle=True,
    random_state=seed,
)

In [36]:
start = time.time()


def print_time(start):
    """Print the wall-clock time elapsed since ``start`` as ``minutes:seconds``.

    Parameters
    ----------
    start : float
        A timestamp previously obtained from ``time.time()``.

    Returns
    -------
    None
        The message is written to standard output.
    """
    elapsed = time.time() - start
    minutes = int(elapsed / 60)
    seconds = int(elapsed % 60)
    # %02d zero-pads the seconds, so no separate branch is needed below 10.
    print('Elapsed time was %d:%02d.' % (minutes, seconds))

## Feature Engineering 

In [37]:
def feature_engineering(df, sparse=0):
    """Add hand-crafted text statistics to ``df`` and strip IP addresses.

    The statistics are computed on the original comment text (before any
    IP address is removed) and each one is min-max scaled using the minimum
    and maximum of *this* frame.

    NOTE(review): because scaling uses per-frame statistics, two frames
    processed separately (e.g. train and test) are not on an identical
    scale -- confirm this is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``comment_text``. It is modified
        in place.
    sparse : int, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the added columns ``length``, ``caps``,
        ``word_length``, ``exclamation`` and ``question``, and with every
        IPv4-looking substring of ``comment_text`` replaced by one space.
    """
    comments = df['comment_text']

    # Comment length in characters
    df['length'] = comments.apply(len)

    # Upper-case letters relative to all letters; the +1 keeps the
    # denominator non-zero for comments that contain no letters at all.
    def pct_caps(s):
        return sum(1 for c in s if c.isupper()) / (sum(1 for c in s if c.isalpha()) + 1)
    df['caps'] = comments.apply(pct_caps)

    # Mean length of the purely alphabetic, space-separated tokens
    def word_length(s):
        lengths = [len(w) for w in s.split(' ') if w.isalpha()]
        # np.mean([]) returns nan but also emits a RuntimeWarning; return
        # nan directly when a comment has no alphabetic token.
        return np.mean(lengths) if lengths else np.nan
    df['word_length'] = comments.apply(word_length)

    # Number of exclamation points
    df['exclamation'] = comments.apply(lambda s: s.count('!'))

    # Number of question marks
    df['question'] = comments.apply(lambda s: s.count('?'))

    # Min-max normalize each engineered feature
    for label in ['length', 'caps', 'word_length', 'question', 'exclamation']:
        minimum = df[label].min()
        diff = df[label].max() - minimum
        if diff == 0:
            # Constant column: dividing by zero would yield nan, so map
            # every (non-missing) value to 0.0 instead.
            df[label] = (df[label] - minimum).astype(float)
        else:
            df[label] = (df[label] - minimum) / diff

    # Strip IP addresses (raw strings avoid invalid-escape warnings for '\.')
    ip = re.compile(r'(([2][5][0-5]\.)|([2][0-4][0-9]\.)|([0-1]?[0-9]?[0-9]\.)){3}'
                    + r'(([2][5][0-5])|([2][0-4][0-9])|([0-1]?[0-9]?[0-9]))')

    def strip_ip(s, ip):
        # Replace every match, not only the first one; non-string values
        # are passed through untouched instead of being hidden by a bare except.
        if not isinstance(s, str):
            return s
        return ip.sub(' ', s)

    df['comment_text'] = df['comment_text'].apply(lambda x: strip_ip(x, ip))

    return df

def merge_features(comment_text, data, engineered_features):
    """Append engineered feature columns to a vectorized text matrix.

    Parameters
    ----------
    comment_text : scipy sparse matrix
        Vectorized comments, one row per row of ``data``.
    data : pandas.DataFrame
        Frame that holds the engineered feature columns.
    engineered_features : list of str
        Names of the columns of ``data`` to append.

    Returns
    -------
    scipy sparse matrix
        ``comment_text`` with the selected columns stacked on the right.
    """
    # Read from the `data` argument; the previous version read the global
    # `df`, so passing any other frame silently used the wrong rows.
    values = np.asarray(data[engineered_features].values, dtype=float)
    if np.isnan(values).any():
        values = np.nan_to_num(values)
    new_features = sparse.csr_matrix(values)
    return sparse.hstack([comment_text, new_features])

## Loading Dataset

In [38]:
# Load the raw data, engineer features, and split off a holdout set.

df = pd.read_csv('train.csv')
# Everything after the first two columns is treated as a label column.
targets = list(df.columns[2:])
df_targets = df[targets].copy()

df_sub = pd.read_csv('test.csv', dtype={'id': object}, na_filter=False)

# Keep the submission ids aside before the id column is dropped below.
submission = pd.DataFrame()
submission['id'] = df_sub.id.copy()

# Feature Engineering
df = feature_engineering(df)
df_sub = feature_engineering(df_sub)

print('Training labels:')
print(list(df_targets.columns))
print(df_targets.shape)

print('\nTraining data')
# Remove the label columns and the id so only text + engineered features remain.
df = df.drop(columns=list(df_targets.columns) + ['id'])
print(list(df.columns))
print(df.shape)


print('\nSubmission data')
df_sub = df_sub.drop(columns=['id'])
print(list(df_sub.columns))
print(df_sub.shape)

# Extra binary label: 1 when a comment carries at least one of the original labels.
toxic_rows = df_targets.sum(axis=1) > 0
targets.append('any_label')
df_targets['any_label'] = toxic_rows.astype(int)

# Every column after the first (comment_text) is an engineered feature.
new_features = list(df.columns[1:])
print(new_features)

# train_test_split is already imported in the imports cell at the top.
df, holdout, df_targets, holdout_targets = train_test_split(
    df, df_targets, test_size=0.2, random_state=seed
)

  return _methods._mean(a, axis=axis, dtype=dtype,


Training labels:
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
(159571, 6)

Training data
['comment_text', 'length', 'caps', 'word_length', 'exclamation', 'question']
(159571, 6)

Submission data
['comment_text', 'length', 'caps', 'word_length', 'exclamation', 'question']
(153164, 6)
['length', 'caps', 'word_length', 'exclamation', 'question']


In [39]:
new_features

['length', 'caps', 'word_length', 'exclamation', 'question']

## Multilabel Function 

In [40]:
from sklearn.base import clone

def multi_cv(model, data, labels, k=5, nb_features=False, shuffle=True):
    """Cross-validate ``model`` on each column of ``labels`` and print F1 scores.

    Parameters
    ----------
    model : estimator
        Estimator to evaluate; a fresh ``clone`` is fitted per label.
    data : scipy sparse matrix
        Feature matrix shared by all labels.
    labels : pandas.DataFrame
        One binary column per label. The printed average skips the last
        column.
    k : int, optional
        Number of stratified folds.
    nb_features : bool, optional
        When true, features are scaled by a per-label log-count ratio
        before cross-validation.
    shuffle : bool, optional
        Whether the folds are shuffled (seeded with the global ``seed``).

    Returns
    -------
    list of float, or (list of float, list)
        Mean F1 per label; with ``nb_features`` the log-count ratios used
        for each label are returned as a second element.
    """
    # Honor the `shuffle` argument. random_state is only passed when
    # shuffling, since scikit-learn rejects it together with shuffle=False.
    cv = StratifiedKFold(n_splits=k, shuffle=shuffle,
                         random_state=seed if shuffle else None)

    def log_count_ratio(x, y):
        x = sparse.csr_matrix(x)

        # Smoothed, normalized feature totals over the positive rows ...
        p = abs(x[np.where(y==1)].sum(axis=0))
        p = p + 1
        p = p / np.sum(p)

        # ... and over the negative rows.
        q = abs(x[np.where(y==0)].sum(axis=0))
        q = q + 1
        q = q / np.sum(q)

        return np.log(p/q)

    scores = []
    r_values = []
    for label in labels.columns:
        features = data
        if nb_features:
            # NOTE(review): the ratio is computed on all rows, including the
            # ones later used as validation folds -- confirm this is intended.
            r = log_count_ratio(data, labels[label])
            r_values.append(r)
            # Scale into a per-label matrix. Rebinding `data` here made
            # every later label inherit the ratios of all earlier labels.
            features = data.multiply(r)
            if np.isnan(features.data).any():
                features.data = np.nan_to_num(features.data)
        score = np.mean(cross_val_score(clone(model), features, labels[label], scoring='f1', cv=cv))
        print(label + ' f1 score: %.4f' % score)
        scores.append(score)
    print('Average (excluding any) f1 score: %.4f' % np.mean(scores[:-1]))
    if nb_features:
        return scores, r_values
    else:
        return scores

# Vectorizing text

Text must be vectorized before it can be fed into machine learning models: the raw strings are converted into numerical features that a model can work with. Each feature of the vectorized data is typically a word count or some other measure of how often a letter sequence or word occurs in a string, and the resulting matrix is usually sparse. The conversion is done by a vectorizer object, which holds a vocabulary of letters or words together with their integer indices and, where appropriate, associated statistics.

The method we use here is term frequency–inverse document frequency (TF-IDF). This statistic weighs how often a term appears in a single document (here, a single comment) against the inverse of how many documents in the dataset contain it, as a measure of how informative the term is. A term that appears often in one comment but in only a few comments overall is likely valuable to the model, whereas a term that appears in practically every comment is nearly worthless.

In [41]:
start = time.time()
# Fit the TF-IDF vocabulary on the training comments only, then reuse the
# fitted vectorizer for the holdout and submission comments.
comment_vector = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    max_features=10000,
)
training_comments = comment_vector.fit_transform(df['comment_text'])
holdout_comments = comment_vector.transform(holdout['comment_text'])
submission_comments = comment_vector.transform(df_sub['comment_text'])
print_time(start)

print(training_comments.shape)

Elapsed time was 0:19.
(127656, 10000)


# Apply Machine Learning Algorithm

### Logistic Regression

In [42]:
start = time.time()

# Baseline: cross-validated F1 of logistic regression on the TF-IDF matrix, one label at a time.
for target in targets:
    lr = LogisticRegression(max_iter=500, random_state=seed)
    fold_scores = cross_val_score(lr, training_comments, df_targets[target], scoring='f1', cv=cv)
    print(target + ' score: %.4f' % np.mean(fold_scores))
print_time(start)

toxic score: 0.7203
severe_toxic score: 0.3203
obscene score: 0.7464
threat score: 0.1982
insult score: 0.6261
identity_hate score: 0.2785
any_label score: 0.7295
Elapsed time was 0:51.


##### With engineered features added in. 

In [43]:
start = time.time()

# The merged matrix does not depend on the target, so build it once instead
# of re-stacking the sparse matrices on every loop iteration.
lr_features = merge_features(training_comments, df, new_features)
for target in targets:
    lr = LogisticRegression(random_state=seed, max_iter=500)
    fold_scores = cross_val_score(lr, lr_features, df_targets[target], scoring='f1', cv=cv)
    print(target + ' score: %.4f' % np.mean(fold_scores))
print_time(start)

toxic score: 0.7249
severe_toxic score: 0.3485
obscene score: 0.7445
threat score: 0.2047
insult score: 0.6254
identity_hate score: 0.2846
any_label score: 0.7322
Elapsed time was 1:17.


### Naive Bayes

In [44]:
start = time.time()

# Multinomial Naive Bayes on the TF-IDF matrix alone; the returned scores are not needed here.
model = MultinomialNB(alpha=1.0)
_ = multi_cv(model, data=training_comments, labels=df_targets)
print_time(start)

toxic f1 score: 0.6588
severe_toxic f1 score: 0.0992
obscene f1 score: 0.6637
threat f1 score: 0.0000
insult f1 score: 0.5625
identity_hate f1 score: 0.0451
any_label f1 score: 0.6681
Average (excluding any) f1 score: 0.3382
Elapsed time was 0:03.


##### With engineered features. 

In [45]:
start = time.time()

# Multinomial Naive Bayes on TF-IDF plus the engineered feature columns.
model = MultinomialNB(alpha=1.0)
nb_features_matrix = merge_features(training_comments, df, new_features)
_ = multi_cv(model, data=nb_features_matrix, labels=df_targets)
print_time(start)

toxic f1 score: 0.6676
severe_toxic f1 score: 0.0966
obscene f1 score: 0.6709
threat f1 score: 0.0000
insult f1 score: 0.5717
identity_hate f1 score: 0.0387
any_label f1 score: 0.6748
Average (excluding any) f1 score: 0.3409
Elapsed time was 0:08.


### Support Vector Machine 

In [46]:
start = time.time()

# Linear SVM on the TF-IDF matrix alone; the returned scores are not needed here.
model = LinearSVC(random_state=seed)
_ = multi_cv(model, data=training_comments, labels=df_targets)
print_time(start)

toxic f1 score: 0.7543
severe_toxic f1 score: 0.3386
obscene f1 score: 0.7845
threat f1 score: 0.3520
insult f1 score: 0.6636
identity_hate f1 score: 0.3635
any_label f1 score: 0.7693
Average (excluding any) f1 score: 0.5427
Elapsed time was 0:53.


##### With engineered features. 

In [47]:
start = time.time()

# Linear SVM on TF-IDF plus the engineered feature columns.
model = LinearSVC(random_state=seed)
svm_features_matrix = merge_features(training_comments, df, new_features)
_ = multi_cv(model, data=svm_features_matrix, labels=df_targets)
print_time(start)

toxic f1 score: 0.7607
severe_toxic f1 score: 0.3600
obscene f1 score: 0.7848
threat f1 score: 0.3538
insult f1 score: 0.6639
identity_hate f1 score: 0.3675
any_label f1 score: 0.7735
Average (excluding any) f1 score: 0.5485
Elapsed time was 0:58.
