In [1]:
import pylab
import sklearn
from sklearn.utils import Bunch
from sklearn import metrics
import os
import nltk
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
comment_data = pd.read_csv('/Users/gregdecanio/Desktop/CAP4770/Group7_Project/CAP4770Group7/new_data.csv')

In [None]:
#comment_data.head()

In [None]:
#%matplotlib inline
#plot = comment_data['target'].hist(bins=20)
#print comment_data['target'].describe()

# Adding Discrete Classification to Data
### I will be classifying the data into 4 categories:
Not toxic: target < 0.5  
Toxic: target >= 0.5

In [3]:
%%time
def classifier(row):
  if row['target'] >= 0.5:
    return 'toxic'
  elif row['target'] < 0.5:
    return 'not-toxic'
  else:
    return 'undefined'

comment_data['toxicity_classification'] = comment_data.apply(classifier, axis=1)
comment_data.groupby(['toxicity_classification']).size().plot(kind='bar')
print comment_data.groupby(['toxicity_classification']).size()
print "Non-toxic count: ", comment_data.groupby(['toxicity_classification']).size()['not-toxic']
print "Toxic count: ", comment_data.groupby(['toxicity_classification']).size()['toxic']

toxicity_classification
not-toxic    96851
toxic        18149
dtype: int64
Non-toxic count:  96851
Toxic count:  18149
CPU times: user 4.68 s, sys: 231 ms, total: 4.91 s
Wall time: 5.38 s


# Cleaning Comment Text
## Removing Stop Words
https://stackoverflow.com/questions/19560498/faster-way-to-remove-stop-words-in-python  
https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

In [4]:
%%time
#Removing stop words from the comments, where stop words are defined in NLTK stop words dictionary
from nltk.corpus import stopwords
stopWords = stopwords.words("english")

def removeStopWordsInComment(row):
    return ' '.join([word for word in row['comment_text'].split() if word not in stopWords])
    
comment_data['comment_no_stop_words'] = ''
comment_data['comment_no_stop_words'] = comment_data.apply(removeStopWordsInComment, axis=1)

  


CPU times: user 1min 13s, sys: 883 ms, total: 1min 14s
Wall time: 1min 16s


## Removing Punctuation
https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string  
According to the Stack Overflow, I could probably configure a more efficient way to remove punctuation. However, what I have currently works fine and is adequate.

In [6]:
%%time
#Removing puncuation from the comments, where punctuation is defined by the STRING punctuation dictionary
import string
punc = set(string.punctuation)

def removePunctuation(row):
    return ''.join([ch for ch in row['comment_no_stop_words'] if ch not in punc])

comment_data['comment_no_stop_punc'] = ''
comment_data['comment_no_stop_punc'] = comment_data.apply(removePunctuation, axis=1)

CPU times: user 10.4 s, sys: 307 ms, total: 10.7 s
Wall time: 11.2 s


In [None]:
comment_data[['comment_text', 'target', 'toxicity_classification','comment_no_stop_words', 'comment_no_stop_punc']]

# Splitting Data Into Test/Train Sets 

In [7]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(comment_data, test_size=0.33, random_state=42)
print "Training data size: ", len(train)
print "Testing data size: ", len(test)

Training data size:  77050
Testing data size:  37950


In [8]:
train = train[['comment_text', 'target', 'toxicity_classification', 'comment_no_stop_punc']]
test = test[['comment_text', 'target', 'toxicity_classification', 'comment_no_stop_punc']]
train.head()

Unnamed: 0,comment_text,target,toxicity_classification,comment_no_stop_punc
25219,Your quote from the article really describes A...,0.0,not-toxic,Your quote article really describes Arab Sprin...
97436,The Crooks always seems to con there way in an...,0.65,toxic,The Crooks always seems con way win debate kan...
4346,Many illegal aliens are white hispanics.Many w...,0.5375,toxic,Many illegal aliens white hispanicsMany white ...
16948,Total misrepresentation. Why was Bernie agains...,0.0,not-toxic,Total misrepresentation Why Bernie it What AFL...
89159,When was this announced?,0.0,not-toxic,When announced


# TF-IDF

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_unclean_vect = CountVectorizer()
tfidf_unclean_transformer = TfidfTransformer()

count_clean_vect = CountVectorizer()
tfidf_clean_transformer = TfidfTransformer()

#Performing TF and TF-IDF transformation on training data

train_unclean_counts = count_unclean_vect.fit_transform(train['comment_text'])
print "Dimensions of Unclean Training Data Document Term Matrix: ", train_unclean_counts.shape
train_unclean_tfidf = tfidf_unclean_transformer.fit_transform(train_unclean_counts)
print "Dimensions of Unclean Training Data TF_IDF Matrix: ", train_unclean_tfidf.shape

train_clean_counts = count_clean_vect.fit_transform(train['comment_no_stop_punc'])
print "Dimensions of Clean Training Data Document Term Matrix: ", train_clean_counts.shape
train_clean_tfidf = tfidf_clean_transformer.fit_transform(train_clean_counts)
print "Dimensions of Clean Training Data TF_IDF Matrix: ", train_clean_tfidf.shape

Dimensions of Unclean Training Data Document Term Matrix:  (77050, 69165)
Dimensions of Unclean Training Data TF_IDF Matrix:  (77050, 69165)
Dimensions of Clean Training Data Document Term Matrix:  (77050, 90324)
Dimensions of Clean Training Data TF_IDF Matrix:  (77050, 90324)


In [10]:
#Performing TF and TF-IDF transformation on test data

test_unclean_counts = count_unclean_vect.transform(test['comment_text'])
print "Dimensions of Unclean Test Data Document Term Matrix: ", test_unclean_counts.shape
test_unclean_tfidf = tfidf_unclean_transformer.transform(test_unclean_counts)
print "Dimensions of Unclean Test Data TF_IDF Matrix: ", test_unclean_tfidf.shape

test_clean_counts = count_clean_vect.transform(test['comment_no_stop_punc'])
print "Dimensions of Clean Test Data Document Term Matrix: ", test_clean_counts.shape
test_clean_tfidf = tfidf_clean_transformer.transform(test_clean_counts)
print "Dimensions of Clean Test Data TF_IDF Matrix: ", test_clean_tfidf.shape

Dimensions of Unclean Test Data Document Term Matrix:  (37950, 69165)
Dimensions of Unclean Test Data TF_IDF Matrix:  (37950, 69165)
Dimensions of Clean Test Data Document Term Matrix:  (37950, 90324)
Dimensions of Clean Test Data TF_IDF Matrix:  (37950, 90324)


# Naive Bayes Classification
https://scikit-learn.org/0.19/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [11]:
%%time
#Training and creating the Multinomial Naive Bayes model (unclean)
from sklearn.naive_bayes import MultinomialNB
modelNB_unclean = MultinomialNB().fit(train_unclean_tfidf, train['toxicity_classification'])

CPU times: user 251 ms, sys: 49.2 ms, total: 300 ms
Wall time: 315 ms


In [12]:
%%time
#Training and creating the Multinomial Naive Bayes model (clean)
modelNB_clean = MultinomialNB().fit(train_clean_tfidf, train['toxicity_classification'])

CPU times: user 252 ms, sys: 24.7 ms, total: 277 ms
Wall time: 280 ms


In [13]:
%%time
#Calulate accuracy of predictions
predsNB_unclean = modelNB_unclean.predict(test_unclean_tfidf)
accNB_unclean = np.mean(predsNB_unclean == test['toxicity_classification']) 
print "Accuracy of Naive Bayes Classifier (Unclean): ", accNB_unclean, "\n"

predsNB_clean = modelNB_clean.predict(test_clean_tfidf)
accNB_clean = np.mean(predsNB_clean == test['toxicity_classification']) 
print "Accuracy of Naive Bayes Classifier (Clean): ", accNB_clean, "\n"

Accuracy of Naive Bayes Classifier (Unclean):  0.845876152833 

Accuracy of Naive Bayes Classifier (Clean):  0.84534914361 

CPU times: user 45.8 ms, sys: 17.8 ms, total: 63.6 ms
Wall time: 67 ms


In [14]:
%%time
#Show confustion matrix of predictions
print(metrics.classification_report(test['toxicity_classification'], predsNB_unclean))
print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsNB_unclean)

print(metrics.classification_report(test['toxicity_classification'], predsNB_clean))
print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsNB_clean)

             precision    recall  f1-score   support

  not-toxic       0.85      1.00      0.92     31988
      toxic       0.91      0.02      0.04      5962

avg / total       0.86      0.85      0.78     37950

Confusion Matrix (Unclean):
[[31975    13]
 [ 5836   126]]
             precision    recall  f1-score   support

  not-toxic       0.85      1.00      0.92     31988
      toxic       0.95      0.02      0.03      5962

avg / total       0.86      0.85      0.78     37950

Confusion Matrix (Clean):
[[31983     5]
 [ 5864    98]]
CPU times: user 734 ms, sys: 12.9 ms, total: 747 ms
Wall time: 743 ms


# KNN Classification
Currently # of neighbors is arbitraily set to 2 (Greg, 11/20/2019)  
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier  
https://scikit-learn.org/stable/modules/neighbors.html

In [None]:
'''
%%time
#Training and creating the K Nearest Neighbors model (unclean)
from sklearn.neighbors import KNeighborsClassifier
modelKNN_unclean = KNeighborsClassifier(n_neighbors=2, algorithm='brute').fit(train_unclean_tfidf, train['toxicity_classification'])
'''

In [None]:
'''
%%time
#Training and creating the K Nearest Neighbors model (clean)
from sklearn.neighbors import KNeighborsClassifier
modelKNN_clean = KNeighborsClassifier(n_neighbors=2, algorithm='brute').fit(train_clean_tfidf, train['toxicity_classification'])
'''

In [None]:
'''
%%time
#Calulate accuracy of predictions
predsKNN_unclean = modelKNN_unclean.predict(test_unclean_tfidf)
accKNN_unclean = np.mean(predsKNN_unclean == test['toxicity_classification'])
print "Accuracy of K Nearest Neighbors Classifier (Unclean): ", accKNN_unclean

predsKNN_clean = modelKNN_unclean.predict(test_clean_tfidf)
accKNN_clean = np.mean(predsKNN_clean == test['toxicity_classification'])
print "Accuracy of K Nearest Neighbors Classifier (Unclean): ", accKNN_clean
'''

In [None]:
'''
%%time
#Show confustion matrix of predictions
print(metrics.classification_report(test['toxicity_classification'], predsKNN_unclean))
print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsKNN_unclean)

print(metrics.classification_report(test['toxicity_classification'], predsKNN_clean))
print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsKNN_clean)
'''

# SVC Classification
https://scikit-learn.org/stable/modules/svm.html#classification

In [None]:
'''
%%time
#Training and creating the SVC model (unclean)
from sklearn import svm
modelSVC_unclean = svm.SVC(kernel='linear').fit(train_unclean_tfidf, train['toxicity_classification'])
'''

In [None]:
'''
%%time
#Training and creating the SVC model (clean)
modelSVC_clean = svm.SVC(kernel='linear').fit(train_clean_tfidf, train['toxicity_classification'])
'''

In [None]:
'''
%%time
#Calulate accuracy of predictions
predsSVC_unclean = modelSVC_unclean.predict(test_unclean_tfidf)
accSVC_unclean = np.mean(predsSVC_unclean == test['toxicity_classification'])
print "Accuracy of SCV Classifier (Unclean): ", accSVC_unclean

predsSVC_clean = modelSVC_clean.predict(test_clean_tfidf)
accSVC_clean = np.mean(predsSVC_clean == test['toxicity_classification'])
print "Accuracy of SVC Classifier (Clean): ", accSVC_clean
'''

In [None]:
'''
%%time
#Show confustion matrix of predictions
print(metrics.classification_report(test['toxicity_classification'], predsSVC_unclean))
print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsSVC_unclean)

print(metrics.classification_report(test['toxicity_classification'], predsSVC_clean))
print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsSVC_clean)
'''



# SGD Classification
https://scikit-learn.org/stable/modules/sgd.html#classification

In [15]:
%%time
#Training and creating the SGD Classifier model (unclean)
from sklearn.linear_model import SGDClassifier
modelSGD_unclean = SGDClassifier(loss='hinge', penalty='l2',
                           max_iter=5).fit(train_unclean_tfidf, train['toxicity_classification'])

CPU times: user 213 ms, sys: 9.4 ms, total: 222 ms
Wall time: 241 ms


In [16]:
%%time
#Training and creating the SGD Classifier model (clean)
modelSGD_clean = SGDClassifier(loss='hinge', penalty='l2',
                           max_iter=5).fit(train_clean_tfidf, train['toxicity_classification'])

CPU times: user 216 ms, sys: 6.32 ms, total: 223 ms
Wall time: 226 ms


In [17]:
%%time
#Calulate accuracy of predictions
predsSGD_unclean = modelSGD_unclean.predict(test_unclean_tfidf)
accSGD_unclean = np.mean(predsSGD_unclean == test['toxicity_classification']) 
print "Accuracy of SGD Classifier (Unclean): ", accSGD_unclean, "\n"

predsSGD_clean = modelSGD_clean.predict(test_clean_tfidf)
accSGD_clean = np.mean(predsSGD_clean == test['toxicity_classification']) 
print "Accuracy of SGD Classifier (Clean): ", accSGD_clean, "\n"

Accuracy of SGD Classifier (Unclean):  0.874993412385 

Accuracy of SGD Classifier (Clean):  0.876574440053 

CPU times: user 34 ms, sys: 3.14 ms, total: 37.1 ms
Wall time: 39.2 ms


In [18]:
%%time
#Show confustion matrix of predictions
print(metrics.classification_report(test['toxicity_classification'], predsSGD_unclean))
print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsSGD_unclean)

print(metrics.classification_report(test['toxicity_classification'], predsSGD_clean))
print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsSGD_clean)

             precision    recall  f1-score   support

  not-toxic       0.87      1.00      0.93     31988
      toxic       0.93      0.22      0.36      5962

avg / total       0.88      0.87      0.84     37950

Confusion Matrix (Unclean):
[[31887   101]
 [ 4643  1319]]
             precision    recall  f1-score   support

  not-toxic       0.87      1.00      0.93     31988
      toxic       0.92      0.23      0.37      5962

avg / total       0.88      0.88      0.84     37950

Confusion Matrix (Clean):
[[31872   116]
 [ 4568  1394]]
CPU times: user 731 ms, sys: 9.79 ms, total: 741 ms
Wall time: 736 ms


# Random Forest Classification
https://scikit-learn.org/0.19/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

In [19]:
%%time
#Training the data and creating the Random Forest model (unclean)
from sklearn.ensemble import RandomForestClassifier
modelRF_unclean = RandomForestClassifier().fit(train_unclean_tfidf, train['toxicity_classification'])

CPU times: user 33 s, sys: 273 ms, total: 33.3 s
Wall time: 34.3 s


In [20]:
%%time
#Training the data and creating the Random Forest model (clean)
modelRF_clean = RandomForestClassifier().fit(train_clean_tfidf, train['toxicity_classification'])

CPU times: user 42.3 s, sys: 344 ms, total: 42.7 s
Wall time: 43.5 s


In [21]:
%%time
#Calulate accuracy of predictions
predsRF_unclean = modelRF_unclean.predict(test_unclean_tfidf)
accRF_unclean = np.mean(predsRF_unclean == test['toxicity_classification']) 
print "Accuracy of RF Classifier (Unclean): ", accRF_unclean, "\n"

predsRF_clean = modelRF_clean.predict(test_clean_tfidf)
accRF_clean = np.mean(predsRF_clean == test['toxicity_classification']) 
print "Accuracy of RF Classifier (Clean): ", accRF_clean, "\n"

Accuracy of RF Classifier (Unclean):  0.875467720685 

Accuracy of RF Classifier (Clean):  0.885902503294 

CPU times: user 3.87 s, sys: 29.7 ms, total: 3.9 s
Wall time: 3.92 s


In [22]:
%%time
#Show confustion matrix of predictions
print(metrics.classification_report(test['toxicity_classification'], predsRF_unclean))
print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsRF_unclean)

print(metrics.classification_report(test['toxicity_classification'], predsRF_clean))
print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsRF_clean)

             precision    recall  f1-score   support

  not-toxic       0.88      0.99      0.93     31988
      toxic       0.84      0.26      0.39      5962

avg / total       0.87      0.88      0.85     37950

Confusion Matrix (Unclean):
[[31688   300]
 [ 4426  1536]]
             precision    recall  f1-score   support

  not-toxic       0.89      0.98      0.94     31988
      toxic       0.80      0.36      0.50      5962

avg / total       0.88      0.89      0.87     37950

Confusion Matrix (Clean):
[[31446   542]
 [ 3788  2174]]
CPU times: user 913 ms, sys: 11.8 ms, total: 925 ms
Wall time: 923 ms


# Decision Tree Classification
https://scikit-learn.org/0.19/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

In [23]:
%%time
#Training the data and creating the Decision Tree model (unclean)
from sklearn.tree import DecisionTreeClassifier
modelDT_unclean = DecisionTreeClassifier().fit(train_unclean_tfidf, train['toxicity_classification'])

CPU times: user 6min 17s, sys: 2.01 s, total: 6min 19s
Wall time: 6min 32s


In [25]:
%%time
#Training the data and creating the Decision Tree model (clean)
modelDT_clean = DecisionTreeClassifier().fit(train_clean_tfidf, train['toxicity_classification'])

CPU times: user 5min 50s, sys: 619 ms, total: 5min 50s
Wall time: 5min 51s


In [26]:
%%time
#Calulate accuracy of predictions
predsDT_unclean = modelDT_unclean.predict(test_unclean_tfidf)
accDT_unclean = np.mean(predsDT_unclean == test['toxicity_classification']) 
print "Accuracy of DT Classifier (Unclean): ", accDT_unclean, "\n"

predsDT_clean = modelDT_clean.predict(test_clean_tfidf)
accDT_clean = np.mean(predsDT_clean == test['toxicity_classification']) 
print "Accuracy of DT Classifier (Clean): ", accDT_clean, "\n"

Accuracy of DT Classifier (Unclean):  0.87093544137 

Accuracy of DT Classifier (Clean):  0.872516469038 

CPU times: user 625 ms, sys: 59.4 ms, total: 684 ms
Wall time: 690 ms


In [27]:
%%time
#Show confustion matrix of predictions
print(metrics.classification_report(test['toxicity_classification'], predsDT_unclean))
print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsDT_unclean)

print(metrics.classification_report(test['toxicity_classification'], predsDT_clean))
print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsDT_clean)

             precision    recall  f1-score   support

  not-toxic       0.92      0.93      0.92     31988
      toxic       0.59      0.57      0.58      5962

avg / total       0.87      0.87      0.87     37950

Confusion Matrix (Unclean):
[[29677  2311]
 [ 2587  3375]]
             precision    recall  f1-score   support

  not-toxic       0.92      0.93      0.92     31988
      toxic       0.60      0.59      0.59      5962

avg / total       0.87      0.87      0.87     37950

Confusion Matrix (Clean):
[[29620  2368]
 [ 2470  3492]]
CPU times: user 908 ms, sys: 16.3 ms, total: 924 ms
Wall time: 924 ms


## How a Pipeline Is Setup

In [None]:
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('model', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])