In [None]:
import pylab
import sklearn
from sklearn.utils import Bunch
from sklearn import metrics
import os
import nltk
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Location of the labelled comments CSV. Set the COMMENT_DATA_PATH environment
# variable to point somewhere else; when it is unset the original author's
# local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'COMMENT_DATA_PATH',
    '/Users/gregdecanio/Desktop/CAP4770/Group7_Project/CAP4770Group7/new_data.csv')
comment_data = pd.read_csv(DATA_PATH)

In [None]:
#comment_data.head()

In [None]:
#%matplotlib inline
#plot = comment_data['target'].hist(bins=20)
#print comment_data['target'].describe()

# Adding Discrete Classification to Data
### I will be classifying the data into 2 categories:
Not toxic: target < 0.5  
Toxic: target >= 0.5

In [None]:
%%time
def classifier(row, threshold=0.5):
    """Map a row's continuous 'target' score to a discrete toxicity label.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with 'target' (e.g. a DataFrame row or a dict).
    threshold : float, optional
        Scores greater than or equal to this value are labelled 'toxic'.
        Defaults to 0.5.

    Returns
    -------
    str
        'toxic' when target >= threshold, 'not-toxic' when target < threshold,
        and 'undefined' when neither comparison holds (e.g. the score is NaN).
    """
    score = row['target']
    if score >= threshold:
        return 'toxic'
    elif score < threshold:
        return 'not-toxic'
    else:
        # Reached only when both comparisons are False, as happens with NaN.
        return 'undefined'

# Label every comment, then tally the labels once and reuse that tally for the
# bar chart and the printed counts.
comment_data['toxicity_classification'] = comment_data.apply(classifier, axis=1)

class_counts = comment_data.groupby(['toxicity_classification']).size()
class_counts.plot(kind='bar')
print(class_counts)
# .get(..., 0) reports zero instead of raising KeyError when a class is absent.
# The single-argument print form gives the same text on Python 2 and 3.
print("Non-toxic count:  %s" % (class_counts.get('not-toxic', 0),))
print("Toxic count:  %s" % (class_counts.get('toxic', 0),))

# Cleaning Comment Text
## Removing Stop Words
https://stackoverflow.com/questions/19560498/faster-way-to-remove-stop-words-in-python  
https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
## Removing Punctuation
https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string  
According to the Stack Overflow answer above, there is probably a more efficient way to remove punctuation. However, the current approach works and is adequate for now.

In [None]:
%%time
#Removing stop words from the comments, where stop words are defined in NLTK stop words dictionary
from nltk.corpus import stopwords
# Held as a set so each membership test below is a hash lookup rather than a
# scan of the whole stop-word list for every word of every comment.
stopWords = set(stopwords.words("english"))

def removeStopWordsInComment(row):
    """Return row['comment_text'] with stop words removed.

    The text is split on whitespace, tokens found in ``stopWords`` are
    dropped, and the remaining tokens are rejoined with single spaces.

    NOTE(review): the comparison is case-sensitive and runs before punctuation
    is stripped, so tokens such as "The" or "the," are presumably kept --
    confirm whether that is intended.
    """
    return ' '.join(word for word in row['comment_text'].split() if word not in stopWords)
    
# Assigning the apply() result creates the column, so the earlier empty-string
# placeholder assignment was a dead store and has been dropped.
comment_data['comment_no_stop_words'] = comment_data.apply(removeStopWordsInComment, axis=1)

In [None]:
%%time
#Removing punctuation from the comments, where punctuation is defined by string.punctuation
import string
punc = set(string.punctuation)

def removePunctuation(row):
    """Return row['comment_no_stop_words'] with every character in ``punc`` removed.

    All other characters, including whitespace, are kept in their original order.
    """
    kept_chars = []
    for ch in row['comment_no_stop_words']:
        if ch in punc:
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

# Assigning the apply() result creates the column, so the earlier empty-string
# placeholder assignment was a dead store and has been dropped.
comment_data['comment_no_stop_punc'] = comment_data.apply(removePunctuation, axis=1)

In [None]:
# Show the raw text, its score and label next to both cleaned versions of the text.
preview_columns = [
    'comment_text',
    'target',
    'toxicity_classification',
    'comment_no_stop_words',
    'comment_no_stop_punc',
]
comment_data[preview_columns]

# Splitting Data Into Test/Train Sets 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
train, test = train_test_split(comment_data, test_size=0.33, random_state=42)
# The single-argument print form gives the same text on Python 2 and 3.
print("Training data size:  %s" % (len(train),))
print("Testing data size:  %s" % (len(test),))

In [None]:
# Keep the same four columns in both splits: raw text, score, label and the
# fully cleaned text.
model_columns = ['comment_text', 'target', 'toxicity_classification', 'comment_no_stop_punc']
train = train[model_columns]
test = test[model_columns]
train.head()

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# One vectorizer/transformer pair per text variant, so the raw ("unclean") and
# cleaned comments each get their own vocabulary and weights.
count_unclean_vect = CountVectorizer()
tfidf_unclean_transformer = TfidfTransformer()

count_clean_vect = CountVectorizer()
tfidf_clean_transformer = TfidfTransformer()

#Performing TF and TF-IDF transformation on training data
# Both steps are fitted on the training split here.
# The single-argument print form gives the same text on Python 2 and 3.

train_unclean_counts = count_unclean_vect.fit_transform(train['comment_text'])
print("Dimensions of Unclean Training Data Document Term Matrix:  %s" % (train_unclean_counts.shape,))
train_unclean_tfidf = tfidf_unclean_transformer.fit_transform(train_unclean_counts)
print("Dimensions of Unclean Training Data TF_IDF Matrix:  %s" % (train_unclean_tfidf.shape,))

train_clean_counts = count_clean_vect.fit_transform(train['comment_no_stop_punc'])
print("Dimensions of Clean Training Data Document Term Matrix:  %s" % (train_clean_counts.shape,))
train_clean_tfidf = tfidf_clean_transformer.fit_transform(train_clean_counts)
print("Dimensions of Clean Training Data TF_IDF Matrix:  %s" % (train_clean_tfidf.shape,))

In [None]:
#Performing TF and TF-IDF transformation on test data
# Only transform() is called here, so the test split reuses the already-fitted
# vectorizers and transformers and is never used for fitting.
# The single-argument print form gives the same text on Python 2 and 3.

test_unclean_counts = count_unclean_vect.transform(test['comment_text'])
print("Dimensions of Unclean Test Data Document Term Matrix:  %s" % (test_unclean_counts.shape,))
test_unclean_tfidf = tfidf_unclean_transformer.transform(test_unclean_counts)
print("Dimensions of Unclean Test Data TF_IDF Matrix:  %s" % (test_unclean_tfidf.shape,))

test_clean_counts = count_clean_vect.transform(test['comment_no_stop_punc'])
print("Dimensions of Clean Test Data Document Term Matrix:  %s" % (test_clean_counts.shape,))
test_clean_tfidf = tfidf_clean_transformer.transform(test_clean_counts)
print("Dimensions of Clean Test Data TF_IDF Matrix:  %s" % (test_clean_tfidf.shape,))

# Naive Bayes Classification
https://scikit-learn.org/0.19/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [None]:
%%time
#Fit a Multinomial Naive Bayes model on the TF-IDF features of the raw (unclean) comments
from sklearn.naive_bayes import MultinomialNB
modelNB_unclean = MultinomialNB().fit(
    X=train_unclean_tfidf,
    y=train['toxicity_classification'],
)

In [None]:
%%time
#Fit a Multinomial Naive Bayes model on the TF-IDF features of the cleaned comments
modelNB_clean = MultinomialNB().fit(
    X=train_clean_tfidf,
    y=train['toxicity_classification'],
)

In [None]:
%%time
#Calculate accuracy of predictions
# Accuracy is the fraction of test rows whose predicted label equals the true label.
# The single-argument print form gives the same text on Python 2 and 3.
predsNB_unclean = modelNB_unclean.predict(test_unclean_tfidf)
accNB_unclean = np.mean(predsNB_unclean == test['toxicity_classification'])
print("Accuracy of Naive Bayes Classifier (Unclean):  %s \n" % (accNB_unclean,))

predsNB_clean = modelNB_clean.predict(test_clean_tfidf)
accNB_clean = np.mean(predsNB_clean == test['toxicity_classification'])
print("Accuracy of Naive Bayes Classifier (Clean):  %s \n" % (accNB_clean,))

In [None]:
%%time
#Show classification report and confusion matrix of predictions
# The single-argument print form gives the same text on Python 2 and 3.
print(metrics.classification_report(test['toxicity_classification'], predsNB_unclean))
print("Confusion Matrix (Unclean):\n%s"
      % (metrics.confusion_matrix(test['toxicity_classification'], predsNB_unclean),))

print(metrics.classification_report(test['toxicity_classification'], predsNB_clean))
print("Confusion Matrix (Clean):\n%s"
      % (metrics.confusion_matrix(test['toxicity_classification'], predsNB_clean),))

# KNN Classification
Currently the number of neighbors is arbitrarily set to 2 (Greg, 11/20/2019)  
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier  
https://scikit-learn.org/stable/modules/neighbors.html

In [None]:
%%time
#Fit a K Nearest Neighbors model (2 neighbors, brute-force search) on the raw (unclean) TF-IDF features
from sklearn.neighbors import KNeighborsClassifier
modelKNN_unclean = KNeighborsClassifier(n_neighbors=2, algorithm='brute').fit(
    X=train_unclean_tfidf,
    y=train['toxicity_classification'],
)

In [None]:
%%time
#Fit a K Nearest Neighbors model (2 neighbors, brute-force search) on the cleaned TF-IDF features
from sklearn.neighbors import KNeighborsClassifier
modelKNN_clean = KNeighborsClassifier(n_neighbors=2, algorithm='brute').fit(
    X=train_clean_tfidf,
    y=train['toxicity_classification'],
)

In [None]:
#%%time
#Calculate accuracy of predictions
#predsKNN_unclean = modelKNN_unclean.predict(test_unclean_tfidf)
#accKNN_unclean = np.mean(predsKNN_unclean == test['toxicity_classification'])
#print "Accuracy of K Nearest Neighbors Classifier (Unclean): ", accKNN_unclean

#predsKNN_clean = modelKNN_clean.predict(test_clean_tfidf)
#accKNN_clean = np.mean(predsKNN_clean == test['toxicity_classification'])
#print "Accuracy of K Nearest Neighbors Classifier (Clean): ", accKNN_clean

In [None]:
#%%time
#Show confusion matrix of predictions
#print(metrics.classification_report(test['toxicity_classification'], predsKNN_unclean))
#print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsKNN_unclean)

#print(metrics.classification_report(test['toxicity_classification'], predsKNN_clean))
#print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsKNN_clean)

# SVC Classification
https://scikit-learn.org/stable/modules/svm.html#classification

In [None]:
#%%time
#Training and creating the SVC model (unclean)
#from sklearn import svm
#modelSVC_unclean = svm.SVC(kernel='linear').fit(train_unclean_tfidf, train['toxicity_classification'])

In [None]:
#%%time
#Training and creating the SVC model (clean)
#modelSVC_clean = svm.SVC(kernel='linear').fit(train_clean_tfidf, train['toxicity_classification'])

In [None]:
#%%time
#Calculate accuracy of predictions
#predsSVC_unclean = modelSVC_unclean.predict(test_unclean_tfidf)
#accSVC_unclean = np.mean(predsSVC_unclean == test['toxicity_classification'])
#print "Accuracy of SVC Classifier (Unclean): ", accSVC_unclean

#predsSVC_clean = modelSVC_clean.predict(test_clean_tfidf)
#accSVC_clean = np.mean(predsSVC_clean == test['toxicity_classification'])
#print "Accuracy of SVC Classifier (Clean): ", accSVC_clean

In [None]:
#%%time
#Show confusion matrix of predictions
#print(metrics.classification_report(test['toxicity_classification'], predsSVC_unclean))
#print "Confusion Matrix (Unclean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsSVC_unclean)

#print(metrics.classification_report(test['toxicity_classification'], predsSVC_clean))
#print "Confusion Matrix (Clean):\n", metrics.confusion_matrix(test['toxicity_classification'], predsSVC_clean)



# SGD Classification
https://scikit-learn.org/stable/modules/sgd.html#classification

In [None]:
%%time
#Training and creating the SGD Classifier model (unclean)
from sklearn.linear_model import SGDClassifier
# random_state is pinned so repeated runs give the same model; 42 matches the
# seed used for the train/test split.
modelSGD_unclean = SGDClassifier(loss='hinge', penalty='l2',
                                 max_iter=5, random_state=42
                                 ).fit(train_unclean_tfidf, train['toxicity_classification'])

In [None]:
%%time
#Training and creating the SGD Classifier model (clean)
# random_state is pinned so repeated runs give the same model; 42 matches the
# seed used for the train/test split.
modelSGD_clean = SGDClassifier(loss='hinge', penalty='l2',
                               max_iter=5, random_state=42
                               ).fit(train_clean_tfidf, train['toxicity_classification'])

In [None]:
%%time
#Calculate accuracy of predictions
# Accuracy is the fraction of test rows whose predicted label equals the true label.
# The single-argument print form gives the same text on Python 2 and 3.
predsSGD_unclean = modelSGD_unclean.predict(test_unclean_tfidf)
accSGD_unclean = np.mean(predsSGD_unclean == test['toxicity_classification'])
print("Accuracy of SGD Classifier (Unclean):  %s \n" % (accSGD_unclean,))

predsSGD_clean = modelSGD_clean.predict(test_clean_tfidf)
accSGD_clean = np.mean(predsSGD_clean == test['toxicity_classification'])
print("Accuracy of SGD Classifier (Clean):  %s \n" % (accSGD_clean,))

In [None]:
%%time
#Show classification report and confusion matrix of predictions
# The single-argument print form gives the same text on Python 2 and 3.
print(metrics.classification_report(test['toxicity_classification'], predsSGD_unclean))
print("Confusion Matrix (Unclean):\n%s"
      % (metrics.confusion_matrix(test['toxicity_classification'], predsSGD_unclean),))

print(metrics.classification_report(test['toxicity_classification'], predsSGD_clean))
print("Confusion Matrix (Clean):\n%s"
      % (metrics.confusion_matrix(test['toxicity_classification'], predsSGD_clean),))

## How a Pipeline Is Set Up

In [None]:
# Pipeline was never imported in this notebook, so the cell raised NameError.
from sklearn.pipeline import Pipeline

# Chain the three steps the cells above run by hand (token counts -> TF-IDF
# weighting -> SGD classifier) into one estimator.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', SGDClassifier(loss='hinge', penalty='l2',
                            alpha=1e-3, random_state=42,
                            max_iter=5, tol=None)),
])