In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from my_measures import BinaryClassificationPerformance
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

help(BinaryClassificationPerformance)

In [19]:
f = '/Users/smolloy/Dev/parsons/ml-2020_data/toxiccomments_train.csv'
toxic_data = pd.read_csv(f)

In [4]:
print("toxic_data is:", type(toxic_data))
print("toxic_data has", toxic_data.shape[0], "rows and", toxic_data.shape[1], "columns", "\n")
print("the data types for each of the columns in toxic_data:")
print(toxic_data.dtypes, "\n")
print("the first 10 rows in toxic_data:")
print(toxic_data.head(5))

toxic_data is: <class 'pandas.core.frame.DataFrame'>
toxic_data has 159571 rows and 8 columns 

the data types for each of the columns in toxic_data:
id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object 

the first 10 rows in toxic_data:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1     

In [5]:
print("The rate of toxic comments in the dataset: ")
print(toxic_data['toxic'].mean())

The rate of toxic comments in the dataset: 
0.09584448302009764


In [6]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
corpus = toxic_data.comment_text.as_matrix()
X_bag_of_words = vectorizer.fit_transform(corpus)
print(X_bag_of_words.toarray())

  after removing the cwd from sys.path.


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [7]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features= 2 ** 17, alternate_sign=True)
X_hv = hv.fit_transform(toxic_data.comment_text)
print(X_hv.shape)

(159571, 131072)


In [8]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_hv)

In [9]:
print(type(X_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


In [10]:
# features from Amazon.csv to add to feature set
toxic_data['word_count'] = toxic_data['comment_text'].str.split(' ').str.len()
toxic_data['punc_count'] = toxic_data['comment_text'].str.count("\.")


X_quant_features = toxic_data[["word_count", "punc_count"]]
print(X_quant_features.head(10))
print(type(X_quant_features))

   word_count  punc_count
0          42           5
1          18           2
2          42           3
3         112           3
4          13           1
5          12           1
6           8           0
7          21           2
8          83           7
9          12           0
<class 'pandas.core.frame.DataFrame'>


In [11]:
from scipy.sparse import csr_matrix, hstack
X_quant_features_csr = csr_matrix(X_quant_features)
X_combined = hstack([X_tfidf, X_quant_features_csr])
X_matrix = csr_matrix(X_combined) # convert to sparse matrix
print(X_matrix.shape)

(159571, 131074)


In [12]:
# look at an example of a "row" of a sparse matrix
print(X_matrix[1234])

  (0, 13080)	-0.3352837541940735
  (0, 20637)	0.12273750000524668
  (0, 24734)	-0.05756850399552078
  (0, 28641)	0.30255698578348783
  (0, 36157)	-0.11300117557573076
  (0, 38365)	-0.2858310107057536
  (0, 43099)	0.06265026668613759
  (0, 43902)	-0.17992345892724868
  (0, 45980)	0.1435960247305051
  (0, 46353)	0.13568699421838595
  (0, 49453)	-0.1384992605089221
  (0, 50743)	-0.089351164688054
  (0, 51619)	0.11280487492095764
  (0, 54542)	-0.2221009321228698
  (0, 61069)	-0.41065661467663855
  (0, 65396)	-0.4538351025565183
  (0, 67646)	0.07583345070821097
  (0, 69740)	-0.11164777968958645
  (0, 92259)	0.2072488754885471
  (0, 98369)	0.1522711489306622
  (0, 111869)	0.21053732125232136
  (0, 116871)	-0.12394931615623446
  (0, 131072)	27.0


In [13]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_matrix)
print(X.shape)

(159571, 131074)


In [14]:
# look at an example of a "row" of a sparse matrix, after scaling
print(X[1234])

  (0, 13080)	-38.95725464549216
  (0, 20637)	3.349436911185673
  (0, 24734)	-0.8062888443587518
  (0, 28641)	31.74951192116682
  (0, 36157)	-2.5465838031790473
  (0, 38365)	-25.739199844475827
  (0, 43099)	1.1063300679196704
  (0, 43902)	-4.567234110131841
  (0, 45980)	2.023756518142615
  (0, 46353)	4.339484092685847
  (0, 49453)	-2.861090888355053
  (0, 50743)	-2.211898499002182
  (0, 51619)	2.6826550969136753
  (0, 54542)	-14.869262200434145
  (0, 61069)	-156.61578596302314
  (0, 65396)	-217.89132891482586
  (0, 67646)	1.466911611721104
  (0, 69740)	-3.340770670659583
  (0, 92259)	11.135702525315347
  (0, 98369)	4.93282853017257
  (0, 111869)	14.504120362070436
  (0, 116871)	-3.9167193534092553
  (0, 131072)	0.2686035598369133


In [15]:
# IMPORTANT, enter an integer into the variable below; any integer other than 74
my_random_state = 666

# create training and test sets
from sklearn.model_selection import train_test_split

# enter an integer for the random_state parameter; any integer will work
X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = train_test_split(X, toxic_data['toxic'], toxic_data, test_size= 0.2, random_state=my_random_state)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X_raw_train.shape)
print(X_raw_test.shape)

(127656, 131074)
(31915, 131074)
(127656,)
(31915,)
(127656, 10)
(31915, 10)


In [20]:
# ordinary least squares model
from sklearn import linear_model
ols = linear_model.SGDClassifier(loss="squared_loss")
ols.fit(X_train, y_train)

ols_performance_train = BinaryClassificationPerformance(ols.predict(X_train), y_train, 'ols_train')
ols_performance_train.compute_measures()
print(ols_performance_train.performance_measures)

{'Pos': 12174, 'Neg': 115482, 'TP': 6066, 'TN': 57296, 'FP': 58186, 'FN': 6108, 'Accuracy': 0.49634956445447137, 'Precision': 0.0944095125443566, 'Recall': 0.4982750123213406, 'desc': 'ols_train'}


In [21]:
# svm, linear model
from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(X_train, y_train)

svm_performance_train = BinaryClassificationPerformance(svm.predict(X_train), y_train, 'svm_train')
svm_performance_train.compute_measures()
print(svm_performance_train.performance_measures)

{'Pos': 12174, 'Neg': 115482, 'TP': 12063, 'TN': 115347, 'FP': 135, 'FN': 111, 'Accuracy': 0.9980729460424892, 'Precision': 0.9889326119035907, 'Recall': 0.9908822079842287, 'desc': 'svm_train'}


In [22]:
# logistical regression model (note: n_iter has been changed for max_iter)
from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log', max_iter=60, alpha=0.00001)
lgs.fit(X_train, y_train)

lgs_performance_train = BinaryClassificationPerformance(lgs.predict(X_train), y_train, 'lgs_train')
lgs_performance_train.compute_measures()
print(lgs_performance_train.performance_measures)

{'Pos': 12174, 'Neg': 115482, 'TP': 12073, 'TN': 115355, 'FP': 127, 'FN': 101, 'Accuracy': 0.9982139499905998, 'Precision': 0.9895901639344262, 'Recall': 0.9917036306883522, 'desc': 'lgs_train'}




In [24]:
# naive bayes model
from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(X_train, y_train)

nbs_performance_train = BinaryClassificationPerformance(nbs.predict(X_train), y_train, 'nbs_train')
nbs_performance_train.compute_measures()
print(nbs_performance_train.performance_measures)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [25]:
# Perceptron
from sklearn import linear_model
prc = linear_model.SGDClassifier(loss='perceptron')
prc.fit(X_train, y_train)

prc_performance_train = BinaryClassificationPerformance(prc.predict(X_train), y_train, 'prc_train')
prc_performance_train.compute_measures()
print(prc_performance_train.performance_measures)

{'Pos': 12174, 'Neg': 115482, 'TP': 12041, 'TN': 115318, 'FP': 164, 'FN': 133, 'Accuracy': 0.997673434856176, 'Precision': 0.9865628840639082, 'Recall': 0.9890750780351569, 'desc': 'prc_train'}


In [None]:
# Ridge Regression Classifier model
from sklearn import linear_model
rdg = linear_model.RidgeClassifier()
rdg.fit(X_train, y_train)

rdg_performance_train = BinaryClassificationPerformance(rdg.predict(X_train), y_train, 'rdg_train')
rdg_performance_train.compute_measures()
print(rdg_performance_train.performance_measures)

In [27]:
# random forest classifier model
from sklearn.ensemble import RandomForestClassifier
rdf = RandomForestClassifier(max_depth=2, random_state=0)
rdf.fit(X_train, y_train)

rdf_performance_train = BinaryClassificationPerformance(rdf.predict(X_train), y_train, 'rdf_train')
rdf_performance_train.compute_measures()
print(rdf_performance_train.performance_measures)

{'Pos': 12174, 'Neg': 115482, 'TP': 0, 'TN': 115482, 'FP': 0, 'FN': 12174, 'Accuracy': 0.9046343297612334, 'Precision': nan, 'Recall': 0.0, 'desc': 'rdf_train'}


  self.performance_measures['Precision'] = self.performance_measures['TP'] / (self.performance_measures['TP'] + self.performance_measures['FP'])


In [30]:
fits = [ols_performance_train, svm_performance_train, lgs_performance_train, nbs_performance_train, prc_performance_train, rdg_performance_train, rdf_performance_train]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, 1, 0, 1])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

NameError: name 'nbs_performance_train' is not defined