In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from my_measures import BinaryClassificationPerformance
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [96]:
help(BinaryClassificationPerformance)

Help on class BinaryClassificationPerformance in module my_measures:

class BinaryClassificationPerformance(builtins.object)
 |  BinaryClassificationPerformance(predictions, labels, desc, probabilities=None)
 |  
 |  Performance measures to evaluate the fit of a binary classification model, v1.02
 |  
 |  Methods defined here:
 |  
 |  __init__(self, predictions, labels, desc, probabilities=None)
 |      Initialize attributes: predictions-vector of predicted values for Y, labels-vector of labels for Y
 |  
 |  compute_measures(self)
 |      Compute performance measures defined by Flach p. 57
 |  
 |  img_indices(self)
 |      Get the indices of true and false positives to be able to locate the corresponding images in a list of image names
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the obj

In [106]:
# function that takes raw data and completes all preprocessing required before model fits
def process_raw_data(fn, my_random_seed, test=False):
    # read and summarize data




# QUALITATIVE FEATURES
# remove self-identifying toxic measures
    toxic_data = pd.read_csv(fn)
    if (not test):
        # add an indicator for obscene, threat, insult, or indentity hate
        toxic_data['any_toxic'] = (toxic_data['obscene'] + toxic_data['threat'] + toxic_data['insult'] + toxic_data['identity_hate'] > 0 )
        # print("toxic_data is:", type(toxic_data))
        # print("toxic_data has", toxic_data.shape[0], "rows and", toxic_data.shape[1], "columns", "\n")
        # print("the data types for each of the columns in toxic_data:")
        # print(toxic_data.dtypes, "\n")
        # print("The first 10 rows in toxic_data:")
        # print(toxic_data.head(10))
        # if (not test):
        #     print("The rate of 'toxic' Wikipedia comments in the dataset: ")
        #     print(toxic_data['any_toxic'].mean())

    # vectorize Bag of Words from review text; as sparse matrix
    hv = HashingVectorizer(n_features=2 ** 17, alternate_sign=False)
    X_hv = hv.fit_transform(toxic_data.comment_text)
    print("Shape of HashingVectorizer X:")
    print(X_hv.shape)
    
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
    transformer = TfidfTransformer()
    X_tfidf = transformer.fit_transform(X_hv)
    



# QUANTITATIVE FEATURES
# number-based features from toxic comments to add to feature set

    # count of excessive exclamation points
    toxic_data['exclamation_count'] = toxic_data['comment_text'].str.count("\!\!\!")

    # boolean all-caps responses
    toxic_data_isupper = toxic_data['comment_text'].str.isupper(
            # if isupper_count is False
            #     print('0')
            # else:
            #     print('1')
                )  

    # transform booleans to integers
    def boolstr_to_floatstr(b):
      if b == 'True':
          return '1'
      elif b == 'False':
          return '0'
      else:
          return b

    toxic_data['caps_count'] = np.vectorize(boolstr_to_floatstr)(toxic_data_isupper).astype(float)


    # count of use of the slang "sjw"
    toxic_data['sjw_count'] = toxic_data['comment_text'].str.count("sjw")


    X_quant_features = toxic_data[["exclamation_count", "caps_count", "sjw_count"]]
    print("Quantitative features include exclamation point count, uppercase usage, and count of disparaging language: ")
    print(X_quant_features.head(10))
    
    # Combine all quantitative features into a single sparse matrix
    X_quant_features_csr = csr_matrix(X_quant_features)
    X_combined = hstack([X_tfidf, X_quant_features_csr])
    X_matrix = csr_matrix(X_combined) # convert to sparse matrix
    print("Size of combined bag of words and new quantitative variables matrix:")
    print(X_matrix.shape)
    
    # Create `X`, scaled matrix of features
    # feature scaling
    sc = StandardScaler(with_mean=False)
    X = sc.fit_transform(X_matrix)
    print(X.shape)
    if (not test):
        y = toxic_data['any_toxic']
    
    # Create Training and Test Sets
    # enter an integer for the random_state parameter; any integer will work
    if (test):
        X_submission_test = X
        print("Shape of X_test for submission:")
        print(X_submission_test.shape)
        print('SUCCESS!')
        return(toxic_data, X_submission_test)
    else: 
        X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = train_test_split(X, y, toxic_data, test_size=0.2, random_state=666)
        print("Shape of X_train and X_test:")
        print(X_train.shape)
        print(X_test.shape)
        print("Shape of y_train and y_test:")
        print(y_train.shape)
        print(y_test.shape)
        print("Shape of X_raw_train and X_raw_test:")
        print(X_raw_train.shape)
        print(X_raw_test.shape)
        print('SUCCESS!')
        return(X_train, X_test, y_train, y_test, X_raw_train, X_raw_test)

In [107]:
# CHANGE FILE PATH and my_random_seed number: 
X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = process_raw_data(fn='/Users/smolloy/Dev/parsons/ml-2020_data/toxiccomments_train.csv', my_random_seed=42)

Shape of HashingVectorizer X:
(159571, 131072)
Quantitative features include exclamation point count, uppercase usage, and count of disparaging language: 
   exclamation_count  caps_count  sjw_count
0                  0         0.0          0
1                  0         0.0          0
2                  0         0.0          0
3                  0         0.0          0
4                  0         0.0          0
5                  0         0.0          0
6                  0         1.0          0
7                  0         0.0          0
8                  0         0.0          0
9                  0         0.0          0
Size of combined bag of words and new quantitative variables matrix:
(159571, 131075)
(159571, 131075)
Shape of X_train and X_test:
(127656, 131075)
(31915, 131075)
Shape of y_train and y_test:
(127656,)
(31915,)
Shape of X_raw_train and X_raw_test:
(127656, 12)
(31915, 12)
SUCCESS!


In [108]:
# logistical regression model - most accurate in sample data

from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log')
lgs.fit(X_train, y_train)

lgs_performance_train = BinaryClassificationPerformance(lgs.predict(X_train), y_train, 'lgs_train')
lgs_performance_train.compute_measures()
print(lgs_performance_train.performance_measures)

{'Pos': 8366, 'Neg': 119290, 'TP': 8297, 'TN': 119219, 'FP': 71, 'FN': 69, 'Accuracy': 0.9989033026258068, 'Precision': 0.9915152963671128, 'Recall': 0.991752330863017, 'desc': 'lgs_train'}


In [110]:
# linear SVM model

from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(X_train, y_train)

svm_performance_train = BinaryClassificationPerformance(svm.predict(X_train), y_train, 'svm_train')
svm_performance_train.compute_measures()
print(svm_performance_train.performance_measures)

{'Pos': 8366, 'Neg': 119290, 'TP': 8265, 'TN': 119210, 'FP': 80, 'FN': 101, 'Accuracy': 0.9985821269662217, 'Precision': 0.9904134212103056, 'Recall': 0.9879273248864451, 'desc': 'svm_train'}


In [111]:
# ols model, usually bad

from sklearn import linear_model
ols = linear_model.SGDClassifier(loss="squared_loss")
ols.fit(X_train, y_train)

ols_performance_train = BinaryClassificationPerformance(ols.predict(X_train), y_train, 'ols_train')
ols_performance_train.compute_measures()
print(ols_performance_train.performance_measures)

{'Pos': 8366, 'Neg': 119290, 'TP': 4342, 'TN': 59862, 'FP': 59428, 'FN': 4024, 'Accuracy': 0.5029454158049759, 'Precision': 0.0680884428414615, 'Recall': 0.5190054984460913, 'desc': 'ols_train'}


In [112]:
# naive bayes, variables must be independent (these happen to be)

from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(X_train, y_train)

nbs_performance_train = BinaryClassificationPerformance(nbs.predict(X_train), y_train, 'nbs_train')
nbs_performance_train.compute_measures()
print(nbs_performance_train.performance_measures)

{'Pos': 8366, 'Neg': 119290, 'TP': 8263, 'TN': 109395, 'FP': 9895, 'FN': 103, 'Accuracy': 0.9216801403772639, 'Precision': 0.45506113008040533, 'Recall': 0.9876882620129094, 'desc': 'nbs_train'}


In [113]:
# perceptron

from sklearn import linear_model
prc = linear_model.SGDClassifier(loss='perceptron')
prc.fit(X_train, y_train)

prc_performance_train = BinaryClassificationPerformance(prc.predict(X_train), y_train, 'prc_train')
prc_performance_train.compute_measures()
print(prc_performance_train.performance_measures)

{'Pos': 8366, 'Neg': 119290, 'TP': 8271, 'TN': 119156, 'FP': 134, 'FN': 95, 'Accuracy': 0.9982061164379269, 'Precision': 0.9840571088637715, 'Recall': 0.9886445135070524, 'desc': 'prc_train'}


In [None]:
# ridge regression classifier

from sklearn import linear_model
rdg = linear_model.RidgeClassifier()
rdg.fit(X_train, y_train)

rdg_performance_train = BinaryClassificationPerformance(rdg.predict(X_train), y_train, 'rdg_train')
rdg_performance_train.compute_measures()
print(rdg_performance_train.performance_measures)

In [None]:
# random forrest classifier

from sklearn.ensemble import RandomForestClassifier
rdf = RandomForestClassifier(max_depth=2, random_state=0)
rdf.fit(X_train, y_train)

rdf_performance_train = BinaryClassificationPerformance(rdf.predict(X_train), y_train, 'rdf_train')
rdf_performance_train.compute_measures()
print(rdf_performance_train.performance_measures)