In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from my_measures import BinaryClassificationPerformance
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [10]:
help(BinaryClassificationPerformance)

Help on class BinaryClassificationPerformance in module my_measures:

class BinaryClassificationPerformance(builtins.object)
 |  BinaryClassificationPerformance(predictions, labels, desc, probabilities=None)
 |  
 |  Performance measures to evaluate the fit of a binary classification model, v1.02
 |  
 |  Methods defined here:
 |  
 |  __init__(self, predictions, labels, desc, probabilities=None)
 |      Initialize attributes: predictions-vector of predicted values for Y, labels-vector of labels for Y
 |  
 |  compute_measures(self)
 |      Compute performance measures defined by Flach p. 57
 |  
 |  img_indices(self)
 |      Get the indices of true and false positives to be able to locate the corresponding images in a list of image names
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the obj

In [37]:
# function that takes raw data and completes all preprocessing required before model fits
def process_raw_data(fn, my_random_seed, test=False):
    # read and summarize data




# QUALITATIVE FEATURES
# remove self-identifying toxic measures
    toxic_data = pd.read_csv(fn)
    if (not test):
        # add an indicator for obscene, threat, insult, or indentity hate
        
        # testing removing all but one feature
        # toxic_data['any_toxic'] = ( toxic_data['obscene'] > 0 )
        # toxic_data['any_toxic'] = ( toxic_data['threat'] > 0 )     
        # toxic_data['any_toxic'] = ( toxic_data['insult'] > 0 )       
        # toxic_data['any_toxic'] = ( toxic_data['identity_hate'] > 0 )     

        toxic_data['any_toxic'] = (toxic_data['obscene'] + toxic_data['threat'] + toxic_data['insult'] + toxic_data['identity_hate'] > 0 )
        # print("toxic_data is:", type(toxic_data))
        # print("toxic_data has", toxic_data.shape[0], "rows and", toxic_data.shape[1], "columns", "\n")
        # print("the data types for each of the columns in toxic_data:")
        # print(toxic_data.dtypes, "\n")
        # print("The first 10 rows in toxic_data:")
        # print(toxic_data.head(10))
        # if (not test):
        #     print("The rate of 'toxic' Wikipedia comments in the dataset: ")
        #     print(toxic_data['any_toxic'].mean())

    # vectorize Bag of Words from review text; as sparse matrix
    if (not test): # fit_transform()
        hv = HashingVectorizer(n_features=2 ** 17, alternate_sign=False)
        X_hv = hv.fit_transform(toxic_data.comment_text)
        fitted_transformations.append(hv)
        print("Shape of HashingVectorizer X:")
        print(X_hv.shape)
    else: # transform() 
        X_hv = fitted_transformations[0].transform(toxic_data.comment_text)
        print("Shape of HashingVectorizer X:")
        print(X_hv.shape)
        
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
    if (not test):
        transformer = TfidfTransformer()
        X_tfidf = transformer.fit_transform(X_hv)
        fitted_transformations.append(transformer)
    else:
        X_tfidf = fitted_transformations[1].transform(X_hv)
    



# QUANTITATIVE FEATURES
# number-based features from toxic comments to add to feature set

    # count of excessive exclamation points
    toxic_data['exclamations'] = toxic_data['comment_text'].str.count("\!\!\!")

    # boolean all-caps responses
    toxic_data_isupper = toxic_data['comment_text'].str.isupper(
            # if isupper_count is False
            #     print('0')
            # else:
            #     print('1')
                )  

    # transform booleans to integers
    def boolstr_to_floatstr(b):
      if b == 'True':
          return '1'
      elif b == 'False':
          return '0'
      else:
          return b

    toxic_data['allCaps'] = np.vectorize(boolstr_to_floatstr)(toxic_data_isupper).astype(float)


    # count of use of the slang "sjw"
    toxic_data['sjw_count'] = toxic_data['comment_text'].str.count("sjw")


    X_quant_features = toxic_data[["exclamations", "allCaps", "sjw_count"]]
    X_quant_features_csr = csr_matrix(X_quant_features)
    X_combined = hstack([X_tfidf, X_quant_features_csr])
    X_matrix = csr_matrix(X_combined) # convert to sparse matrix

    print("Quantitative features include exclamation point count, uppercase usage, and count of disparaging language: ")
    print(X_quant_features.head(10))




    #COMBINING FEATURES    m
    # Create `X`, scaled matrix of features
    # feature scaling
    if (not test):
        sc = StandardScaler(with_mean=False)
        X = sc.fit_transform(X_matrix)
        fitted_transformations.append(sc)
        print(X.shape)
        y = toxic_data['any_toxic']
    else:
        X = fitted_transformations[2].transform(X_matrix)
        print(X.shape)
    
    # Create Training and Test Sets
    # enter an integer for the random_state parameter; any integer will work
    if (test):
        X_submission_test = X
        print("Shape of X_test for submission:")
        print(X_submission_test.shape)
        print('SUCCESS!')
        return(toxic_data, X_submission_test)
    else: 
        X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = train_test_split(X, y, toxic_data, test_size=0.2, random_state=my_random_seed)
        print("Shape of X_train and X_test:")
        print(X_train.shape)
        print(X_test.shape)
        print("Shape of y_train and y_test:")
        print(y_train.shape)
        print(y_test.shape)
        print("Shape of X_raw_train and X_raw_test:")
        print(X_raw_train.shape)
        print(X_raw_test.shape)
        print('SUCCESS!')
        return(X_train, X_test, y_train, y_test, X_raw_train, X_raw_test)

In [38]:
# create an empty list to store any use of fit_transform() to transform() later
# it is a global list to store model and feature extraction fits
fitted_transformations = []

# CHANGE FILE PATH and my_random_seed number (any integer other than 74 will do): 
X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = process_raw_data(fn='/Users/smolloy/Dev/parsons/ml-2020_data/toxiccomments_train.csv', my_random_seed=42)

print("Number of fits stored in `fitted_transformations` list: ")
print(len(fitted_transformations))

Shape of HashingVectorizer X:
(159571, 131072)
Quantitative features include exclamation point count, uppercase usage, and count of disparaging language: 
   exclamations  allCaps  sjw_count
0             0      0.0          0
1             0      0.0          0
2             0      0.0          0
3             0      0.0          0
4             0      0.0          0
5             0      0.0          0
6             0      1.0          0
7             0      0.0          0
8             0      0.0          0
9             0      0.0          0
(159571, 131075)
Shape of X_train and X_test:
(127656, 131075)
(31915, 131075)
Shape of y_train and y_test:
(127656,)
(31915,)
Shape of X_raw_train and X_raw_test:
(127656, 12)
(31915, 12)
SUCCESS!
Number of fits stored in `fitted_transformations` list: 
3


In [40]:
# logistical regression model - most accurate in sample data

from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log')
lgs.fit(X_train, y_train)

lgs_performance_train = BinaryClassificationPerformance(lgs.predict(X_train), y_train, 'lgs_train')
lgs_performance_train.compute_measures()
print(lgs_performance_train.performance_measures)

{'Pos': 8363, 'Neg': 119293, 'TP': 8290, 'TN': 119183, 'FP': 110, 'FN': 73, 'Accuracy': 0.9985664598608761, 'Precision': 0.986904761904762, 'Recall': 0.9912710749730957, 'desc': 'lgs_train'}


In [42]:
lgs_predictions = lgs.predict(X_train)

In [43]:
# SUBMISSION CODE: 
raw_data, X_test_submission = process_raw_data(fn='/Users/smolloy/Dev/parsons/ml-2020_data/toxiccomments_train.csv', my_random_seed=42, test=True)
print("Number of rows in the submission test set (should be 153,164): ")

Shape of HashingVectorizer X:
(159571, 131072)
Quantitative features include exclamation point count, uppercase usage, and count of disparaging language: 
   exclamations  allCaps  sjw_count
0             0      0.0          0
1             0      0.0          0
2             0      0.0          0
3             0      0.0          0
4             0      0.0          0
5             0      0.0          0
6             0      1.0          0
7             0      0.0          0
8             0      0.0          0
9             0      0.0          0
(159571, 131075)
Shape of X_test for submission:
(159571, 131075)
SUCCESS!
Number of rows in the submission test set (should be 153,164): 


In [45]:
# store the id from the raw data
my_submission = pd.DataFrame(raw_data["id"])

# concatenate predictions to the id i.e. lgs.predict for logistical regression model
my_submission["prediction"] = lgs.predict(X_test_submission)

# look at the proportion of positive predictions
print(my_submission['prediction'].mean())

0.06831441803335192


In [46]:
raw_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,exclamations,allCaps,sjw_count
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,0.0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,0.0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0.0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0.0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0.0,0


In [47]:
my_submission.head()

Unnamed: 0,id,prediction
0,0000997932d777bf,False
1,000103f0d9cfb60f,False
2,000113f07ec002fd,False
3,0001b41b1c6bb37e,False
4,0001d958c54c6e35,False


In [48]:
my_submission.shape

(159571, 2)

In [49]:
# EXPORT FINAL SUBMISSION:
my_submission.to_csv('/Users/smolloy/Dev/parsons/ml-2020/jupyter/_firstProject/TC/molloy_toxiccommentsV1_submission.csv', index=False)