In [3]:
import pandas as pd
import pickle
import os

# Setup for pickling
dest = '..\\pickles'
if not os.path.exists(dest):
    os.makedirs(dest)


In [4]:
# import cleaned data
train_data = pd.read_csv('../data/cleaned_data.csv', index_col='id')

In [5]:
# Create and pickle things for text transformation (Stop words, stemmer, lematizer)

In [6]:
# Create and pickle hashing vectorizer with best params
from sklearn.feature_extraction.text import HashingVectorizer

hash_vectorizer = HashingVectorizer(alternate_sign=True, n_features=600000, ngram_range=(1,1))
vectorized_train_data = hash_vectorizer.fit_transform(train_data.comment_text)

pickle.dump(hash_vectorizer, open(os.path.join(dest, 'hash_vect.pkl'), 'wb'), protocol=4)

In [7]:
# Select columns and split data
from sklearn.model_selection import train_test_split
y = train_data.target
X = vectorized_train_data

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                    test_size=.2, random_state=1, stratify=y)

In [8]:
# Create, train, and pickle model
from sklearn.linear_model import LogisticRegression
best_lr = LogisticRegression(
    random_state=0, solver='saga', penalty='l2', C=1,
).fit(X_train, y_train)

# Make predictions on test data
Logistic_Regressor_predictions = best_lr.predict(X_test)

# Calculate and show model metrics
from sklearn import metrics
print(metrics.classification_report(y_test, Logistic_Regressor_predictions,target_names=['Non-Toxic', 'Toxic']))

pickle.dump(best_lr, open(os.path.join(dest, 'trained_log_reg.pkl'), 'wb'), protocol=4)

              precision    recall  f1-score   support

   Non-Toxic       0.95      0.99      0.97    332018
       Toxic       0.78      0.44      0.56     28867

    accuracy                           0.94    360885
   macro avg       0.86      0.71      0.77    360885
weighted avg       0.94      0.94      0.94    360885



In [9]:
# Create and pickle hashing vectorizer with best params
from sklearn.feature_extraction.text import HashingVectorizer

hash_vectorizer = HashingVectorizer(alternate_sign=False, n_features=600000, ngram_range=(1,2))
vectorized_train_data = hash_vectorizer.fit_transform(train_data.comment_text)

from sklearn.linear_model import SGDClassifier
best_sgd = SGDClassifier(
    random_state=0, class_weight=None, loss='squared_hinge', penalty='l2', shuffle=False
).fit(X_train, y_train)

pickle.dump(best_sgd, open(os.path.join(dest, 'trained_sgd.pkl'), 'wb'), protocol=4)



In [10]:
# Make predictions on test data
pred = best_sgd.predict(X_test)

# Calculate and show model metrics
from sklearn import metrics
print(metrics.classification_report(y_test, pred, target_names=['Non-Toxic', 'Toxic']))

              precision    recall  f1-score   support

   Non-Toxic       0.95      0.96      0.96    332018
       Toxic       0.49      0.47      0.48     28867

    accuracy                           0.92    360885
   macro avg       0.72      0.71      0.72    360885
weighted avg       0.92      0.92      0.92    360885

