# Train Logistic Regression using a TF-IDF Vectorizer

In [1]:
# Load the cleaned dataset; the 'id' column becomes the DataFrame index
import pandas as pd

train_data = pd.read_csv(
    '../../data/cleaned_data.csv',
    index_col='id',
)

In [2]:
# Convert the comment text into a sparse TF-IDF feature matrix
from sklearn.feature_extraction.text import TfidfVectorizer

tfid_vectorizer = TfidfVectorizer()
# NOTE(review): the vectorizer is fit on every row of train_data here, and the
# train/test split only happens afterwards on the transformed matrix, so the
# vocabulary and IDF weights also see the rows that end up in the test set.
# Consider splitting the raw text first and fitting on the train part only.
tfid_tranformed_train_data = tfid_vectorizer.fit_transform(
    train_data['comment_text']
)

In [3]:
# Build the label vector / feature matrix and hold out 20% for evaluation
from sklearn.model_selection import train_test_split
y = train_data.target
X = tfid_tranformed_train_data

# stratify=y splits in a stratified fashion using the target as class labels;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                    test_size=.2, random_state=1, stratify=y)

print("Number of features: ", X_train.shape[1])
print("Number of train samples: ", y_train.shape[0])
# Label corrected: this count is the held-out test partition, not the train one
print("Number of test samples: ", y_test.shape[0])

Number of features:  500866
Number of train samples:  1443538
Number of test samples:  360885


In [13]:
# Fit a logistic-regression classifier on the training split using the SAG
# solver; random_state is fixed so repeated runs shuffle the data identically.
from sklearn.linear_model import LogisticRegression

trained_Logistic_Regressor = (
    LogisticRegression(solver='sag', random_state=0)
    .fit(X_train, y_train)
)

In [14]:
# Predict a class label for every sample in the held-out test split
Logistic_Regressor_predictions = trained_Logistic_Regressor.predict(
    X=X_test
)

In [15]:
# Locate misclassified samples: positions where the prediction differs from
# the true label
import numpy as np

# Same result as np.where(condition): a tuple holding one array of positions.
# These are positional offsets into y_test, not the original 'id' index values.
misclassified = np.asarray(
    y_test != Logistic_Regressor_predictions
).nonzero()

# NOTE(review): built from a one-element tuple, so this presumably yields a
# single-row frame with one column per misclassified sample -- confirm that
# is the intended layout.
miss_df = pd.DataFrame(data=misclassified)

In [16]:
# Report per-class precision / recall / F1 on the test split
from sklearn import metrics

print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=Logistic_Regressor_predictions,
        target_names=['Non-Toxic', 'Toxic'],
    )
)

              precision    recall  f1-score   support

   Non-Toxic       0.95      0.99      0.97    332018
       Toxic       0.77      0.45      0.57     28867

    accuracy                           0.95    360885
   macro avg       0.86      0.72      0.77    360885
weighted avg       0.94      0.95      0.94    360885

