In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

In [2]:
def load_data(filename):
    """
    Read the comment text and its label columns from a CSV file.

    Args:
        filename: path of the CSV file to read (a string)
    returns:
        A pandas dataframe restricted to "comment_text" and the six label
        columns "toxic", "severe_toxic", "obscene", "threat", "insult"
        and "identity_hate"; any other column in the file is dropped.
    """
    text_column = 'comment_text'
    label_columns = ['toxic', 'severe_toxic', 'obscene',
                     'threat', 'insult', 'identity_hate']
    wanted = [text_column] + label_columns
    return pd.read_csv(filename, usecols=wanted)

In [None]:
# Read the training set from the CSV file sitting next to this notebook.
data = load_data(filename="train.csv")

In [None]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

In [None]:
# Separate the data into features (X, the raw comment text) and
# labels (y, one column per toxicity type), then hold out 10% of the
# rows as a test set. random_state is fixed so the split is reproducible.
label_columns = ['toxic', 'severe_toxic', 'obscene',
                 'threat', 'insult', 'identity_hate']
X = data['comment_text']
# Several columns must be selected with a *list* of names; a bare
# comma-separated key is a tuple, which pandas looks up as a single
# column label and rejects with KeyError.
y = data[label_columns]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42)

In [None]:
def class_metrics(y_test, y_preds, average='binary'):
    """
    Print the accuracy, precision, recall, and F1 score of some predictions.

    Args:
        y_test: the array of test labels (0/1)
        y_preds: the array of predicted labels
        average: averaging strategy forwarded to precision_score,
            recall_score and f1_score. The default 'binary' keeps the
            original single-label behaviour; pass e.g. 'micro' or
            'macro' when the labels span several columns/classes,
            where 'binary' is not accepted.
    returns:
        None; the four scores are written to standard output.
    """
    # accuracy_score takes no averaging argument, so it is called as before.
    print('Accuracy score: ', format(accuracy_score(y_test, y_preds)))
    print('Precision score: ',
          format(precision_score(y_test, y_preds, average=average)))
    print('Recall score: ',
          format(recall_score(y_test, y_preds, average=average)))
    print('F1 score: ',
          format(f1_score(y_test, y_preds, average=average)))
    
def draw_roc(y_test, y_preds, save_path="roc_auc.png"):
    """
    Draw the ROC curve, label it with its AUC, save the figure and show it.

    Args:
        y_test: the array of test labels (0/1)
        y_preds: the array of predictions, handed to roc_curve as the
            scores to threshold. Continuous scores such as predicted
            probabilities trace a full curve; hard 0/1 predictions give
            only a single operating point.
        save_path: file the figure is written to before it is shown.
            Defaults to "roc_auc.png"; pass None to skip saving.
    returns:
        None; the plot is drawn on the current matplotlib figure.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_test, y_preds)
    roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Dashed diagonal: the curve of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # Save before show(): showing may leave the current figure empty.
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [None]:
# Form a pipeline of Vectorizer, TFIDF,
# Logistic Regression (one binary model per label column).
# LogisticRegression on its own only accepts a 1-D target, while y_train
# is meant to hold several label columns, so it is wrapped in
# OneVsRestClassifier, which fits one copy of the estimator per column.
lr_pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', OneVsRestClassifier(LogisticRegression()))
                    ])
# train LR pipeline
lr_pipe.fit(X_train, y_train)