In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed

# For example, here are several helpful packages to load in

import os
import string

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import spacy
from funcsigs import signature
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    multilabel_confusion_matrix,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import class_weight, shuffle
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# set data dir: folder expected to contain the train/valid/test CSV files read below
# NOTE(review): a later cell rebinds `data_dir` to a placeholder string - confirm
# which value is the intended one before running the whole notebook
data_dir = "./data/"

In [None]:
# Load the training split (the whole file is read - no row limit is applied)
# and preview the first rows.
train_df = pd.read_csv(f"{data_dir}/train.csv")
train_df.head()

In [None]:
# Load the validation split and preview the first rows.
val_df = pd.read_csv(f"{data_dir}/valid.csv")
val_df.head()

In [None]:
# Load the held-out test split and preview the first rows.
# (This cell used to re-read valid.csv, which made `test_df` a copy of `val_df`.)
test_df = pd.read_csv(f"{data_dir}/test.csv")
test_df.head()

In [None]:
# Separate every split into its raw-text column (features) and its label column
# (target) so they can be passed to scikit-learn.
X_train, y_train = train_df["text"], train_df["label"]
X_val, y_val = val_df["text"], val_df["label"]
X_test, y_test = test_df["text"], test_df["label"]

# Train, etc. using `sklearn` 

In [None]:
# Shared resources used by the cleaning and preprocessing functions below.

# string of ASCII punctuation characters; tokens are tested against it with `in`
punctuations = string.punctuation

# small English spaCy pipeline
# NOTE(review): `nlp` is not referenced by the tokenizer functions in this
# notebook, which use `parser` instead - confirm whether it is still needed
nlp = spacy.load("en_core_web_sm")

# spaCy's built-in set of English stop words
stop_words = STOP_WORDS

In [None]:
# Create a blank spaCy English pipeline, used below purely as a tokenizer.
# NOTE(review): `English()` on its own does not load a tagger, dependency parser,
# NER or word vectors - those only come with a trained package such as the one
# loaded into `nlp` above.
parser = English()

In [None]:
# create spacy tokenizer function


def spacy_tokenizer(sentence):
    """Split ``sentence`` into lower-cased tokens, dropping stop words and punctuation.

    The text is tokenized with the module-level ``parser``. A token's lower-cased
    form is kept unless it is found in ``stop_words`` or in the ``punctuations``
    string.

    args:
        sentence: String -> raw text to tokenize

    returns:
        list of strings (the surviving lower-cased tokens, in document order)
    """
    kept_tokens = []
    for token in parser(sentence):
        lowered = token.lower_
        # skip stop words and anything contained in the punctuation string
        if lowered in stop_words or lowered in punctuations:
            continue
        kept_tokens.append(lowered)

    return kept_tokens


def spacy_tokenizer_lemmatize(sentence):
    """Split ``sentence`` into lower-cased lemmas, dropping stop words and punctuation.

    The text is tokenized with the module-level ``parser``. Each token is
    replaced by its stripped, lower-cased lemma; the lower-cased surface form is
    used instead when the lemma is the ``"-PRON-"`` placeholder or when the
    pipeline supplies no lemma at all (empty string).

    args:
        sentence: String -> raw text to tokenize

    returns:
        list of strings (the surviving lemmas, in document order)
    """
    lemmas = []
    for token in parser(sentence):
        lemma = token.lemma_
        if not lemma or lemma == "-PRON-":
            # no usable lemma: fall back to the token text so it is not lost
            # NOTE(review): a pipeline without a lemmatizer component appears to
            # yield empty lemmas for every token - confirm for the spaCy version in use
            lemmas.append(token.lower_)
        else:
            lemmas.append(lemma.lower().strip())

    # `lemmas` already holds plain strings. The previous second pass calling
    # `.lower_` on them raised AttributeError, so it has been removed.

    # remove STOP WORDS and punctuation
    return [
        word for word in lemmas if word not in stop_words and word not in punctuations
    ]


# custom transformer class that applies basic text cleaning (strip + lower-case)


class predictors(TransformerMixin):
    """Stateless scikit-learn style transformer that cleans raw text.

    ``transform`` runs ``clean_text`` over every item; ``fit`` learns nothing
    and ``get_params`` reports no parameters.
    """

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(text)`` for each ``text`` in ``X``."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def fit(self, x, y=None, **fit_params):
        """Nothing to fit; returns ``self`` so calls can be chained."""
        return self

    def get_params(self, deep=True):
        """This transformer has no parameters, so the result is always empty."""
        return dict()


def clean_text(text):
    """Remove leading/trailing whitespace from ``text`` and lower-case the result."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Show the raw text stored under index label 0 of the training texts.
X_train[0]

In [None]:
# Run the same document through `parser` to inspect the resulting spaCy Doc.
parser(X_train[0])

Wrap preprocessing, vectorization, training, and evaluation into a few reusable functions

In [None]:
def save_trained_model(clf, clf_name_str, save_dir):
    """
    Function: Persist a trained classifier to disk with joblib

    args:

        clf: the fitted model object to save
        clf_name_str: String -> name used for the saved file (without extension)
        save_dir: String -> directory to save into; created if it does not exist

    The model is written to "{save_dir}/{clf_name_str}.joblib".
    """
    os.makedirs(save_dir, exist_ok=True)
    model_path = f"{save_dir}/{clf_name_str}.joblib"
    print(f"Saving trained model at: {model_path}")
    # previously only the message above was printed and nothing was written
    joblib.dump(clf, model_path)


def setup_vectorised_data(
    path_to_data,
    text_col="text",
    label_col="label",
    max_features=50000,
    vectorizer="tfidf",
    pretrained_vect_dir=None,
    save_vectorizer=True,
    save_dir=None,
    ngram_range=(1, 2),
    lemmatize=False,
):

    """
    Function: Given a dataset with text and labels - vectorise using scikit-learn
    methods and return feature vectors for downstream training and/or testing

    args:

        path_to_data: String -> the directory containing train.csv, valid.csv
        and test.csv
        text_col:  String -> the column of dataframe containing the raw text
        label_col: String -> the column of the dataframe containing the target/class
        label (cast to int64)
        max_features: Int -> maximum number of features i.e. vocab size for the
        vectorizer
        vectorizer: String -> the string identifier for the scikit learn vectorization
        method to use - options are "tfidf", "count" (alias "bow") or "pretrained"
        pretrained_vect_dir: String -> path to a saved (joblib) vectorizer; required
        when vectorizer is "pretrained", in which case it is loaded instead of
        fitting a new one
        save_vectorizer: Boolean -> whether or not to save the fitted vectorizer
        save_dir: String -> directory to save the fitted vectorizer in; required
        when save_vectorizer is True
        ngram_range: Tuple -> (low,max) range for the ngrams to be used when
        creating vectors
        lemmatize: Boolean -> whether or not to apply lemmatization when using
        spacy tokenizer

    returns:

        X_train_tf, y_train, X_valid_tf, y_valid, X_test_tf, y_test -> the
        vectorised texts and integer labels of the three splits

    raises:

        NotImplementedError -> unknown vectorizer identifier
        ValueError -> a path required by the chosen options was not provided

    """
    # validate the arguments up front so a bad combination fails before the
    # slow fit/transform work rather than after it
    supported_vectorizers = ("pretrained", "count", "bow", "tfidf")
    if vectorizer not in supported_vectorizers:
        raise NotImplementedError(
            f"Unknown vectorizer '{vectorizer}' - expected one of "
            f"{supported_vectorizers}"
        )
    if vectorizer == "pretrained" and pretrained_vect_dir is None:
        raise ValueError(
            "pretrained_vect_dir must be provided when vectorizer='pretrained'"
        )
    if save_vectorizer and save_dir is None:
        raise ValueError("save_dir must be provided when save_vectorizer=True")

    train_data = pd.read_csv(f"{path_to_data}/train.csv", index_col=None)
    # lets shuffle the data
    train_data = shuffle(train_data, random_state=42)
    valid_data = pd.read_csv(f"{path_to_data}/valid.csv", index_col=None)
    test_data = pd.read_csv(f"{path_to_data}/test.csv", index_col=None)

    # report size and class balance of each split; the counts follow `label_col`
    # (they used to be hard-wired to a column literally named "label")
    for split_name, split_data, trailer in (
        ("Train", train_data, "\n"),
        ("Validation", valid_data, "\n"),
        ("Test", test_data, ""),
    ):
        print(
            (
                f"{split_name} data shape: {split_data.shape} with label counts: "
                f"{split_data[label_col].value_counts()}{trailer}"
            )
        )

    # extract the text and label data
    X_train = train_data[text_col]
    y_train = train_data[label_col].astype("int64")
    X_valid = valid_data[text_col]
    y_valid = valid_data[label_col].astype("int64")
    X_test = test_data[text_col]
    y_test = test_data[label_col].astype("int64")

    # are we lemmatizing?
    if lemmatize:
        print("Will be lemmatizing - be warned this can remove a lot of data")
        tokenizer = spacy_tokenizer_lemmatize
    else:
        tokenizer = spacy_tokenizer

    # use pretrained vectorizer?
    if vectorizer == "pretrained":
        print(f"Loading pretrained vectorizer from: {pretrained_vect_dir}")
        # NOTE: joblib.load unpickles arbitrary objects - only load trusted files
        vect = joblib.load(pretrained_vect_dir)
    else:
        if vectorizer in ("count", "bow"):
            print(f"Using count vectorizer with {max_features} max features")
            vectorizer_cls = CountVectorizer
        else:
            print("Using tfidf vectorizer")
            vectorizer_cls = TfidfVectorizer
        vect = vectorizer_cls(
            max_features=max_features, tokenizer=tokenizer, ngram_range=ngram_range
        )
        # fit to train data only, so nothing leaks from validation/test
        vect.fit(X_train.values)

    # now transform the data
    print("Transforming training data!")
    X_train_tf = vect.transform(X_train.values)
    print("Transforming validation data!")
    X_valid_tf = vect.transform(X_valid.values)
    print("Transforming test data!")
    X_test_tf = vect.transform(X_test.values)

    # save vectorizer if desired
    if save_vectorizer:
        os.makedirs(save_dir, exist_ok=True)
        print(f"Saving vectorizer at: {save_dir}")
        joblib.dump(vect, f"{save_dir}/{vectorizer}.joblib")

    print(
        (
            f"Shape of vectorized data:\n\n Train:{X_train_tf.shape}\n"
            f"Valid:{X_valid_tf.shape}\nTest:{X_test_tf.shape}"
        )
    )
    return X_train_tf, y_train, X_valid_tf, y_valid, X_test_tf, y_test


def run_training(
    X_train_tf,
    y_train,
    X_valid_tf,
    y_valid,
    X_test_tf,
    y_test,
    save_dir,
    clf_name=None,
    save_model=True,
    run_grid=False,
):
    """
    Function: Fit a classifier on the vectorised training data and optionally
    save it

    args:

        X_train_tf: vectorised training features
        y_train: training labels
        X_valid_tf, y_valid, X_test_tf, y_test: accepted so the signature mirrors
        run_evaluation - they are not used during training
        save_dir: String -> directory the trained model is saved in
        clf_name: an unfitted estimator instance exposing fit(X, y) (despite the
        parameter name this is the estimator object, not a string)
        save_model: Boolean -> whether or not to save the trained model
        run_grid: Boolean -> when True and the estimator is a
        RandomForestClassifier, tune it with a 5-fold grid search first

    returns:

        (model, clf_name_str) -> the fitted estimator and its class name

    raises:

        ValueError -> no estimator was provided
    """
    if clf_name is None:
        raise ValueError("clf_name must be an estimator instance, got None")

    print("Running training!")

    # set up clf and fit/train
    model = clf_name
    # class name of the estimator, e.g. "RandomForestClassifier"
    clf_name_str = type(clf_name).__name__

    # do CV for RF
    if clf_name_str == "RandomForestClassifier" and run_grid:
        print("Running grid search to find optimal hyperparams for random forest")
        param_grid = {
            "n_estimators": [50, 200, 500],
            "max_features": ["sqrt", "log2"],
            "max_depth": [4, 6, 8, 10, 15, 20],
            "criterion": ["gini", "entropy"],
        }

        CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
        CV_rfc.fit(X_train_tf, y_train)
        best_params = CV_rfc.best_params_
        print(f"Best params for RF were: {best_params}")
        # GridSearchCV refits the winning configuration on the full training set
        # by default. Reusing that estimator keeps every setting that was not
        # searched (e.g. random_state), which building a fresh
        # RandomForestClassifier(**best_params) silently dropped, and avoids
        # fitting the same model a second time.
        model = CV_rfc.best_estimator_
    else:
        # now fit to training data
        model.fit(X_train_tf, y_train)

    # if save model
    if save_model:
        save_trained_model(model, clf_name_str=clf_name_str, save_dir=save_dir)

    return model, clf_name_str


def _binary_threshold_metrics(y_actual, y_prob, thresh):
    """Metrics obtained by cutting positive-class probabilities at ``thresh``.

    A sample is predicted positive when its probability is strictly greater than
    ``thresh`` and negative otherwise, so a probability exactly equal to the
    threshold is counted as a negative prediction instead of being dropped.
    Labels are expected to be 0 (negative) / 1 (positive). A ratio whose
    denominator is zero is reported as NaN.

    Returns a dict keyed by the metric names used in the printed report.
    """
    y_actual = np.asarray(y_actual)
    y_prob = np.asarray(y_prob)

    predicted_pos = y_prob > thresh
    predicted_neg = ~predicted_pos
    actual_pos = y_actual == 1
    actual_neg = y_actual == 0

    true_pos = np.sum(predicted_pos & actual_pos)
    true_neg = np.sum(predicted_neg & actual_neg)
    n_samples = len(y_actual)

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else float("nan")

    return {
        "accuracy": ratio(true_pos + true_neg, n_samples),
        "recall": ratio(true_pos, np.sum(actual_pos)),
        "precision": ratio(true_pos, np.sum(predicted_pos)),
        "specificity": ratio(true_neg, np.sum(actual_neg)),
        "class prevelance": ratio(np.sum(actual_pos), n_samples),
    }


def run_evaluation(
    model,
    X_train_tf,
    y_train,
    X_valid_tf,
    y_valid,
    X_test_tf,
    y_test,
    save_dir,
    threshold=0.5,
):
    """
    Function: Evaluate a fitted binary classifier on the train/valid/test splits,
    print the metrics and save the evaluation plots

    args:

        model: fitted classifier exposing predict and predict_proba; column 1 of
        predict_proba is used as the positive-class probability
        X_train_tf, X_valid_tf, X_test_tf: vectorised features of each split
        y_train, y_valid, y_test: labels of each split, expected to be 0/1
        save_dir: String -> directory the plots are written to (created if needed)
        threshold: Float -> probability above which a sample counts as positive
        for the accuracy/recall/precision/specificity figures

    Prints ROC AUC, accuracy, recall, precision, specificity and class prevalence
    for every split plus a classification report for the test split, and saves
    eval_auc_plot.png, eval_auprc_plot.png and test_confusion_matrix.png.
    """

    print("Running evaluation!")

    # labels and positive-class probabilities for every split, in report order
    splits = {
        "Train": (y_train, model.predict_proba(X_train_tf)[:, 1]),
        "Valid": (y_valid, model.predict_proba(X_valid_tf)[:, 1]),
        "Test": (y_test, model.predict_proba(X_test_tf)[:, 1]),
    }
    # hard predictions are only needed for the test report / confusion matrix
    y_test_preds = model.predict(X_test_tf)
    y_test_pred_probs = splits["Test"][1]

    # set up save dir
    os.makedirs(save_dir, exist_ok=True)

    target_names = ["low-severity", "high-severity"]

    roc_points = {}
    auc_scores = {}
    threshold_metrics = {}
    for split_name, (y_true, y_prob) in splits.items():
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        roc_points[split_name] = (fpr, tpr)
        auc_scores[split_name] = roc_auc_score(y_true, y_prob)
        threshold_metrics[split_name] = _binary_threshold_metrics(
            y_true, y_prob, threshold
        )

    for split_name, score in auc_scores.items():
        print(f"{split_name} AUC: {score}")

    # one group of lines per metric, each listing Train / Valid / Test
    for metric_name in (
        "accuracy",
        "recall",
        "precision",
        "specificity",
        "class prevelance",
    ):
        for split_name in splits:
            value = threshold_metrics[split_name][metric_name]
            print(f"{split_name} {metric_name}: {value}")

    # make plots

    line_styles = {"Train": "r-", "Valid": "b-", "Test": "g-"}
    plt.figure(figsize=(12, 8))
    for split_name, (fpr, tpr) in roc_points.items():
        # label each curve with its own split (all three used to say "Train AUC")
        plt.plot(
            fpr,
            tpr,
            line_styles[split_name],
            label=f"{split_name} AUC: {auc_scores[split_name]:.2f}",
        )
    plt.plot([0, 1], [0, 1], "-k")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.suptitle("Evaluation - AUC ROC")
    plt.legend()
    plt.savefig(f"{save_dir}/eval_auc_plot.png")
    plt.show()

    # now for precision-recall curve for test data
    precision, recall, _ = precision_recall_curve(y_test, y_test_pred_probs)
    area = auc(recall, precision)

    # a single figure: the extra plt.figure(2) call only left an empty figure behind
    plt.figure(figsize=(12, 8))
    plt.step(recall, precision, color="b", alpha=0.2, where="post")
    # `step` is passed directly rather than probing fill_between's signature
    plt.fill_between(recall, precision, alpha=0.2, color="b", step="post")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f"Precision-recall curve: AUC = {area}")
    plt.savefig(f"{save_dir}/eval_auprc_plot.png")
    plt.show()

    print("Test classification report\n")
    print(classification_report(y_test, y_test_preds, target_names=target_names))

    # row-normalised confusion matrix: each row (true class) sums to 1
    cm = confusion_matrix(y_test, y_test_preds, normalize="true")
    df_cm = pd.DataFrame(cm, target_names, target_names)
    plt.figure(figsize=(6, 6))
    plt.suptitle("Low severity vs high severity reports")
    sns.heatmap(df_cm, annot=True, cmap="Blues")
    plt.savefig(f"{save_dir}/test_confusion_matrix.png")
    plt.show()

In [None]:
# set data dir and save dirs
# NOTE: both values are placeholders and must be replaced with real paths before
# the cells below are run. `data_dir` is passed to setup_vectorised_data as
# path_to_data; `save_dir` is the root under which the models/ and results/
# folders are built.
data_dir = "<INSERT DATA DIRECTORY>"
save_dir = "<INSERT SAVE DIRECTORY>"

## Experiment 1 - Random Forest - default values


In [None]:
# set up arguments

do_training = True
do_evaluation = True
load_model = False
run_grid = False
load_vect = False
save_vect = True
save_model = True
lemmatize = False
max_features = 50000
ngram_range = (1, 2)
vectorizer = "tfidf"
vectorizer_to_load = None

# assert vectorizer is not both loaded and saved
# (the previous `!=` check also rejected "neither load nor save", which is valid)
assert not (
    load_vect and save_vect
), "if a vecotrizer is being loaded - do not want to save it again"
# assert load model is not true when train is also true
assert not (
    load_model and do_training
), "if loading a clf model - do not want to train it again?"
# evaluation needs a model from one of the two sources
assert (
    not do_evaluation or do_training or load_model
), "evaluation requires a model - enable do_training or load_model"

if load_vect:
    vectorizer = "pretrained"
    vectorizer_to_load = None  # put path to saved vectorizer

# set up save directories: lemmatized runs get their own folder
lemma_suffix = "_lemmatized" if lemmatize else ""
model_dir = f"{save_dir}/models/{vectorizer}_features_{max_features}{lemma_suffix}/"

# set up vectorized data
X_train_tf, y_train, X_valid_tf, y_valid, X_test_tf, y_test = setup_vectorised_data(
    path_to_data=data_dir,
    text_col="text",
    label_col="label",
    max_features=max_features,
    vectorizer=vectorizer,
    save_dir=model_dir,
    pretrained_vect_dir=vectorizer_to_load,
    save_vectorizer=save_vect,
    ngram_range=ngram_range,
    lemmatize=lemmatize,
)

### ***IMPORTANT*** - if you have already created the train/valid/test vectors, just rename the model and run the cell below to get new train/test results

In [None]:
clf_name = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
# class name of the estimator; names the results folder even if training is skipped
clf_name_str = type(clf_name).__name__

# if we are doing training
if do_training:
    model, clf_name_str = run_training(
        X_train_tf,
        y_train,
        X_valid_tf,
        y_valid,
        X_test_tf,
        y_test,
        save_dir=model_dir,
        clf_name=clf_name,
        save_model=save_model,  # honour the flag from the arguments cell
        run_grid=run_grid,
    )

# set up save directories
# the classifier name is interpolated now; the folder used to be literally
# called "clf_name_str" for every model
lemma_suffix = "_lemmatized" if lemmatize else ""
results_dir = (
    f"{save_dir}/results/{vectorizer}_features_{max_features}"
    f"{lemma_suffix}/{clf_name_str}/"
)

if load_model:
    # NOTE(review): nothing is loaded here yet - `model` must already exist
    # for the evaluation below to work
    print(f"Loading model from:...")

if do_evaluation:
    run_evaluation(
        model,
        X_train_tf,
        y_train,
        X_valid_tf,
        y_valid,
        X_test_tf,
        y_test,
        save_dir=results_dir,
    )

In [None]:
### Experiment 2 - Random forest with grid search
# set up arguments

# do_training = True
# do_evaluation = True
# load_model = False
# run_grid = True
# load_vect = False
# save_vect = True
# save_model = True
# lemmatize = False
# max_features = 50000
# ngram_range = (1,2)
# vectorizer = "tfidf"
# vectorizer_to_load = None
# # assert vectorizer is not both loaded and saved
# assert load_vect != save_vect,
# # "if a vectorizer is being loaded - do not want to save it again"
# # assert load model is not true when train is also true
# assert load_model != do_training, "if loading a clf model - do not want to train it again?"

# if load_vect:
#     vectorizer = "pretrained"
#     vectorizer_to_load = None # put path to saved vectorizer

# # set up save directories
# if lemmatize:
#     model_dir = f"{save_dir}/models/{vectorizer}_features_{max_features}_lemmatized/"

# else:
#     model_dir = f"{save_dir}/models/{vectorizer}_features_{max_features}/"

# # set up vectorized data
# (
#   X_train_tf,
#   y_train,
#   X_valid_tf,
#   y_valid,
#   X_test_tf,
#   y_test
# ) = setup_vectorised_data(
#       path_to_data=data_dir,
#       text_col= 'text',
#       label_col='label',
#       max_features=max_features,
#       vectorizer=vectorizer,
#       save_dir= model_dir,
#       pretrained_vect_dir=vectorizer_to_load,
#       save_vectorizer=save_vect,
#       ngram_range=ngram_range,
#       lemmatize=lemmatize
# )

In [None]:
run_grid = True
clf_name = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
# class name of the estimator; names the results folder even if training is skipped
clf_name_str = type(clf_name).__name__

# if we are doing training
if do_training:
    model, clf_name_str = run_training(
        X_train_tf,
        y_train,
        X_valid_tf,
        y_valid,
        X_test_tf,
        y_test,
        save_dir=model_dir,
        clf_name=clf_name,
        save_model=save_model,  # honour the flag from the arguments cell
        run_grid=run_grid,
    )

# set up save directories
# the classifier name is interpolated now (the folder used to be literally
# called "clf_name_str"); the "_grid" suffix keeps these plots from overwriting
# those of the default-parameter experiment
lemma_suffix = "_lemmatized" if lemmatize else ""
grid_suffix = "_grid" if run_grid else ""
results_dir = (
    f"{save_dir}/results/{vectorizer}_features_{max_features}"
    f"{lemma_suffix}/{clf_name_str}{grid_suffix}/"
)

if load_model:
    # NOTE(review): nothing is loaded here yet - `model` must already exist
    # for the evaluation below to work
    print(f"Loading model from:...")

if do_evaluation:
    run_evaluation(
        model,
        X_train_tf,
        y_train,
        X_valid_tf,
        y_valid,
        X_test_tf,
        y_test,
        save_dir=results_dir,
    )