# Supervised Learning
Processing data and training both traditional machine learning models and deep learning (DL) models.

In [None]:
# For local reproduction, please make sure to install pytorch with cuda compiled to enable GPU support.
# https://pytorch.org/get-started/locally/

In [None]:
# Install huggingface transformers for BERT
!pip install transformers

In [None]:
# Install fastai which is used for LSTM
# Jupyter has issues with installing fastai. We suggest to go outside of jupyter and "pip install fastai".
# This will have a better chance of installing fastai v2 which we need.

# Note: Fastai may require updating if used in colab.
# Fastai doesn't work well unless using conda or linux. We suggest running the LSTM portion in colab.
# Check the fastai documentation for installation instructions if any issue occurs: https://docs.fast.ai/
# You can uncomment the code below to update it. Make sure the version is 2+.
# ! [ -e /content ] && pip install -Uqq fastai
import fastai
fastai.__version__

In [3]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# For supervised models
from sklearn.model_selection import StratifiedKFold #stratified helps keep our dataset balanced
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# For pycaret model tests
from pycaret.classification import *

# For DL models
# Based on data from pytorch and hugging face tutorials
# https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python
from fastai.text.all import *
from fastai.tabular.all import *
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import random

ModuleNotFoundError: No module named 'pycaret'

In [2]:
# Function copied from online tutorial to help reproducibility.
# https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python

def set_seed(seed: int):
    """
    Seed every random number generator this notebook relies on.

    Python's ``random`` module and NumPy are always seeded. PyTorch (CPU and
    all CUDA devices) is seeded only when ``is_torch_available()`` is true.

    Args:
        seed (int): The seed value handed to each generator.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    if not is_torch_available():
        return

    torch.manual_seed(seed)
    # Seeding the CUDA generators is harmless on machines without a GPU.
    torch.cuda.manual_seed_all(seed)

set_seed(1)

NameError: name 'random' is not defined

In [None]:
# If running our code from Google Drive, you can connect to drive to load files quickly.
# Uncomment the code below to run it. You may have to enter an authentication code.
# Be aware that Google Colab often will fail due to memory requirements.

# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

## Functions

In [None]:
def get_imputed_array(df,from_save_path=None,save_to_path=None):
    """
    Return the feature data with missing values filled in by a KNN imputer.

    When a cached array is available on disk it is loaded instead of being
    recomputed, in which case df is not used at all.

    Args:
    df -- The original X data dataframe
    from_save_path -- Str path of a previously saved imputed array. If given, it is loaded and returned.
    save_to_path -- Str path where a freshly imputed array is saved. Only used when imputing.

    Returns:
    np.array -- Numpy array of imputed X data

    """
    # A cached array short-circuits the imputation entirely.
    if from_save_path:
        return np.load(from_save_path)

    imputer = KNNImputer(missing_values=np.nan, add_indicator=False)
    imputed = imputer.fit_transform(df.to_numpy())

    if save_to_path:
        with open(save_to_path, 'wb') as out_file:
            np.save(out_file, imputed)

    return imputed
    
def get_normalized_data(df, path,load=True):
    """
    Obtain the imputed feature array and standardize it for traditional machine learning models.

    Args:
    df -- df passed on to get_imputed_array.
    path -- path of the imputed array on disk.
    load -- boolean value. If True, the imputed array is read from path.
            If False, it is computed from df and then saved to path.

    Returns:
    np.array -- Standardized & imputed numpy array of numerical features

    """
    if load:
        # Reuse the array that was saved by an earlier run.
        source_path, target_path = path, None
    else:
        # Nothing to load: compute the array (resource intensive) and save it.
        source_path, target_path = None, path

    imputed = get_imputed_array(df, from_save_path=source_path, save_to_path=target_path)

    return StandardScaler().fit_transform(imputed)

def add_full_sentence_embedding(df,features,path="./pickles/X_train_s_imputed.npy",load=False):
    """
    Expands the sentence embedding into one column per dimension, imputes the data
    and returns the standardized array.

    Args:
    df -- main df to use. Must have a "sentence_embed" column holding one equal-length sequence per row.
    features -- list of relevant feature column names
    path -- path of the cached imputed array. Read when load is True; otherwise the
            freshly imputed array is saved there. Saves resources if running multiple times.
    load -- boolean value. If True, skip the imputation and load the imputed array from path.

    Returns:
    np.array -- Standardized and imputed numpy array: the feature columns followed by one
                column per embedding dimension (50 for the embedding used in this project).
    """
    # If load is set to true, load the imputed array from storage.
    if load:
        with open(path, 'rb') as f:
            X_imputed_s_embed = np.load(f)
    else:
        # Creating separate columns for our sentence embedding column.
        # Reusing df's index keeps the concat below row-aligned even when df
        # does not carry a default 0..n-1 index.
        sentence_embed_df = pd.DataFrame(df["sentence_embed"].to_list(), index=df.index)

        # One name per embedding dimension, taken from the data rather than hard-coded.
        embed_col_nums = ["Embed-"+str(i) for i in range(sentence_embed_df.shape[1])]
        sentence_embed_df.columns = embed_col_nums

        # Concat the sentence embed df with the original train df to add the columns back in
        train_df_concat = pd.concat([df, sentence_embed_df], axis=1)

        combined_cols = list(features) + embed_col_nums

        # Save the imputed array to `path` so that a later call can use load=True.
        X_imputed_s_embed = get_imputed_array(train_df_concat[combined_cols],save_to_path=path)

    X_normalized_s_embed = StandardScaler().fit_transform(X_imputed_s_embed)

    return X_normalized_s_embed


def add_kmeans_label(arr):
    """
    Fit a two-cluster KMeans model on the array and append each row's cluster label as a feature.

    Args:
    arr -- array to fit the kmeans model on and to extend with the labels

    Returns:
    np.array -- Copy of arr with the kmeans label added as the last column
    """
    cluster_model = KMeans(n_clusters = 2,  random_state = 42)
    cluster_model.fit(arr)

    # Labels come back one per row; turn them into a column before appending.
    label_column = np.array(cluster_model.labels_).reshape(-1,1)
    return np.append(arr, label_column, 1)

def get_sample(X,y,num_samples=10000):
    """
    Create samples which are useful for early tests and models that have resource limitations

    Features and labels are joined into one array before sampling so that each
    sampled row keeps its own label. As a consequence y_sample has the dtype of
    that combined array.

    Args:
    X -- training data features (2D array)
    y -- training data labels, one per row of X
    num_samples -- number of samples to take from the full dataset

    Returns:
    tuple -- tuple of X_sample array and y_sample array
    """
    # Accept any array-like labels (list, Series, ndarray).
    y = np.asarray(y)
    to_sample = np.append(X,y.reshape(-1,1),axis=1)

    # Stratify on the labels that were passed in, not on a notebook-level
    # variable, so the function also works for data other than y_train.
    small_sample = resample(to_sample,n_samples=num_samples,stratify=y,random_state=42)

    X_sample = small_sample[:,:-1]
    y_sample = small_sample[:,-1]
    return X_sample,y_sample

def get_rf_feature_importances(X,y,features,n_estimators=100,max_depth=100,random_state=42,class_weight="balanced"):
    """
    Train a random forest classifier, rank its feature importances and chart them.

    Args:
    X -- Training data features
    y -- Training labels to predict
    features -- list of feature names, in the same order as the columns of X
    n_estimators -- number of estimators for the random forest
    max_depth -- max depth of the random forest
    random_state -- int to set the random state value to help reduce randomization in results
    class_weight -- str for setting the class weight of the random forest

    Returns:
    clf_rf -- fitted random forest classifier
    features_df -- dataframe of feature importances, sorted from least to most important
    feature_chart -- feature importances bar chart (also displayed)

    """
    clf_rf = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    random_state=random_state,
                                    class_weight=class_weight)
    clf_rf.fit(X,y)

    # argsort is ascending, so the most important feature ends up in the last row.
    importances = clf_rf.feature_importances_
    ascending_order = importances.argsort()

    features_df = pd.DataFrame({
        "feature": [str(features[idx]) for idx in ascending_order],
        "importance": importances[ascending_order],
    })

    # 20px per feature, but never shorter than 600px, keeps the chart readable.
    chart_height = max(600, len(features)*20)

    feature_chart = px.bar(features_df,
                           x='importance',
                           y='feature',
                           title="Random Forest Feature Importance",
                           color="importance",
                           height=chart_height)
    feature_chart.show()

    return clf_rf,features_df, feature_chart

def plot_feature_accuracy(feature_importances,X_train,y_train,X_test,y_test,steps=1):
    """
    Retrains a Random Forest on a growing set of top features and tracks its test accuracy.

    The last rows of feature_importances are treated as the most important
    features, which matches the ascending order produced by
    get_rf_feature_importances.

    Args:
    feature_importances -- dataframe of feature importances with a "feature" column
    X_train -- Feature training values (dataframe). Used to train the Random Forest.
    y_train -- Labels of training data. Used to train the Random Forest.
    X_test -- Feature test values (dataframe). Used to evaluate the Random Forest.
    y_test -- Labels of test data. Used to evaluate the Random Forest.
    steps -- stride over the number of features: models are trained with 1, 1+steps, 1+2*steps, ... features.

    Returns:
    acc_df -- dataframe of accuracy scores by the number of features used in training the Random Forest.
    fig -- Plotly line chart of accuracy as number of features increases.
    """
    accuracy_dict = {}

    for offset in tqdm(range(0,len(feature_importances),steps)):
        feat_num = offset + 1

        # Names of the feat_num highest-ranked features.
        top_features = feature_importances[-feat_num:]["feature"]

        forest = RandomForestClassifier(n_estimators=100,max_depth=100,random_state=42)
        forest.fit(X_train[top_features], y_train)

        score = forest.score(X_test[top_features], y_test)
        accuracy_dict[feat_num] = score
        print(f"Accuracy for {feat_num} features: {score}")

    acc_df = pd.DataFrame({"Num Features":accuracy_dict.keys(),"Accuracy":accuracy_dict.values()})
    fig = px.line(acc_df,
                  x="Num Features",
                  y="Accuracy",
                  title="Random Forest accuracy by number of features.<br>Top features first.",
                  width=600,
                  height=500)
    fig.show()

    return acc_df, fig

def combine_text_features(df,features):
    """
    Used for BERT models.
    Builds a single text column from the given columns, joined by the
    [SEP] token which is understood by BERT.

    Args:
    df -- main dataframe
    features -- list of columns to concatenate / combine, in output order

    Returns:
    df -- rounded copy of the dataframe with the feature columns converted to
          strings and a "combined_text" column added
    """

    # Rounding makes the numbers less unique, which suits the token representation better.
    # round() returns a new frame, so the caller's dataframe is left untouched.
    out = df.round(2)

    # The selected columns are stored as strings so they can be joined as text.
    as_text = out[features].astype(str)
    out[features] = as_text

    # One row of text per sample, columns separated by BERT's [SEP] token.
    out["combined_text"] = as_text.agg(" [SEP] ".join,axis=1)

    return out
    
    
class ComplexityDataset(torch.utils.data.Dataset):
    """
    Torch dataset that pairs tokenizer encodings with their complexity labels.
    From: https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-sample values; labels: one label per sample.
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field as a tensor, plus the label.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample
    
def train_model_dl(df,training_args,model_path,model_name,label_names=["0","1"],test_size=0.05,max_length=64,bert=True):
    """
    Wrapper function to process a df and train a deep learning model.
    Trains with the HuggingFace Trainer, evaluates on the held-out split and
    saves the model and tokenizer at the end.

    Based on HuggingFace tutorials as well as:
    https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python

    Args:
    df -- main dataframe of data. Should include combined_text and label columns.
    training_args -- training argument object to pass to the trainer object in huggingface
    model_path -- path to save the model and tokenizer after training
    model_name -- name of the pretrained model / tokenizer to load.
    label_names -- our label names / values. In our case, 0 & 1. Its length sets the number of output labels.
    test_size -- size of test data to evaluate on. Default is 5% but also used 10% in practice.
    max_length -- maximum sequence length of the BERT model. Default is 64. Lower values produces less tokens and will truncate sentences.
    bert -- boolean value. If set to true, uses the BERT-specific tokenizer and model classes.

    Returns:
    model object -- HuggingFace pretrained model that has been fine-tuned on our dataset.
    """

    # Train test split.
    X_train, X_test, y_train, y_test = train_test_split(df["combined_text"],
                                                        df["label"],
                                                        stratify=df["label"], test_size=test_size, random_state=42)

    # The deep learning models need a list, not a series.
    X_train = X_train.tolist()
    X_test = X_test.tolist()
    y_train = y_train.tolist()
    y_test = y_test.tolist()

    # While the autotokenizer is fine, we can specify for BERT.
    if bert:
        tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Some tokenizers ship without a padding token, and padding=True fails for
    # them. Check for that case explicitly instead of swallowing every
    # exception, so unrelated tokenizer errors are no longer hidden.
    added_pad_token = tokenizer.pad_token is None
    if added_pad_token:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    # convert our tokenized data into a torch Dataset
    train_dataset = ComplexityDataset(train_encodings, y_train)
    valid_dataset = ComplexityDataset(valid_encodings, y_test)

    # Init model. Both model families get the same number of output labels.
    num_labels = len(label_names)
    if bert:
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    if added_pad_token:
        # The new token needs an embedding row, and the model must know its id.
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id

    # Use the GPU when one is present; fall back to CPU instead of crashing.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    trainer = Trainer(
    model=model,                         # Transformers model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    )

    trainer.train()

    trainer.evaluate()

    # Save the tokenizer alongside the model so both can be reloaded from model_path.
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    return model

def compute_metrics(pred):
    """
    Metric callback used by the HuggingFace Trainer while training BERT models.

    Based on tutorial from: https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python
    Customized to add F1 score

    Args:
    pred -- prediction object exposing label_ids (true labels) and predictions (per-class scores)

    Returns:
    dict -- dict with the 'accuracy' and 'f1' scores
    """
    true_labels = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    predicted_labels = pred.predictions.argmax(-1)

    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'f1': f1_score(true_labels, predicted_labels),
    }

def get_prediction(text,model,tokenizer=None,max_length=None):
    """
    Function for computing predictions using our deep learning model
    Based on tutorial: https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python
    Customized to export prediction probability.

    Args:
    text -- text to run through our model to predict a label
    model -- model to use for prediction task
    tokenizer -- tokenizer matching the model. If omitted, a notebook-level
                 variable named `tokenizer` is used when one exists.
    max_length -- maximum sequence length for tokenization. If omitted, a
                  notebook-level `max_length` is used when defined, otherwise 64.

    Returns:
    int -- predicted label (0 or 1)
    float -- probability score of label that was predicted

    Raises:
    ValueError -- if no tokenizer is passed and none is defined at notebook level
    """

    # The tokenizer built inside train_model_dl is local to that function, so
    # accept it as an argument and only fall back to notebook-level variables.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise ValueError("get_prediction needs a tokenizer: pass tokenizer=... "
                             "or define a notebook-level `tokenizer`.")
    if max_length is None:
        max_length = globals().get("max_length", 64)

    # Put the inputs on the model's device rather than assuming a GPU.
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)

    # perform inference to our model; no gradients are needed for prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)

    # executing argmax function to get the candidate label
    target_names = [0,1]
    return target_names[int(probs.argmax())],float(probs.max())

def dl_model_predict(model_path,model_name,bert=True):
    """
    Wrapper function to load a saved DL model with its tokenizer and wrap both
    in a text-classification pipeline.
    Based on Huggingface tutorials.

    Args:
    model_path -- local path to load the fine-tuned model from
    model_name -- model to load. Used to initialize the tokenizer.
    bert -- boolean to use the BERT specific tokenizer / model classes. Default is True.

    Returns:
    huggingface pipeline object -- classifier built on huggingface pipeline object. Tokenizer + Model.
    """
    # `pipeline` is not among the notebook's top-level transformers imports,
    # so bring it into scope here.
    from transformers import pipeline

    if bert:
        tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
        model = BertForSequenceClassification.from_pretrained(model_path,local_files_only=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_path,local_files_only=True)

    # The model is loaded from local files only; no device is selected here.
    classifier = pipeline(model=model, tokenizer=tokenizer,task="text-classification")

    return classifier

def predict_batch(string_list,batch_size=64,classifier=None):
    """
    Takes in a list of strings and predicts the labels using our model. Sometimes faster than pandas apply.
    Batch helper code from stackoverflow
    https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks

    Args:
    string_list -- list of strings to predict labels for.
    batch_size -- size of batch. Higher number can be faster but more memory intensive. Default is 64.
    classifier -- callable that maps a list of strings to a list of predictions
                  (e.g. the pipeline returned by dl_model_predict). If omitted,
                  a notebook-level variable named `bert_clas` is used.

    Returns:
    list -- list of predictions containing one entry for each string in the string list.

    Raises:
    ValueError -- if no classifier is passed and `bert_clas` is not defined
    """
    if classifier is None:
        classifier = globals().get("bert_clas")
        if classifier is None:
            raise ValueError("predict_batch needs a classifier: pass classifier=... "
                             "or define a notebook-level `bert_clas`.")

    def batch(iterable, n=1):
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]

    pred_list = []

    # A generator has no length, so tell tqdm how many batches to expect.
    num_batches = (len(string_list) + batch_size - 1) // batch_size

    # Iterate over the strings that were passed in, not a notebook-level variable.
    for x in tqdm(batch(string_list, batch_size), total=num_batches):
        pred_list.extend(classifier(x))

    return pred_list

## Get Data & Process It

We can load our imputed data or regular dataframes from the previous stage of our pipeline.

In [None]:
# Numerical features to be used in certain models
features = ["automated_readability_index",
            "coleman_liau_index",
            "flesch_kincaid_grade_level",
            "flesch_reading_ease",
            "gunning_fog_index",
            "lix",
            "perspicuity_index",
            "smog_index",
            "Passage Sum",
            "Dale Chall Sum",
            "Dale Chall Percent",
            "SAT Sum",
            "SAT Percent",
            "AoA_Freq",
            "AoA_Mean_Age",
            "Conc.M",
            "Percent_known",
            "Average_Embed",
            "TTR",
            "Sqrd_AoA_Mean_Age",
            "Max_AoA_Age",
            "Min_AoA_Age",
            "Pronoun Count",
            "Pronoun Percent",
            "commune_matched",
            "football_matched",
            "LRB_RRB_matched"
]

In [None]:
# Load main train df
# NOTE: pickle.load can execute arbitrary code, so only load pickles that were
# produced by this project's own pipeline.
# The context manager closes the file handle once the dataframe is loaded.
with open('pickles/train_df.pkl','rb') as train_df_file:
    train_df = pickle.load(train_df_file)
print(train_df.shape)

In [None]:
# Training labels as a NumPy array, taken from the "label" column of train_df.
y_train = train_df["label"].to_numpy()

In [None]:
# Path of the saved imputed feature array. get_normalized_data reads it because `load` defaults to True.
imputed_path = "./pickles/X_imputed.npy"
# Standardize the imputed features, then append the KMeans cluster label as an extra column.
X_norm_kmeans = add_kmeans_label(get_normalized_data(train_df[features],imputed_path))

In [None]:
# Smaller sample of the data (get_sample defaults to 10,000 rows), used by the SVM experiments below.
X_sample,y_sample = get_sample(X_norm_kmeans,y_train)

In [None]:
# For testing the full 50 dimensional sentence embedding
# Warning: Can be very resource intensive for only 1% improvement in accuracy.
X_normalized_s_embed = add_full_sentence_embedding(train_df,features,path="./pickles/X_train_s_imputed.npy",load=True)

## Supervised Models - Traditional
We trained multiple models to check how they performed. Most of the code here is based on scikit-learn tutorials or lecture material.

### Dummy Classifier

In [None]:
# Starting with a dummy classifier. Code is based on sklearn documentation.
# Baseline that always predicts the most frequent class.
clf_dummy = DummyClassifier(strategy="most_frequent")
# cv=10: ten cross-validation folds over the full feature matrix (KMeans label column included).
scores_dummy = cross_val_score(clf_dummy,X_norm_kmeans,y_train,cv=10)

# Report the mean and standard deviation of the per-fold scores.
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_dummy.mean(), scores_dummy.std()))

### SVM

In [None]:
# linear kernel
clf_svm = svm.SVC(kernel='linear', C=1, random_state=42)
# Cross-validated on the X_sample / y_sample subset rather than the full data.
scores_svm = cross_val_score(clf_svm,X_sample,y_sample,cv=10)

# Report the mean and standard deviation of the per-fold scores.
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_svm.mean(), scores_svm.std()))

In [None]:
# rbf kernel
# Note: clf_svm and scores_svm are reused, replacing the values from the previous SVM cell.
clf_svm = svm.SVC(kernel='rbf', C=1, random_state=42)
# Cross-validated on the X_sample / y_sample subset rather than the full data.
scores_svm = cross_val_score(clf_svm,X_sample,y_sample,cv=10)

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_svm.mean(), scores_svm.std()))

In [None]:
# poly kernel
# Note: clf_svm and scores_svm are reused, replacing the values from the previous SVM cell.
clf_svm = svm.SVC(kernel='poly', C=1, random_state=42)
# Cross-validated on the X_sample / y_sample subset rather than the full data.
scores_svm = cross_val_score(clf_svm,X_sample,y_sample,cv=10)

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_svm.mean(), scores_svm.std()))

### Logistic Regression

In [None]:
%%time

clf_logreg = LogisticRegression(random_state=42)
# Ten-fold cross-validation on the full standardized feature matrix (KMeans label column included).
scores_logreg = cross_val_score(clf_logreg,X_norm_kmeans,y_train,cv=10)

# Report the mean and standard deviation of the per-fold scores.
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_logreg.mean(), scores_logreg.std()))

### Random Forest

In [None]:
%%time
# RF cross val with normalized training data and kmeans label added
# Same forest settings as the defaults of get_rf_feature_importances.
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=42, class_weight="balanced")
# Ten-fold cross-validation on the full standardized feature matrix (KMeans label column included).
scores_rf_kmeans= cross_val_score(clf_rf,X_norm_kmeans,y_train,cv=10)

# Report the mean and standard deviation of the per-fold scores.
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_rf_kmeans.mean(), scores_rf_kmeans.std()))

In [None]:
%%time
# This will run the Random Forest on the data with the 50 dimensional vectors. 
# Warning: It will take over an hour on most machines.

# RF cross val with normalized training data and kmeans label added + 50 dimensional sentence embedding
# clf_rf_50s = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=42, class_weight="balanced")
# scores_rf_kmeans_50s= cross_val_score(clf_rf_50s,X_normalized_s_embed,y_train,cv=10)

# print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_rf_kmeans_50s.mean(), scores_rf_kmeans_50s.std()))

In [None]:
%%time
# Generates feature importance chart using the Random Forest
# Column names of X_norm_kmeans: the numerical features followed by the appended KMeans label.
features_kmeans = [*features, "kmeans_label"]

# Fit the forest on the full data and chart the ranked importances.
(clf_rf_kmeans,
 clf_rf_kmeans_fidf,
 clf_rf_kmeans_chart) = get_rf_feature_importances(X_norm_kmeans, y_train, features=features_kmeans)

In [None]:
%%time
# Plot the accuracy score as more top features are added.
# Standard train test split used to generate data to help plot the accuracy over number of features.

# Create kmeans df which will be useful for plotting.
# Named columns let plot_feature_accuracy select features by name.
kmeans_df = pd.DataFrame(X_norm_kmeans,columns=features_kmeans)

# Hold out 10% of the rows (stratified on the label) for scoring.
X_train, X_test, y_train_split, y_test_split = train_test_split(kmeans_df,
                                                    y_train,
                                                    stratify=y_train, test_size=0.1, random_state=42)

acc_df,acc_plot = plot_feature_accuracy(clf_rf_kmeans_fidf,X_train,y_train_split,X_test, y_test_split)

Overall, our Random Forest was our best traditional machine learning model. We will see that it comes close to the deep learning models in terms of accuracy.

### PyCaret Model Evaluations
We used PyCaret as a secondary evaluation method. Code based on PyCaret tutorials.

In [None]:
# Load data for pycaret.
pycaret_features = features + ["label"]
pycaret_data = train_df[pycaret_features]

In [None]:
# Setup the classifier
# Note: Sometimes categorizes numerical features as categorical. This can be easily solved by filtering those out.
pycaret_classifier = setup(data = pycaret_data, target = 'label', session_id=123)

In [None]:
# Perform model evaluation.
best_model = compare_models()

PyCaret also landed on Random Forest being the best overall model.

## Supervised Models - Deep Learning

### LSTM

In [None]:
# Fastai code based on fastai tutorials. https://docs.fast.ai/tutorial.text.html
# Load dataloaders
dls = TextDataLoaders.from_df(train_df, text_col='original_text', label_col='label', valid_pct=0.1)

In [None]:
# Create learner object. This is a wrapper that handles training.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

In [None]:
# Fine tune LSTM.
# We set the epochs to 4 and our learning rate to 1e-2.
learn.fine_tune(4, 1e-2)

In [None]:
# For exporting the model to save progress
# learn.export("./pickles/fastai-lstm.pkl")

### MLP

In [None]:
# Used to rearrange our dataframes for getting data ready for the MLP

# The MLP uses the same numerical features as the traditional models plus the
# target column. Deriving the lists from `features` keeps them from drifting apart.
features_tab = features + ["label"]

# Columns passed to TabularPandas as categorical variables.
cat_names = ["commune_matched",
             "football_matched",
             "LRB_RRB_matched"
]

# Every remaining feature is passed as a continuous variable (order preserved).
cont_names = [name for name in features if name not in cat_names]

# fastai preprocessing steps applied by TabularPandas.
procs = [Categorify, FillMissing, Normalize]

In [None]:
# The MLP class had errors when the label was a number. 
# We replaced it with a boolean and went from 50% to 68% accuracy.
mlp_df = train_df[features_tab].copy()
# Assign the converted labels back to the column. Calling replace(..., inplace=True)
# on the column selection is chained assignment, which pandas warns about and
# which does not update mlp_df under Copy-on-Write.
mlp_df["label"] = mlp_df["label"].replace({0: False, 1: True})
mlp_df.head()

In [None]:
# Creates splits
splits = RandomSplitter(valid_pct=0.1)(range_of(mlp_df))

In [None]:
# Create tabular object and dataloaders
to = TabularPandas(mlp_df, procs=procs,
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names='label',
                   splits=splits)

dls = to.dataloaders(bs=64)

In [None]:
# Create tabular learner
learn_tab = tabular_learner(dls, metrics=accuracy)

In [None]:
# Fastai has a useful function to estimate the best learning rate to use.
learn_tab.lr_find()

In [None]:
# Fit the data. We are not fine-tuning here. We are learning from our training data alone.
learn_tab.fit_one_cycle(5,1e-3)

### Transformer (BERT)
We fine-tuned a pre-trained BERT model from HuggingFace which gave us our best accuracy score.
We tried several variants, including regular BERT, DistilBERT, and RoBERTa.

In [None]:
# The raw text comes first, followed by the same numerical features used by the
# other models. Deriving the list from `features` avoids a second hand-maintained copy.
features_bert = ["original_text"] + features

In [None]:
# Concat features together with original text to create a combined_text column
combined_df = combine_text_features(train_df,features_bert)

In [None]:
# We have written helpful wrapper functions in the above function section to make it easier to test and train models.
# Most of the code is inspired from two sources:
# HuggingFace Tutorials: https://huggingface.co/transformers/training.html
# PythonCode Tutorial: https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python

# First we set up our model by providing it training arguments.
# This is where we can tune hyperparameters such as learning rate or weight decay.

# We will also provide paths to save our model checkpoints.
# Deep learning models take a long time to train, and the final result is often not the best.
# It is useful to have checkpoints to load the most appropriate model for our task.
# We are setting up the model to track accuracy and F1 score and load the best model at the end based
# on accuracy score during the checkpoint evaluations

training_args = TrainingArguments(
    output_dir='./results/bert-base-uncased-64_5-wd0.1_lr5e-5-r42',          # output directory for checkpoints
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=600,                # number of warmup steps for learning rate scheduler
    learning_rate=5e-5,              # learning rate
    weight_decay=0.1,               # strength of weight decay
    logging_dir='./logs/bert-base-uncased-64_5-wd0.1_lr5e-5-r42',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training
    metric_for_best_model="accuracy", # pick the best checkpoint by the 'accuracy' value from compute_metrics, to match the Kaggle evaluation
    greater_is_better=True,          # a higher accuracy is better
    eval_steps=2000,                 # evaluate every 2000 steps
    save_steps=2000,                 # save a model checkpoint every 2000 steps
    evaluation_strategy="steps",     # evaluate on a step schedule, every `eval_steps` (this can also be set to every epoch)
)

# NOTE(review): the run name contains "64_5" while test_size below is 0.1 -- confirm the naming is intended.
m_path = "./models/bert-base-uncased-64_5-wd0.1_lr5e-5-r42"
m_name = "bert-base-uncased" # This is the model. More pretrained models can be found at https://huggingface.co/models
bert64_1 = train_model_dl(combined_df, # The df to use as input
                            training_args, # The arguments for the trainer
                            m_path, # The path where the trained model and tokenizer are saved
                            model_name=m_name, # Name of model.
                            label_names=["0","1"], # Label names or values. 0 and 1 for complexity labels.
                            test_size=0.1, # Size of the test set to evaluate on based on the training data
                            max_length=64, # Maximum sequence length. Longer inputs are truncated.
                            bert=True) # Use the BERT-specific tokenizer and model classes.

In [None]:
# Optionally restore a previously trained model. A checkpoint directory works as well.
# NOTE(review): this path refers to a different run directory than the `output_dir` used by
# the training cell in this notebook - point it at the run/checkpoint you actually want.
model_name = "bert-base-uncased"
model_path = "./results/bert-base-uncased-64_10-wd0.1_3r1e-5/checkpoint-10000"

# Build the classifier used for inference
bert_clas = dl_model_predict(
    model_path=model_path,
    model_name=model_name,
    bert=True,
)

In [None]:
# Load test df (with engineered features) and build its combined text column.
# The context manager closes the file handle as soon as the dataframe is loaded
# (a bare `pickle.load(open(...))` leaves the handle open).
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
with open('pickles/test_df_embed.pkl', 'rb') as test_pkl:
    test_df = pickle.load(test_pkl)
test_df = combine_text_features(test_df, features_bert)

In [None]:
%%time
tqdm.pandas() # Enables progress_apply which gives a loading bar for apply functions in pandas.
test_df["preds"] = test_df["original_text"].progress_apply(lambda t: bert_clas(t)) # Creates column of predictions and scores.

In [None]:
# Split each prediction into two columns:
#   pred_label - the final character of the prediction's "label" entry
#   pred_score - the prediction's "score" entry
test_df["pred_label"] = test_df["preds"].map(lambda pred: pred["label"][-1])
test_df["pred_score"] = test_df["preds"].map(lambda pred: pred["score"])

In [None]:
# Persist the test dataframe, now including the prediction columns, as a pickle
test_df.to_pickle("./pickles/test_df_preds.pkl")

In [None]:
# Write the predictions to a CSV shaped like the sample submission (columns: id, label)
pred_path = "" # Put your file path here where you want to save the csv

# Expose the dataframe index as an explicit ID column
test_df["ID"] = test_df.index

# Keep only the ID and predicted label, renamed to the sample submission's column names
preds = test_df[["ID", "pred_label"]].rename(columns={"ID": "id", "pred_label": "label"})

# index=False keeps the dataframe index out of the file so it matches the sample submission
preds.to_csv(pred_path, index=False)

### Visualizing Overfitting

In [None]:
# Metrics logged by one of our BERT models, one entry per 2000 training steps.
# Note: despite its name, `overfitting_df` is a plain dict of lists, not a DataFrame.

overfitting_df = {
    "Step": [2000,4000,6000,8000,10000,12000,14000,16000,18000],
    "Training Loss": [0.510100,0.486600,0.478400,0.415900,0.412200,0.407800,0.319000,0.314000,0.305900],
    "Validation Loss": [0.505885,0.496645,0.472032,0.481597,0.473282,0.486976,0.531029,0.560407,0.570793],
    "Accuracy": [0.740679,0.754307,0.763280,0.765872,0.769519,0.772206,0.767695,0.765248,0.767311],
    "F1": [0.775972,0.778028,0.777562,0.773502,0.778367,0.787159,0.776490,0.771295,0.776286]
}

# Draw both loss curves against the training step; each curve is labelled with its dict key
for curve in ("Training Loss", "Validation Loss"):
    plt.plot(overfitting_df["Step"], overfitting_df[curve], label=curve)

plt.suptitle("BERT Overfitting after 8K Steps")
plt.title("Training Loss vs. Validation Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.legend()
plt.show()

## Failure Analysis

In [None]:
# We have prepared a sample failure analysis dataframe from one of our earlier models.
# This analysis inspired some features and ideas in our machine learning cycle.

# Uncomment below to load it
# compare_test_df = pickle.load(open('pickles/bert_compare_test_df.pkl','rb'))

# If you don't wish to load this then please run the code below

In [None]:
# Recreate the train/evaluation split for the failure analysis.
# `test_size` must equal the value given to the trainer when the model was trained, otherwise
# examples the model was trained on end up in the evaluation part (data leakage).
# NOTE(review): the split also depends on `random_state` - confirm the training helper uses 42.
# Each part is converted to a plain list because the deep learning models need a list, not a series.
X_train, X_test, y_train_fa, y_test_fa = (
    part.tolist()
    for part in train_test_split(
        combined_df["combined_text"],
        combined_df["label"],
        test_size=0.1,
        random_state=42,
        stratify=combined_df["label"],
    )
)

In [None]:
# Prepare test data to check for predicted vs real labels. bert64_1 is the BERT model we previously trained above.
# Note: The test data here is from the train test split, not the official Kaggle test set since that doesn't have labels.
# Note: "original_text" holds the X_test entries, which come from the combined_text column.

compare_test_df = pd.DataFrame({"original_text":X_test,"real_label":y_test_fa})
compare_test_df["pred"] = compare_test_df["original_text"].apply(lambda x: get_prediction(x,bert64_1))
# The "pred_label" column was never created, so the comparison below raised a KeyError.
# Assumes get_prediction returns (label, probability) - index 1 is already used as the
# probability below; TODO confirm index 0 is the predicted label.
compare_test_df["pred_label"] = compare_test_df["pred"].apply(lambda x: x[0])
compare_test_df["probability"] = compare_test_df["pred"].apply(lambda x: x[1])
# NOTE(review): pred_label and real_label must share a dtype (e.g. both int or both str),
# otherwise every row compares as False - verify against get_prediction's return type.
compare_test_df["correct"] = compare_test_df["pred_label"] == compare_test_df["real_label"]

In [None]:
# Split the evaluated rows into correct and incorrect predictions, each ordered from the
# highest to the lowest predicted probability, i.e. most confident predictions first.
# Confidently wrong predictions are the most informative ones for understanding what
# the model gets wrong and which situations are complex.

by_confidence = {"by": "probability", "ascending": False}
correct_test = compare_test_df.loc[compare_test_df["correct"].eq(True)].sort_values(**by_confidence)
incorrect_test = compare_test_df.loc[compare_test_df["correct"].eq(False)].sort_values(**by_confidence)

In [None]:
# Show the 10 most confident correct predictions.
# Column width is unlimited inside this block so pandas prints the full sentences.
with pd.option_context("display.max_colwidth", None):
    display(correct_test.head(10))

In [None]:
# Show the 50 most confident incorrect predictions.
# Column width is unlimited inside this block so pandas prints the full sentences.
with pd.option_context("display.max_colwidth", None):
    display(incorrect_test.head(50))

During our analysis, we noticed certain trends. Many of the incorrect but high-confidence predictions involved words like football or commune, or the LRB / RRB tags. This inspired some of our features and gave us insight into our topic modeling.