# Twitter and Reddit Sarcasm Detection Experimentation
## Authors: Mark Biegel and Youssef Othman
### CMSC 473 NLP Course Project

<br> This notebook contains experimentation with NLP featurizers and machine learning models to determine which combination is best for detecting sarcasm in social media posts from Twitter and Reddit datasets.
<br><br>*NOTE* Google Colab is required for running this notebook, as it uses the `google.colab.files` module, which is unavailable locally.

In [None]:
### Import Cell

# Colab import functionality
from google.colab import files

# Data processing tools
import pandas as pd
import numpy as np
import math

# Matplotlib
import matplotlib.pyplot as plt

# SKLEARN packages
import sklearn.feature_extraction.text as featExtract
import sklearn.preprocessing as prep

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn import metrics

In [None]:
# Silence the warnings scikit-learn forces on the user (unnecessary for the current experimentation)
import warnings


def warn(*args, **kwargs):
    """Stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# From here on every warnings.warn(...) call is routed to the no-op above
warnings.warn = warn

In [None]:
### Create the text file that collects the finalized model outputs

# Divider line written between the sections of the stats file
fileSectionBreakString = "-----------------------------------------------------------\n"

# Write mode truncates any earlier file, so each full run starts from an empty file.
# The handle is intentionally left open: the cell that records the first model's
# statistics writes through this same handle and closes it afterwards.
final_stats_file = open("finalStats.txt", mode='w')
final_stats_file.write(fileSectionBreakString)

### IMPORTANT - PLEASE READ THIS:
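The next cell reads in the data, so the Kaggle API key file (`kaggle_data_key.json`) needs to be uploaded when the output shows the "Choose file" prompt. Note that the cell moves a file named `kaggle.json` into `~/.kaggle/`, so the key file must have exactly that name when it is uploaded.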

In [None]:
### Reading data in

# Get the training and test set from the first source
!wget "https://raw.githubusercontent.com/AmirAbaskohi/SemEval2022-Task6-Sarcasm-Detection/main/Data/Main%20Dataset/Train_Dataset.csv"
!wget "https://raw.githubusercontent.com/AmirAbaskohi/SemEval2022-Task6-Sarcasm-Detection/main/Data/Test_Dataset.csv"

# Get the training set from the second source
!wget "https://raw.githubusercontent.com/surajr/SarcasmDetection/master/Data/sarcasm-dataset.txt"



## NOTE: Third and fourth dataset are downloaded from Kaggle online
  # ONLY NEED TO RUN ONCE TO GET FILES; DON'T THINK WE NEED IT FOR SUBMISSION
  
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

# Download and extract data files
!kaggle datasets download -d danofer/sarcasm
!unzip sarcasm.zip
!kaggle datasets download -d rmisra/news-headlines-dataset-for-sarcasm-detection
!unzip news-headlines-dataset-for-sarcasm-detection.zip


# Data Preprocessing
#### Train, dev, and test sets created

In [None]:
### Processing first dataset

# Column names every dataset is normalised to: the post text and its 0/1 sarcasm label
sarcasm_tweet_column = "tweet"
is_sarcasm_column = "sarcastic"

# Use pandas to load in the train and test files of the first source
train_set1_df = pd.read_csv('Train_Dataset.csv')
test_set1_df = pd.read_csv("Test_Dataset.csv")

# Keep only the text and label columns and drop rows where either one is missing
cleaned_set1_train_df = train_set1_df[[sarcasm_tweet_column, is_sarcasm_column]].dropna()
cleaned_set1_test_df = test_set1_df[[sarcasm_tweet_column, is_sarcasm_column]].dropna()

# Combine the source's train and test rows into the running combined dataframe.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
total_cleaned_df = pd.concat([cleaned_set1_train_df, cleaned_set1_test_df], ignore_index=True)

In [None]:
### Processing second dataset
  # The file is plain text rather than CSV. Each line is parsed by taking its
  # last character as the sarcasm label and everything except the final two
  # characters as the text.

# The with-block closes the file on exit, so no explicit close() is needed
with open('sarcasm-dataset.txt') as file:
  df = [line.strip("\n") for line in file]

# Blank lines (for example a trailing empty line) carry no label and are skipped;
# indexing their last character would raise an IndexError
second_set = [[line[:-2], int(line[-1])] for line in df if line]

cleaned_second_set_df = pd.DataFrame(second_set, columns = ['tweet', 'sarcastic']).dropna()

# pd.concat is used because DataFrame.append was removed in pandas 2.0
total_cleaned_df = pd.concat([total_cleaned_df, cleaned_second_set_df], ignore_index=True)

In [None]:
### Processing third dataset

third_set_df = pd.read_csv("train-balanced-sarcasm.csv")

# Keep the text and label columns, rename them to the shared column names,
# and drop rows where either value is missing
cleaned_set3_train_df = third_set_df[['comment', 'label']].rename(columns={'comment': 'tweet', 'label': 'sarcastic'}).dropna()

# Add the rows to the combined dataframe.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
total_cleaned_df = pd.concat([total_cleaned_df, cleaned_set3_train_df], ignore_index=True)

In [None]:
### Processing fourth dataset

# The file holds one JSON object per line, hence lines=True
fourth_set_df = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)

# Keep the text and label columns, rename them to the shared column names,
# and drop rows where either value is missing
cleaned_set4_train_df = fourth_set_df[['headline', 'is_sarcastic']].rename(columns={'headline': 'tweet', 'is_sarcastic': 'sarcastic'}).dropna()

# Add the rows to the combined dataframe.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
total_cleaned_df = pd.concat([total_cleaned_df, cleaned_set4_train_df], ignore_index=True)

In [None]:
# Printing out final compiled dataframe
# (a bare expression on the last line of a cell is rendered as the cell's output)
total_cleaned_df

In [None]:
# Create train, dev, and test sets
TRAIN_SET_RATIO = 0.8
DEV_SET_RATIO = 0.10
TEST_SET_RATIO = 0.10
TEST_SET_RATION = TEST_SET_RATIO  # original (misspelled) name kept as an alias
SPLIT_SEED = 0                    # fixed seed so the split is reproducible

# The combined dataframe is the individual sources stacked one after another, so
# slicing it in its stored order would make the splits follow source order instead
# of being a random sample. Shuffle the rows once before slicing.
shuffled_df = total_cleaned_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Positional boundaries: [0, train) -> train, [train, dev) -> dev, [dev, end) -> test.
# The test set takes whatever remains, so no row is lost to rounding.
total_rows = len(shuffled_df)
train_set_split_index = int(total_rows * TRAIN_SET_RATIO)
dev_set_split_index = int(train_set_split_index + (total_rows * DEV_SET_RATIO))

train = shuffled_df.iloc[:train_set_split_index]
dev = shuffled_df.iloc[train_set_split_index:dev_set_split_index]
test = shuffled_df.iloc[dev_set_split_index:]

print("The following values should be equal:", (len(train)+len(dev)+len(test)), "=", len(total_cleaned_df))
assert len(train) + len(dev) + len(test) == len(total_cleaned_df), "the three splits must cover every row exactly once"

# Vectorization/Featurization of Datasets

In [None]:
### Vectorization/Featurization methods
    # TODO: record the rationale for picking these vectorization/featurization methods
    # TODO: decide whether the hyperparameters of each method should be tuned

def _encode_splits(encoder, pick_input):
  """Fit `encoder` on the train split, then transform the dev and test splits.

  `pick_input` selects the encoder's input from a split's dataframe.
  Returns the train, dev and test encodings, in that order.
  """
  return (
      encoder.fit_transform(pick_input(train)),
      encoder.transform(pick_input(dev)),
      encoder.transform(pick_input(test)),
  )

def _text_series(frame):
  """The text column as a Series (input form used for the text vectorizers)."""
  return frame[sarcasm_tweet_column]

def _text_frame(frame):
  """The text column as a one-column DataFrame (input form used for the one-hot encoder)."""
  return frame[[sarcasm_tweet_column]]

## Sentiment Analysis
  # Count Vectorizer
print("[Count Vectorizer] Encoding data...", end="")
CountVectorizer = featExtract.CountVectorizer()
CV_train_vectors, CV_dev_vectors, CV_test_vectors = _encode_splits(CountVectorizer, _text_series)
print("done")

# One Hot Encoding (categories unseen during fitting are ignored rather than raising)
print("[One Hot Encoder] Encoding data...", end="")
OneHotEncoder = prep.OneHotEncoder(handle_unknown='ignore')
OHE_train_vectors, OHE_dev_vectors, OHE_test_vectors = _encode_splits(OneHotEncoder, _text_frame)
print("done")

# Word Vectorization/Extraction
   # TfidfVectorizer
print("[Tfidf Vectorizer] Encoding data...", end="")
tfidf_vectorizer = featExtract.TfidfVectorizer()
Tfidf_train_vectors, Tfidf_dev_vectors, Tfidf_test_vectors = _encode_splits(tfidf_vectorizer, _text_series)
print("done")

   # HashingVectorizer (2**4 = 16 output features)
print("[Hashing Vectorizer] Encoding data...", end="")
hashing_vectorizer = featExtract.HashingVectorizer(n_features=2**4)
hashing_train_vectors, hashing_dev_vectors, hashing_test_vectors = _encode_splits(hashing_vectorizer, _text_series)
print("done")

---
# Individual Featurizer Analysis

### Model Predictor Function

A reusable function to predict on each model, gather statistics, and make matplotlib graphs.
<br>This function is SUPER helpful for systematically tuning the hyper-parameters of each model, as different models, prediction vectors, and statistical outputs can be gathered and specified by the parameters passed in. This made the experimentation process for observing and gathering data from the four featurizers and three classifiers simple and consistent.

<br>Statistics for each model written to text file named `finalStats.txt`
<br>Images of the plots from each model saved to local directory

In [None]:
def model_Predicting(classifier, predict_vectors, correct_labels, model_name="ML Model", model_labels=None, matplot_filename="output.png", show_stats=True, make_plot=False, save_plot=False):
  """Predict with a fitted classifier, compute metrics, and optionally plot a ROC curve.

  Parameters:
    classifier: fitted estimator with a `predict` method; `decision_function` or
      `predict_proba` is used for the ROC AUC when available.
    predict_vectors: feature matrix to predict on.
    correct_labels: true binary labels for `predict_vectors` (Series, array or list).
    model_name: name used in the plot title.
    model_labels: legend entries for the curves drawn so far (defaults to none).
    matplot_filename: file the current figure is saved to when `save_plot` is true.
    show_stats: print the predictions and every metric.
    make_plot: draw this model's ROC curve onto the current matplotlib figure, so
      consecutive calls overlay their curves.
    save_plot: save the current figure to `matplot_filename`.

  Returns:
    dict with keys "accuracy", "precision", "recall", "f1" (macro-averaged where
    applicable), "roc" (ROC AUC of the hard 0/1 predictions) and "roc_auc"
    (ROC AUC of the continuous scores).
  """
  y_true = np.asarray(correct_labels)
  pred = classifier.predict(predict_vectors)

  # ROC analysis ranks samples by a continuous score; hard labels only give a single
  # operating point, so use the classifier's scores whenever it can provide them
  scores = _positive_class_scores(classifier, predict_vectors, pred)

  acc_score = metrics.accuracy_score(y_true, pred)
  precision_score = metrics.precision_score(y_true, pred, average='macro')
  recall_score = metrics.recall_score(y_true, pred, average='macro')
  f1_score = metrics.f1_score(y_true, pred, average='macro')
  roc_score = metrics.roc_auc_score(y_true, pred)
  roc_auc = metrics.roc_auc_score(y_true, scores)

  # If the boolean flag for showing outputs is true, display the metric data
  if (show_stats):
    print("Prediction:", pred)    
    print('Total accuracy classification score: {}'.format(acc_score))
    print('Total precision score: {}'.format(precision_score))
    print('Total recall score: {}'.format(recall_score))
    print('Total F1 classification score: {}'.format(f1_score))
    print('Total ROC classification score: {}'.format(roc_score))
    print('Total AUC of ROC classification: {}'.format(roc_auc))

  if (make_plot):
    # Get data for ROC Curve
    fpr, tpr, thresholds = metrics.roc_curve(y_true, scores)

    # Add this model's ROC curve to the current figure
    plt.plot(fpr, tpr)

    # Add title, legend and axis labels to plot
    title = model_name + " ROC Curve"
    plt.title(title)
    plt.legend(list(model_labels) if model_labels is not None else [])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

  # Save the current figure under the requested file name
  if (save_plot):
    plt.savefig(matplot_filename)
  
  return {"accuracy": acc_score, "precision": precision_score, "recall": recall_score, "f1": f1_score, "roc": roc_score, "roc_auc": roc_auc}


def _positive_class_scores(classifier, predict_vectors, hard_predictions):
  """Return one continuous positive-class score per sample for ROC analysis.

  Uses `decision_function` when the classifier has one, otherwise the second column
  of `predict_proba`. Falls back to `hard_predictions` when neither is available or
  when the scores are not a finite 1-D array.
  """
  if hasattr(classifier, "decision_function"):
    scores = classifier.decision_function(predict_vectors)
  elif hasattr(classifier, "predict_proba"):
    # Column 1 is taken as the positive class (binary labels assumed)
    scores = classifier.predict_proba(predict_vectors)[:, 1]
  else:
    return hard_predictions

  scores = np.asarray(scores, dtype=float)
  if scores.ndim != 1 or not np.all(np.isfinite(scores)):
    return hard_predictions
  return scores

In [None]:
# Sarcasm labels of the train, dev, and test splits, taken from the label column
train_is_sarcasm, dev_is_sarcasm, test_is_sarcasm = (
    split[is_sarcasm_column] for split in (train, dev, test)
)

# Featurizer names and their train/dev/test encodings. All four lists share the same
# order, so one index selects a featurizer's name and all of its vectors; this is
# what the systematic hyper-parameter tuning loops rely on.
all_vectorizer_names = ["CountVectorizer", "OneHotEncoder", "TfidfVectorizer", "HashingVectorizer"]
all_vectorizer_train_vectors = [
    CV_train_vectors, OHE_train_vectors, Tfidf_train_vectors, hashing_train_vectors,
]
all_vectorizer_dev_vectors = [
    CV_dev_vectors, OHE_dev_vectors, Tfidf_dev_vectors, hashing_dev_vectors,
]
all_vectorizer_test_vectors = [
    CV_test_vectors, OHE_test_vectors, Tfidf_test_vectors, hashing_test_vectors,
]

# Per-model result containers, filled in with one entry per featurizer
featurization_multiNB_final_results = dict()
featurization_SGD_final_results = dict()
featurization_LogReg_final_results = dict()

## Training, developing, and testing all Vectorization Methods

A list of hyperparameters was chosen for tuning, each with
a set of values to be tested. A loop goes through all of these
combinations and stores the result of each parameter combination in
order to find the best-in-class combination to use for testing each model.

In order to determine best-in-class hyperparameter combination, each
iteration creates an ROC AUC score that is stored in order to directly
compare the results across all iterations.

Each model is trained on the training set, its hyperparameters are tuned on the dev set, and the best combination found for each featurizer is then evaluated on the test set.

### Multinomial Naive Bayes Model Testing

In [None]:
### MULTINOMIAL NAIVE BAYES


# Hyper parameters for tuning
alpha_values = np.linspace(0.00000001, 100, 250)
fit_prior_list = [True, False]

# One fewer featurizer is used for MultiNB since HashingVectorizer (last in the lists) did not work with it
NUM_MODELS_USED = len(all_vectorizer_train_vectors)-1

# Tune and test MultiNB on every featurizer EXCEPT FOR THE LAST ONE (HashingVectorizer)
for i in range(NUM_MODELS_USED):

  print("\n\n----------------------------------------- VECTOR MODEL:", all_vectorizer_names[i], "-----------------------------------------\n")
  print("Iteration Count:", end=" ")

  # Dev-set ROC AUC of every (alpha, fit_prior) combination for this featurizer, in
  # loop order: list index = alpha position * len(fit_prior_list) + fit_prior position.
  # Reset for each featurizer because the best combination is found per featurizer.
  auc_roc_nb_resulting_metric = []
  count = 0

  # Iterate over all combinations of values between the 2 parameters' lists 
  for curr_alpha in alpha_values:
    for curr_bool_fit in fit_prior_list:
      
      # Show every 50th iteration (nicely formatted)
      if (count % 50 == 0):
        print(count, end=" ")
        
      # Naive Bayes training
      multiNB_clf = MultinomialNB(alpha=curr_alpha, fit_prior=curr_bool_fit)
      multiNB_clf.fit(all_vectorizer_train_vectors[i], train_is_sarcasm)

      # Multi NB prediction on this featurizer's dev vectors
      metrics_dict = model_Predicting(classifier=multiNB_clf, predict_vectors=all_vectorizer_dev_vectors[i], correct_labels=dev_is_sarcasm, show_stats=False)

      auc_roc_nb_resulting_metric.append(metrics_dict["roc_auc"])
      count += 1
    

  # Index of the first best score. np.argmax works for plain Python floats and NumPy
  # scalars alike, whereas comparing the list itself against its max is only
  # element-wise when the elements are NumPy scalars.
  max_roc_index = int(np.argmax(auc_roc_nb_resulting_metric))
  best_dev_roc_auc = auc_roc_nb_resulting_metric[max_roc_index]

  # Undo the loop-order flattening to recover the two hyperparameter positions
  alpha_index_loc, fit_prior_index_loc = divmod(max_roc_index, len(fit_prior_list))

  # Extract alpha and fit_prior best combination values
  alpha_combo = alpha_values[alpha_index_loc]
  fit_prior_combo = fit_prior_list[fit_prior_index_loc]
  print("\nMax index:", max_roc_index, "with a value", best_dev_roc_auc)
  print("Alpha combination:", alpha_combo)
  print("Fit Prior combination:", fit_prior_combo)

  # Using best hyperparameter combination, refit the model for the test-set run
  best_hyper_param_combo = MultinomialNB(alpha=alpha_combo, fit_prior=fit_prior_combo)
  best_hyper_param_combo.fit(all_vectorizer_train_vectors[i], train_is_sarcasm)

  # Testing best combination hyperparameters on NB model with test data
  print("\nTESTING MULTINB MODEL")
  model_name = "MultiNB"

  # Every featurizer adds its curve to the plot; the figure is only saved once the
  # last featurizer's curve has been drawn
  save_plot_options = {}
  if (i == (NUM_MODELS_USED-1)):
    plot_filename = model_name + "_roc_curve.jpg"
    save_plot_options = {"save_plot": True, "matplot_filename": plot_filename}

  metrics_dict = model_Predicting(classifier=best_hyper_param_combo, predict_vectors=all_vectorizer_test_vectors[i], correct_labels=test_is_sarcasm, model_name=model_name, model_labels=all_vectorizer_names[:NUM_MODELS_USED], make_plot=True, **save_plot_options)

  # Record this featurizer's test-set ROC AUC in the MultiNB results dictionary
  featurization_multiNB_final_results[all_vectorizer_names[i]] = metrics_dict["roc_auc"]

In [None]:
### Picks the featurizer with the highest test-set ROC AUC for the MultiNB
# model; the result is written to the final statistics text file afterwards

ROUND_TO = 5

# First (featurizer, score) pair holding the highest score
max_key, best_multiNB_score = max(
    featurization_multiNB_final_results.items(), key=lambda entry: entry[1]
)
max_value = round(best_multiNB_score, ROUND_TO)

print(featurization_multiNB_final_results)
print("\n",max_key, "was the best model with an ROC AUC of:", max_value)

In [None]:
# Build the two comma-separated summary lines for this model:
#   Total,<model>,<dict of ROC AUC per featurizer>
#   Best,<model>,<best featurizer>,<its rounded ROC AUC>
fileTotalStatString = ",".join(["Total", model_name, str(featurization_multiNB_final_results)]) + "\n"
fileBestModelString = ",".join(["Best", model_name, str(max_key), str(max_value)]) + "\n"

# Write the section through the handle opened when the stats file was created,
# then close that handle
final_stats_file.writelines([
    model_name + "\n",
    fileTotalStatString,
    fileBestModelString,
    fileSectionBreakString,
])
final_stats_file.close()

### Stochastic Gradient Descent Model Testing

In [None]:
# SGD Model Training
### WARNING: With the number of combinations of
  # hyper-parameters, this code could take awhile to run

# Local imports: only needed to pick the right name for the logistic loss
import re
import sklearn

# scikit-learn 1.1 renamed SGDClassifier's 'log' loss to 'log_loss' and 1.3 removed
# the old spelling, so use whichever name the installed version accepts
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
_sk_has_log_loss_name = _sk_version is None or (int(_sk_version.group(1)), int(_sk_version.group(2))) >= (1, 1)
LOG_LOSS_NAME = 'log_loss' if _sk_has_log_loss_name else 'log'

loss_values = ['hinge', LOG_LOSS_NAME, 'modified_huber', 'perceptron', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
alpha_values = np.linspace(0.0001, 0.8, 10)
learning_rate_values = ['constant', 'optimal', 'invscaling']
MAX_ITER = 6000
ETA0 = 1
NUM_MODELS_USED = len(all_vectorizer_train_vectors)

# Tune and test SGD on every featurizer (HashingVectorizer included)
for i in range(NUM_MODELS_USED):

  print("\n\n----------------------------------------- VECTOR MODEL:", all_vectorizer_names[i], "-----------------------------------------\n")
  print("Iteration Count:", end=" ")

  # Dev-set ROC AUC per hyperparameter combination for this featurizer, keyed by
  # "loss,alpha,learning_rate". Reset for each featurizer because the best
  # combination is found per featurizer.
  auc_roc_SGD_resulting_metric = {}
  count = 0

  # Iterate over all combinations of values between the 3 parameters' lists 
  for curr_loss in loss_values:    
      for curr_alpha in alpha_values:
        for curr_learning_rate in learning_rate_values:

          # Progress output: every 28th count ends the line, otherwise every
          # 3rd count is printed on the current line
          if (count % 28 == 0 and count != 0):
            print(count)
          elif (count % 3 == 0):
            print(count,end=" ")
          
          # SGD training
          SGD_clf = SGDClassifier(random_state=0, loss=curr_loss, max_iter=MAX_ITER, alpha=curr_alpha, learning_rate=curr_learning_rate, eta0=ETA0)
          SGD_clf.fit(all_vectorizer_train_vectors[i], train_is_sarcasm)

          # SGD prediction on this featurizer's dev vectors
          metrics_dict = model_Predicting(classifier=SGD_clf, predict_vectors=all_vectorizer_dev_vectors[i], correct_labels=dev_is_sarcasm, show_stats=False)

          # Store the ROC AUC under a key that encodes the three hyperparameters
          hyperParamIterCombo = str(curr_loss) + "," + str(curr_alpha) + "," + str(curr_learning_rate)
          auc_roc_SGD_resulting_metric[hyperParamIterCombo] = metrics_dict["roc_auc"]
          count += 1


  # Finding max combination (first key holding the highest dev-set ROC AUC)
  max_key = max(auc_roc_SGD_resulting_metric, key=auc_roc_SGD_resulting_metric.get)
  max_value = max(auc_roc_SGD_resulting_metric.values())

  # Decode the key back into the three hyperparameter values
  best_values = max_key.split(",")

  best_loss = best_values[0]
  best_alpha = float(best_values[1])
  best_learning_rate = best_values[2]

  print("\nMax Settings:", max_key, "being a value of", max_value)
  print("Best values:", best_values)

  # Using best hyperparameter combination, refit the model for the test-set run
  best_hyper_param_combo = SGDClassifier(random_state=0, loss=best_loss, max_iter=MAX_ITER, alpha=best_alpha, learning_rate=best_learning_rate, eta0=ETA0)
  best_hyper_param_combo.fit(all_vectorizer_train_vectors[i], train_is_sarcasm)

  # Testing best combination hyperparameters on SGD model with test data
  print("\nTESTING SGD MODEL")
  model_name = "SGD"

  # Every featurizer adds its curve to the plot; the figure is only saved once the
  # last featurizer's curve has been drawn
  save_plot_options = {}
  if (i == (NUM_MODELS_USED-1)):
    plot_filename = model_name + "_roc_curve.jpg"
    save_plot_options = {"save_plot": True, "matplot_filename": plot_filename}

  metrics_dict = model_Predicting(classifier=best_hyper_param_combo, predict_vectors=all_vectorizer_test_vectors[i], correct_labels=test_is_sarcasm, model_name=model_name, model_labels=all_vectorizer_names[:NUM_MODELS_USED], make_plot=True, **save_plot_options)
  
  # Record this featurizer's test-set ROC AUC in the SGD results dictionary
  featurization_SGD_final_results[all_vectorizer_names[i]] = metrics_dict["roc_auc"]

In [None]:
### Picks the featurizer with the highest test-set ROC AUC for the SGD
# model; the result is written to the final statistics text file afterwards

ROUND_TO = 5

# First (featurizer, score) pair holding the highest score
max_key, best_SGD_score = max(
    featurization_SGD_final_results.items(), key=lambda entry: entry[1]
)
max_value = round(best_SGD_score, ROUND_TO)

print(featurization_SGD_final_results)
print("\n",max_key, "was the best model with an ROC AUC of:", max_value)


In [None]:
# Set up the two comma-separated summary lines for this model:
#   Total,<model>,<dict of ROC AUC per featurizer>
#   Best,<model>,<best featurizer>,<its rounded ROC AUC>
fileTotalStatString = "Total," + model_name + "," + str(featurization_SGD_final_results) +"\n"
fileBestModelString = "Best," + model_name + "," + str(max_key) + "," + str(max_value) +"\n"

# Append this model's section to the stats file. The with-block closes the file
# even if one of the writes raises.
with open("finalStats.txt", 'a') as final_stats_file:
  final_stats_file.write(model_name + "\n")
  final_stats_file.write(fileTotalStatString)
  final_stats_file.write(fileBestModelString)
  final_stats_file.write(fileSectionBreakString)

### Logistic Regression Model Testing

In [None]:
### Logistic Regression
  # DUE TO INSUFFICIENT COMPUTING POWER, NOT AS MANY HYPERPARAMS WERE TESTED AS DESIRED
C_values = np.linspace(0.00000001, 10, 3)
MAX_ITERATIONS = 500
NUM_MODELS_USED = len(all_vectorizer_train_vectors)

# Tune and test Logistic Regression on every featurizer (HashingVectorizer included)
for i in range(NUM_MODELS_USED):

  print("\n\n----------------------------------------- VECTOR MODEL:", all_vectorizer_names[i], "-----------------------------------------\n")
  print("Iteration Count:", end=" ")

  # Dev-set ROC AUC for each value of C, in the same order as C_values. Reset for
  # each featurizer because the best value is found per featurizer.
  auc_roc_logreg_resulting_metric = []
  count = 0

  # Iterate over all values of C (inverse regularization strength)
  for curr_C in C_values:    
    # Show every iteration
    print(count, end=" ")
      
    # Logistic Regression training
    LogReg_clf = LogisticRegression(random_state=0, solver='lbfgs', C=curr_C, max_iter=MAX_ITERATIONS)
    LogReg_clf.fit(all_vectorizer_train_vectors[i], train_is_sarcasm)

    # Logistic Regression prediction on this featurizer's dev vectors
    metrics_dict = model_Predicting(classifier=LogReg_clf, predict_vectors=all_vectorizer_dev_vectors[i], correct_labels=dev_is_sarcasm, show_stats=False)

    auc_roc_logreg_resulting_metric.append(metrics_dict["roc_auc"])
    count += 1


  # Index of the first best score. np.argmax works for plain Python floats and NumPy
  # scalars alike, whereas comparing the list itself against its max is only
  # element-wise when the elements are NumPy scalars.
  print("\nROC AUC scores:", auc_roc_logreg_resulting_metric)
  max_roc_index = int(np.argmax(auc_roc_logreg_resulting_metric))

  # The scores were appended in C_values order, so the index of the best score is
  # directly the index of the best C (there is only one tuned hyperparameter here)
  C_index_loc = max_roc_index

  # Extract C combination value
  C_combo = C_values[C_index_loc]
  print("\nMax index:", max_roc_index, "being a value of", auc_roc_logreg_resulting_metric[max_roc_index])
  print("C combination:", C_combo)

  # Using best hyperparameter value, refit the model for the test-set run
  best_hyper_param_combo = LogisticRegression(random_state=0, solver='lbfgs', C=C_combo, max_iter=MAX_ITERATIONS)
  best_hyper_param_combo.fit(all_vectorizer_train_vectors[i], train_is_sarcasm)

  # Testing best hyperparameter value on Logistic Regression model with test data
  print("\nTESTING LOGISTIC REGRESSION MODEL")
  model_name = "LogReg"

  # Every featurizer adds its curve to the plot; the figure is only saved once the
  # last featurizer's curve has been drawn
  save_plot_options = {}
  if (i == (NUM_MODELS_USED-1)):
    plot_filename = model_name + "_roc_curve.jpg"
    save_plot_options = {"save_plot": True, "matplot_filename": plot_filename}

  metrics_dict = model_Predicting(classifier=best_hyper_param_combo, predict_vectors=all_vectorizer_test_vectors[i], correct_labels=test_is_sarcasm, model_name=model_name, model_labels=all_vectorizer_names[:NUM_MODELS_USED], make_plot=True, **save_plot_options)

  # Record this featurizer's test-set ROC AUC in the Logistic Regression results dictionary
  featurization_LogReg_final_results[all_vectorizer_names[i]] = metrics_dict["roc_auc"]

In [None]:
### Picks the featurizer with the highest test-set ROC AUC for the Logistic Regression
# model; the result is written to the final statistics text file afterwards

ROUND_TO = 5

# First (featurizer, score) pair holding the highest score
max_key, best_LogReg_score = max(
    featurization_LogReg_final_results.items(), key=lambda entry: entry[1]
)
max_value = round(best_LogReg_score, ROUND_TO)

print(featurization_LogReg_final_results)
print("\n",max_key, "was the best model with an ROC AUC of:", max_value)

In [None]:
# Set up the two comma-separated summary lines for this model:
#   Total,<model>,<dict of ROC AUC per featurizer>
#   Best,<model>,<best featurizer>,<its rounded ROC AUC>
fileTotalStatString = "Total," + model_name + "," + str(featurization_LogReg_final_results) +"\n"
fileBestModelString = "Best," + model_name + "," + str(max_key) + "," + str(max_value) +"\n"

# Append this model's section to the stats file. The with-block closes the file
# even if one of the writes raises.
with open("finalStats.txt", 'a') as final_stats_file:
  final_stats_file.write(model_name + "\n")
  final_stats_file.write(fileTotalStatString)
  final_stats_file.write(fileBestModelString)
  final_stats_file.write(fileSectionBreakString)

Statistics for each model written to text file named `finalStats.txt`
<br> Images of the plots from each model saved to local directory