# A. Setup Environment

In [0]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from itertools import cycle
from numpy import interp
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Mount Google Drive at /content/drive; the prediction files used below are
# read from and written to paths under this mount point (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Authenticate this Colab session with Google Cloud and select the project
# that the gcloud/gsutil shell commands should operate on.
from google.colab import auth
auth.authenticate_user()
project_id = 'w266-gpu-setup'
!gcloud config set project {project_id}
# List the Cloud Storage buckets visible to the authenticated account.
!gsutil ls

Updated property [core/project].
gs://w266-nlp-bucket/


In [0]:
# Cloud Storage bucket referenced by the gsutil copy commands below.
# All of the copy commands are currently disabled (commented out).
bucket_name = 'w266-nlp-bucket'
# Disabled: upload the Drive predictions folder to the bucket.
#!gsutil -m cp -r /content/drive/My\ Drive/Colab\ Notebooks/W266\ Final\ Project/predictions/* gs://{bucket_name}/predictions/
# Disabled: download prediction files from the bucket into the Drive predictions folder.
#!gsutil -m cp -r gs://{bucket_name}/predictions/*.* /content/drive/My\ Drive/Colab\ Notebooks/W266\ Final\ Project/predictions/
#!gsutil -m cp -r gs://{bucket_name}/predictions/*.* /content/drive/My\ Drive/Colab\ Notebooks/W266\ Final\ Project/predictions/
# Disabled: download only the files whose names end in pp_cnn.json.
#!gsutil -m cp -r gs://{bucket_name}/predictions/*pp_cnn.json /content/drive/My\ Drive/Colab\ Notebooks/W266\ Final\ Project/predictions/

# B. Setup Functions

In [0]:
def plot_cm_count(y_true, y_pred, title_l1, title_l2, filepath, figsize=(10,8)):
    """Save a confusion-matrix heatmap whose cell colours encode raw counts.

    Every cell is annotated with its row-normalised percentage and its count;
    diagonal cells additionally show the row total as ``count/total``.

    Args:
        y_true: 1-D array-like of true class labels.
        y_pred: 1-D array-like of predicted class labels.
        title_l1: Second line of the figure title.
        title_l2: Third line of the figure title.
        filepath: Output path prefix; the image is written to
            ``filepath + '_cm_count.png'``.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is saved to disk and then closed.

    Note:
        The set of classes is taken from ``y_true`` only, so a class that is
        predicted but never occurs in ``y_true`` gets no row or column.
    """
    class_labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    # 1-D row totals so that row_totals[i] is a scalar when formatted with %d.
    row_totals = cm.sum(axis=1)
    cm_perc = cm / row_totals[:, np.newaxis].astype(float) * 100

    # Build the annotations as a nested list first so NumPy sizes the string
    # dtype to the longest label instead of truncating to a preset width.
    nrows, ncols = cm.shape
    annot = np.array([
        [
            '%.1f%%\n%d/%d' % (cm_perc[i, j], cm[i, j], row_totals[i]) if i == j
            else '%.1f%%\n%d' % (cm_perc[i, j], cm[i, j])
            for j in range(ncols)
        ]
        for i in range(nrows)
    ])

    cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
    fig, ax = plt.subplots(figsize=figsize)
    try:
        colormap = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
        sns.heatmap(cm_df,
                    cmap=colormap,
                    annot=annot,
                    annot_kws={"size": 12},
                    fmt='',
                    linewidths=1,
                    square=True,
                    ax=ax)
        ax.set_title('Confusion Matrix\n' + title_l1 + "\n" + title_l2 + "\n", fontsize=16, weight='bold')
        ax.set_xlabel('\nPredicted Label', fontsize=12)
        ax.set_ylabel('Actual Label\n', fontsize=12)
        fig.savefig(filepath + '_cm_count.png', bbox_inches='tight', dpi=600)
    finally:
        # Close this specific figure even if drawing or saving fails, so
        # figures do not pile up when the function is called in a loop.
        plt.close(fig)
    return

In [0]:
def plot_cm_pct(y_true, y_pred, title_l1, title_l2, filepath, figsize=(10,8)):
    """Save a confusion-matrix heatmap whose cell colours encode row percentages.

    Every cell is annotated with its row-normalised percentage and its count;
    diagonal cells additionally show the row total as ``count/total``.

    Args:
        y_true: 1-D array-like of true class labels.
        y_pred: 1-D array-like of predicted class labels.
        title_l1: Second line of the figure title.
        title_l2: Third line of the figure title.
        filepath: Output path prefix; the image is written to
            ``filepath + '_cm_pct.png'``.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is saved to disk and then closed.

    Note:
        The set of classes is taken from ``y_true`` only, so a class that is
        predicted but never occurs in ``y_true`` gets no row or column.
    """
    class_labels = np.unique(y_true)
    cf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
    nrows, ncols = cf_matrix.shape

    # 1-D row totals so that row_totals[i] is a scalar when formatted with %d.
    row_totals = cf_matrix.sum(axis=1)
    cf_matrix_pct = cf_matrix / row_totals[:, np.newaxis].astype(float) * 100

    # Build the annotations as a nested list first so NumPy sizes the string
    # dtype to the longest label instead of truncating to a preset width.
    labels = np.array([
        [
            '%.1f%%\n%d/%d' % (cf_matrix_pct[i, j], cf_matrix[i, j], row_totals[i]) if i == j
            else '%.1f%%\n%d' % (cf_matrix_pct[i, j], cf_matrix[i, j])
            for j in range(ncols)
        ]
        for i in range(nrows)
    ])

    # Wrap the plotted (percentage) matrix so the heatmap ticks show the class
    # labels; a bare ndarray would be labelled with positions 0..n-1 instead.
    pct_df = pd.DataFrame(cf_matrix_pct, index=class_labels, columns=class_labels)

    fig, ax = plt.subplots(figsize=figsize)
    try:
        colormap = 'Blues' #sns.diverging_palette(220, 20, sep=20, as_cmap=True)
        sns.heatmap(pct_df,
                    cmap=colormap,
                    annot=labels,
                    annot_kws={"size": 12},
                    fmt='',
                    linewidths=1,
                    square=True,
                    ax=ax)
        ax.set_title('Confusion Matrix\n' + title_l1 + "\n" + title_l2 + "\n", fontsize=16, weight='bold')
        ax.set_xlabel('\nPredicted Label', fontsize=12)
        ax.set_ylabel('Actual Label\n', fontsize=12)
        fig.savefig(filepath + '_cm_pct.png', bbox_inches='tight', dpi=600)
    finally:
        # Close this specific figure even if drawing or saving fails, so
        # figures do not pile up when the function is called in a loop.
        plt.close(fig)
    return

In [0]:
# Display names for the class columns, in column order.
priorities = ["Low", "Normal", "High", "Critical"]

def plt_roc_auc(y_true, y_pred, title_l1, title_l2, filepath, figsize=(8,8)):
  """Save a figure with per-class, micro-average and macro-average ROC curves.

  Args:
    y_true: 2-D array-like, one row per sample and one column per class,
      holding the binary indicator of the true class.
    y_pred: 2-D array-like with the same layout holding the class scores.
    title_l1: Second line of the figure title.
    title_l2: Third line of the figure title.
    filepath: Output path prefix; the image is written to
      ``filepath + '_roc.png'``.
    figsize: Figure size forwarded to ``plt.subplots``.

  Returns:
    None. The figure is saved to disk and then closed.
  """
  # Accept nested lists as well as arrays; column slicing below needs ndarrays.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  n_classes = y_true.shape[1]

  # Compute ROC curve and ROC area for each class
  fpr = dict()
  tpr = dict()
  roc_auc = dict()

  for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

  # Micro-average: pool every (sample, class) decision into one binary problem.
  fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel())
  roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

  # Macro-average: interpolate every class curve onto the union of all
  # false-positive-rate points, then take the unweighted mean of the TPRs.
  all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
  mean_tpr = np.zeros_like(all_fpr)
  for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
  mean_tpr /= n_classes

  fpr["macro"] = all_fpr
  tpr["macro"] = mean_tpr
  roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

  # Plot all ROC curves
  fig, ax = plt.subplots(figsize=figsize)
  try:
    ax.plot(fpr["macro"], tpr["macro"],
            label='Macro-Average ROC Curve (area = {0:0.2f}%)'.format(roc_auc["macro"]*100),
            color='#000067', linestyle='--', linewidth=2)

    ax.plot(fpr["micro"], tpr["micro"],
            label='Micro-Average ROC Curve (area = {0:0.2f}%)'.format(roc_auc["micro"]*100),
            color='#000067', linestyle=':', linewidth=2)

    colors = cycle(['green', '#a6a6a6', 'orange', 'red'])
    for i, color in zip(range(n_classes), colors):
      # Fall back to the column number when there are more classes than names.
      class_name = str(priorities[i]) if i < len(priorities) else str(i)
      ax.plot(fpr[i], tpr[i], color=color, lw=2,
              label='ROC Curve of {} Priority (area = {:0.2f}%)'.format(class_name, roc_auc[i]*100))

    # Chance diagonal. The colour is given once (keyword only) instead of in
    # both a 'k--' format string and a keyword.
    ax.plot([0, 1], [0, 1], linestyle='--', lw=1, color='#cccccc')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('\nFalse Positive Rate', size=12)
    ax.set_ylabel('True Positive Rate\n', size=12)
    ax.set_title('Receiver Operating Characteristic for\n'+ title_l1 + "\n" + title_l2 + "\n", size=16, weight='bold')
    ax.legend(loc="lower right")
    fig.savefig(filepath + '_roc.png', bbox_inches='tight', dpi=600)
  finally:
    # Close this specific figure even if drawing or saving fails.
    plt.close(fig)
  return

In [0]:
def plt_barchart(df, title, filepath=None):
  """Save a horizontal bar chart of each ``PriorityID`` value's share of rows.

  Args:
    df: DataFrame with a ``PriorityID`` column.
    title: First line of the figure title.
    filepath: Output path prefix; the image is written to
      ``filepath + '_bar.png'``. When omitted, a notebook-level variable
      named ``filepath`` is used if one exists (the name the original body
      read implicitly).

  Returns:
    None. The figure is saved to disk and then closed.

  Raises:
    ValueError: If no output path prefix is available.
  """
  if filepath is None:
    # Backward compatibility with callers that relied on a global `filepath`.
    filepath = globals().get('filepath')
  if filepath is None:
    raise ValueError("plt_barchart needs an output path prefix: pass filepath=...")

  sns.set_style("white")

  # One row per PriorityID with its row count and percentage of the total.
  counts = df.groupby('PriorityID').size().reset_index(name='count')
  counts['pct'] = counts['count'] * 100 / counts['count'].sum()

  palette = ['red','orange', 'green', '#a6a6a6']

  fig, ax = plt.subplots(figsize=(8,4))
  try:
    # x/y are passed by keyword: current seaborn no longer accepts them
    # positionally. Drawing on `ax` keeps the figure handle intact.
    sns.barplot(x=counts['pct'], y=counts['PriorityID'], estimator=sum, ci=None,
                orient='h', palette=palette, ax=ax)

    # Write the percentage just to the right of the end of each bar.
    for i, v in enumerate(counts['pct']):
      ax.text(v+1, i+.05, str(round(v,3))+'%', color='black', fontweight='bold')

    ax.set(xlim=(0,100))
    ax.set_title(title + '\nTicket Priority as Percentage of Total', size=16, weight='bold')
    ax.set_ylabel('Ticket Priority')
    ax.set_xlabel('% Total')
    fig.savefig(filepath + '_bar.png', bbox_inches='tight', dpi=600)
  finally:
    # Close this specific figure even if drawing or saving fails.
    plt.close(fig)
  return

# C. Import Files

In [0]:
# List the current contents of the predictions folder on Drive.
!ls -all '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/'

total 2103632
-rw------- 1 root root    10494 Apr 14 17:05  all_summary_results.txt
-rw------- 1 root root 51084017 Apr 14 17:01  bert_b1_20200410-1646.json
-rw------- 1 root root 49077997 Apr 14 17:01  bert_b1_downsampled_20200409-2149.json
-rw------- 1 root root 97164486 Apr 14 17:01  bert_b1_smote_20200412-1126.json
-rw------- 1 root root 35710301 Apr 14 17:01  bert_b2_20200410-1211.json
-rw------- 1 root root 34588965 Apr 14 17:01  bert_b2_downsampled_20200410-2208.json
-rw------- 1 root root 69958879 Apr 14 17:01  bert_b2_smote_20200411-2154.json
-rw------- 1 root root 49666066 Apr 14 17:01  ffcnn_b1_20200413-0526.json
-rw------- 1 root root 48493383 Apr 14 17:01  ffcnn_b1_downsampled_20200412-2019.json
-rw------- 1 root root 96834894 Apr 14 17:01  ffcnn_b1_smote_20200413-1859.json
-rw------- 1 root root 35330622 Apr 14 17:01  ffcnn_b2_20200414-0008.json
-rw------- 1 root root 35497348 Apr 14 17:01  ffcnn_b2_downsampled_20200413-1856.json
-rw------- 1 root root 69300196 Apr 14 17:

In [0]:
#output_predictions_20200410-121116210706.json # baseline2_preprocessed results 20200410-1211
#output_predictions_20200410-164635611853.json # baseline1_preprocessed_unbalanced predictions 20200410-1646
#output_predictions_20200410-220812317449.json # baseline2 downsampled results 20200410 2208
#output_predictions_20200411-021939159751.json # Baseline 2 downsampled predictions with alternate labeling classification scheme 20200411-0219 *** DO NOT USE
#output_predictions_20200411-215411759773.json # Results Baseline 2 SMOTE predictions 20200411-2154 **** This is Baseline 2 SMOTE ****
#output_predictions_20200411-234454049210.json # Results for frozen BERT + FF NN Baseline 1 down sampled 20200411-2344
#output_predictions_20200412-090532626216.json # Results: Frozen BERT + FF Dense; Baseline1 Preprocessed 20200412-0905
#output_predictions_20200412-112654021246.json # Simple BERT Classifier, Baseline 2 SMOTE - 20200412-1126 ********THIS IS Baseline 1 SMOTE
#output_predictions_20200412-201935843909_baseline1_down_cnn.json #Results - Frozen BERT -> CNN -> Dense - Baseline1 Downsampled - 20200412-2019
#output_predictions_20200413-052644970486_baseline1_pp_cnn.json
#output_predictions_20200412-201935843909_baseline1_down_cnn.json
#output_predictions_20200413-185914072976_baseline1_smote_cnn.json
#output_predictions_20200413-185620313960_baseline2_downsampled_cnn.json
#output_predictions_20200413-044822693884_baseline1_smote_ff.json
#output_predictions_20200413-090707671570_baseline2_pp_ff.json
#output_predictions_20200413-165636760317_baseline2_downsampled_ff.json
#output_predictions_20200413-185620313960_baseline2_downsampled_cnn.json
#output_predictions_20200413-185914072976_baseline1_smote_cnn.json
#output_predictions_20200414-000843020206_baseline2_pp_cnn.json
#output_predictions_20200414-050347010869_baseline2_smote_ff.json
#output_predictions_20200414-143632209639_baseline2_smote_cnn.json

## Baseline 2 downsampled predictions with alternate labeling classification scheme 20200411-0219
#!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200411-021939159751.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/pred_bert_baseline2_bal_downsampled_alt_label_20200411-0219.json'

## Data for Model 1 (Fine Tuning BERT)

In [0]:
# Each `!cp` below copies a raw `output_predictions_<timestamp>.json` file to a
# descriptive name of the form `bert_<baseline>[_<sampling>]_<date-time>.json`
# in the same predictions folder; the source files are left in place.
################################################################################
# MODEL 1: Finetuning BERT
# ------------------------------------------------------------------------------
# Baseline 1: Unbalanced, Balanced Downsampled and Balanced SMOTE Datasets
# ------------------------------------------------------------------------------
# Simple BERT Classifier on Baseline 1 Unbalanced Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200410-164635611853.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/bert_b1_20200410-1646.json'
# Simple BERT Classifier on Baseline 1 Downsampled Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200409-214949693032.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/bert_b1_downsampled_20200409-2149.json'
# Simple BERT Classifier on Baseline 1 SMOTE Dataset (was labeled as baseline 2 smote in github but is actually baseline 1)
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200412-112654021246.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/bert_b1_smote_20200412-1126.json'

# ------------------------------------------------------------------------------
# Baseline 2: Unbalanced, Balanced Downsampled and Balanced SMOTE Datasets
# ------------------------------------------------------------------------------
# Simple BERT Classifier on Baseline 2 Unbalanced Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200410-121116210706.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/bert_b2_20200410-1211.json'
# Simple BERT Classifier on Baseline 2 Downsampled Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200410-220812317449.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/bert_b2_downsampled_20200410-2208.json'
# Simple BERT Classifier on Baseline 2 SMOTE Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200411-215411759773.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/bert_b2_smote_20200411-2154.json'

## Data for Model 2 (Feed Forward Dense Neural Network Using BERT Frozen Weights)

In [0]:
# Each `!cp` below copies a raw `output_predictions_<timestamp>...json` file to a
# descriptive name of the form `ffnn_<baseline>[_<sampling>]_<date-time>.json`
# in the same predictions folder; the source files are left in place.
################################################################################
# MODEL 2: BERT Frozen Weights with Feed Forward Dense Neural Network
# ------------------------------------------------------------------------------
# Baseline 1: Unbalanced, Balanced Downsampled and Balanced SMOTE Datasets
# ------------------------------------------------------------------------------
# BERT Frozen Weights and FF Dense NN on Baseline 1 Unbalanced Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200412-090532626216.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffnn_b1_20200412-0905.json'
# BERT Frozen Weights and FF Dense NN on Baseline 1 Downsampled Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200411-234454049210.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffnn_b1_downsampled_20200411-2344.json'
# BERT Frozen Weights and FF Dense NN on Baseline 1 SMOTE Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200413-044822693884_baseline1_smote_ff.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffnn_b1_smote_20200413-0448.json'

# ------------------------------------------------------------------------------
# Baseline 2: Unbalanced, Balanced Downsampled and Balanced SMOTE Datasets
# ------------------------------------------------------------------------------
# BERT Frozen Weights and FF Dense NN on Baseline 2 Unbalanced Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200413-090707671570_baseline2_pp_ff.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffnn_b2_20200413-0907.json'
# BERT Frozen Weights and FF Dense NN on Baseline 2 Downsampled Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200413-165636760317_baseline2_downsampled_ff.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffnn_b2_downsampled_20200413-1656.json'
# BERT Frozen Weights and FF Dense NN on Baseline 2 SMOTE Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200414-050347010869_baseline2_smote_ff.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffnn_b2_smote_20200414-0503.json'

## Data for Model 3 (Feed Forward CNN to Dense Neural Network Using BERT Frozen Weights)

In [0]:
# Each `!cp` below copies a raw `output_predictions_<timestamp>...json` file to a
# descriptive name of the form `ffcnn_<baseline>[_<sampling>]_<date-time>.json`
# in the same predictions folder; the source files are left in place.
################################################################################
# MODEL 3: BERT Frozen Weights with Feed Forward CNN to a Dense Neural Network
# ------------------------------------------------------------------------------
# Baseline 1: Unbalanced, Balanced Downsampled and Balanced SMOTE Datasets
# ------------------------------------------------------------------------------
# BERT Frozen Weights > CNN > FF Dense NN on Baseline 1 Unbalanced Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200413-052644970486_baseline1_pp_cnn.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffcnn_b1_20200413-0526.json'
# BERT Frozen Weights > CNN > FF Dense NN on Baseline 1 Downsampled Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200412-201935843909_baseline1_down_cnn.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffcnn_b1_downsampled_20200412-2019.json'
# BERT Frozen Weights > CNN > FF Dense NN on Baseline 1 SMOTE Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200413-185914072976_baseline1_smote_cnn.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffcnn_b1_smote_20200413-1859.json'

# ------------------------------------------------------------------------------
# Baseline 2: Unbalanced, Balanced Downsampled and Balanced SMOTE Datasets
# ------------------------------------------------------------------------------
# BERT Frozen Weights > CNN > FF Dense NN on Baseline 2 Unbalanced Dataset
# (the 20200414-0008 copy is disabled; the 20200417-1929 file is copied instead)
#!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200414-000843020206_baseline2_pp_cnn.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffcnn_b2_20200414-0008.json'
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200417-192942861363_baseline2_pp_cnn.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffcnn_b2_20200417-1929.json'
# BERT Frozen Weights > CNN > FF Dense NN on Baseline 2 Downsampled Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200413-185620313960_baseline2_downsampled_cnn.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffcnn_b2_downsampled_20200413-1856.json'
# BERT Frozen Weights > CNN > FF Dense NN on Baseline 2 SMOTE Dataset
!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/output_predictions_20200414-143632209639_baseline2_smote_cnn.json' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/ffcnn_b2_smote_20200414-1436.json'

In [0]:
# Disabled placeholder copy command (source and destination are the same folder).
#!cp '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/' '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/'
# List the predictions folder again to check the renamed copies.
!ls -all '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/'

total 2138923
-rw------- 1 root root    10494 Apr 14 17:05  all_summary_results.txt
-rw------- 1 root root 51084017 Apr 14 17:01  bert_b1_20200410-1646.json
-rw------- 1 root root 49077997 Apr 14 17:01  bert_b1_downsampled_20200409-2149.json
-rw------- 1 root root 97164486 Apr 14 17:01  bert_b1_smote_20200412-1126.json
-rw------- 1 root root 35710301 Apr 14 17:01  bert_b2_20200410-1211.json
-rw------- 1 root root 34588965 Apr 14 17:01  bert_b2_downsampled_20200410-2208.json
-rw------- 1 root root 69958879 Apr 14 17:01  bert_b2_smote_20200411-2154.json
-rw------- 1 root root 49666066 Apr 17 19:51  ffcnn_b1_20200413-0526.json
-rw------- 1 root root 48493383 Apr 17 19:51  ffcnn_b1_downsampled_20200412-2019.json
-rw------- 1 root root 96834894 Apr 17 19:51  ffcnn_b1_smote_20200413-1859.json
-rw------- 1 root root 35330622 Apr 14 17:01  ffcnn_b2_20200414-0008.json
-rw------- 1 root root 36137052 Apr 17 19:51  ffcnn_b2_20200417-1929.json
-rw------- 1 root root 35497348 Apr 17 19:51  ffcnn_b2

In [0]:
# Build the run table: one row per (model, dataset variant) prediction file,
# carrying the plot-title fragments and the image file stem for that run.
# Rows are ordered model by model; within a model the order is
# baseline 1 (preprocessed, downsampled, smote) then baseline 2 (same three).
runs_per_model = 6  # 2 baselines x 3 dataset variants

list_model_name = [name
                   for name in ("BERT Fine Tuning Model",
                                "BERT Frozen Weights with FF Dense NN Model",
                                "BERT Frozen Weights with FF CNN to a Dense NN")
                   for _ in range(runs_per_model)]

list_dataset = [1, 1, 1, 2, 2, 2]*3

list_dataset_type = ["preprocessed", "downsampled", "smote"]*6

list_title_l1 = [name
                 for name in ("BERT Finetuning Model Using",
                              "BERT Frozen Weights with FF Dense NN Using",
                              "BERT Frozen Weights with FF CNN to a Dense NN Using")
                 for _ in range(runs_per_model)]

list_title_l2 = ["Baseline 1 Dataset", "Baseline 1 Downsampled Dataset", "Baseline 1 SMOTE Dataset",
                 "Baseline 2 Dataset", "Baseline 2 Downsampled Dataset", "Baseline 2 SMOTE Dataset"]*3

list_pred_file = ['bert_b1_20200410-1646.json',
                  'bert_b1_downsampled_20200409-2149.json',
                  'bert_b1_smote_20200412-1126.json',
                  'bert_b2_20200410-1211.json',
                  'bert_b2_downsampled_20200410-2208.json',
                  'bert_b2_smote_20200411-2154.json',
                  'ffnn_b1_20200412-0905.json',
                  'ffnn_b1_downsampled_20200411-2344.json',
                  'ffnn_b1_smote_20200413-0448.json',
                  'ffnn_b2_20200413-0907.json',
                  'ffnn_b2_downsampled_20200413-1656.json',
                  'ffnn_b2_smote_20200414-0503.json',
                  'ffcnn_b1_20200413-0526.json',
                  'ffcnn_b1_downsampled_20200412-2019.json',
                  'ffcnn_b1_smote_20200413-1859.json',
                  'ffcnn_b2_20200417-1929.json', #'ffcnn_b2_20200414-0008.json',
                  'ffcnn_b2_downsampled_20200413-1856.json',
                  'ffcnn_b2_smote_20200414-1436.json']

pred_path = '/content/drive/My Drive/Colab Notebooks/W266 Final Project/predictions/'

# A dict of equal-length lists gives ordered column names (a set passed as
# `columns` is rejected by current pandas) and raises immediately if any list
# has the wrong length, instead of silently padding the column with NaN.
df = pd.DataFrame({
    "model_name": list_model_name,
    "dataset": list_dataset,
    "dataset_type": list_dataset_type,
    "title_l1": list_title_l1,
    "title_l2": list_title_l2,
    "pred_file": list_pred_file,
    # Image stem = "img_" + prediction file name without its extension.
    "img_filename": ["img_" + name.rsplit(".", 1)[0] for name in list_pred_file],
})
df.head(18)

Unnamed: 0,model_name,dataset,dataset_type,title_l1,title_l2,pred_file,img_filename
0,BERT Fine Tuning Model,1,preprocessed,BERT Finetuning Model Using,Baseline 1 Dataset,bert_b1_20200410-1646.json,img_bert_b1_20200410-1646
1,BERT Fine Tuning Model,1,downsampled,BERT Finetuning Model Using,Baseline 1 Downsampled Dataset,bert_b1_downsampled_20200409-2149.json,img_bert_b1_downsampled_20200409-2149
2,BERT Fine Tuning Model,1,smote,BERT Finetuning Model Using,Baseline 1 SMOTE Dataset,bert_b1_smote_20200412-1126.json,img_bert_b1_smote_20200412-1126
3,BERT Fine Tuning Model,2,preprocessed,BERT Finetuning Model Using,Baseline 2 Dataset,bert_b2_20200410-1211.json,img_bert_b2_20200410-1211
4,BERT Fine Tuning Model,2,downsampled,BERT Finetuning Model Using,Baseline 2 Downsampled Dataset,bert_b2_downsampled_20200410-2208.json,img_bert_b2_downsampled_20200410-2208
5,BERT Fine Tuning Model,2,smote,BERT Finetuning Model Using,Baseline 2 SMOTE Dataset,bert_b2_smote_20200411-2154.json,img_bert_b2_smote_20200411-2154
6,BERT Frozen Weights with FF Dense NN Model,1,preprocessed,BERT Frozen Weights with FF Dense NN Using,Baseline 1 Dataset,ffnn_b1_20200412-0905.json,img_ffnn_b1_20200412-0905
7,BERT Frozen Weights with FF Dense NN Model,1,downsampled,BERT Frozen Weights with FF Dense NN Using,Baseline 1 Downsampled Dataset,ffnn_b1_downsampled_20200411-2344.json,img_ffnn_b1_downsampled_20200411-2344
8,BERT Frozen Weights with FF Dense NN Model,1,smote,BERT Frozen Weights with FF Dense NN Using,Baseline 1 SMOTE Dataset,ffnn_b1_smote_20200413-0448.json,img_ffnn_b1_smote_20200413-0448
9,BERT Frozen Weights with FF Dense NN Model,2,preprocessed,BERT Frozen Weights with FF Dense NN Using,Baseline 2 Dataset,ffnn_b2_20200413-0907.json,img_ffnn_b2_20200413-0907


In [0]:
# Drop placeholder rows whose prediction file is still marked "to be updated",
# then renumber the rows 0..n-1. `drop=True` discards the old index rather than
# inserting it as a new column, so re-running this cell leaves `df` unchanged
# (the in-place form added another index column on every run).
df = df[df.pred_file != '***** TBU *****'].reset_index(drop=True)

In [0]:
#i = 15
#title_l1 = df.title_l1[i]
#title_l2 = df.title_l2[i]
#pred_file = df.pred_file[i]
#img_filepath = pred_path + df.img_filename[i]

#with open(pred_path + pred_file,'r') as f:
#  json_data = json.load(f)

##print(json_data.keys())
#y_pred = np.asarray(json_data['predictions'])
#y_true = np.asarray(json_data['true_labels'])
##print(y_pred.shape)
##print(y_true.shape)

#plt_roc_auc(y_true, y_pred, title_l1, title_l2, img_filepath)
#print()
#print(title_l1, "\t", title_l2, "\t", df.img_filename[i])

#y_pred = y_pred.argmax(axis=1)
#y_true = y_true.argmax(axis=1)

#plot_cm_pct(y_true, y_pred, title_l1, title_l2, img_filepath)
#print(title_l1, "\t", title_l2, "\t", df.img_filename[i])


BERT Frozen Weights with FF CNN to a Dense NN Using 	 Baseline 2 Dataset 	 img_ffcnn_b2_20200417-1929
BERT Frozen Weights with FF CNN to a Dense NN Using 	 Baseline 2 Dataset 	 img_ffcnn_b2_20200417-1929


In [0]:
# For every run in `df`: load its prediction file, save the ROC figure, then
# collapse the per-class columns to class indices and save the confusion matrix.
# Rows are iterated positionally (zip over the columns) so the loop does not
# depend on the DataFrame index being exactly 0..n-1.
for title_l1, title_l2, pred_file, img_filename in zip(
    df.title_l1, df.title_l2, df.pred_file, df.img_filename):
  img_filepath = pred_path + img_filename

  with open(pred_path + pred_file, 'r') as pred_fh:
    json_data = json.load(pred_fh)

  # Both arrays are indexed as [sample, class] by plt_roc_auc.
  y_pred = np.asarray(json_data['predictions'])
  y_true = np.asarray(json_data['true_labels'])

  plt_roc_auc(y_true, y_pred, title_l1, title_l2, img_filepath)
  # Progress line after the ROC figure has been written.
  print()
  print(title_l1, "\t", title_l2, "\t", img_filename)

  # Column with the largest value per row -> class index.
  y_pred = y_pred.argmax(axis=1)
  y_true = y_true.argmax(axis=1)

  plot_cm_pct(y_true, y_pred, title_l1, title_l2, img_filepath)
  # Progress line after the confusion-matrix figure has been written.
  print(title_l1, "\t", title_l2, "\t", img_filename)


BERT Finetuning Model Using 	 Baseline 1 Dataset 	 img_bert_b1_20200410-1646
BERT Finetuning Model Using 	 Baseline 1 Dataset 	 img_bert_b1_20200410-1646

BERT Finetuning Model Using 	 Baseline 1 Downsampled Dataset 	 img_bert_b1_downsampled_20200409-2149
BERT Finetuning Model Using 	 Baseline 1 Downsampled Dataset 	 img_bert_b1_downsampled_20200409-2149

BERT Finetuning Model Using 	 Baseline 1 SMOTE Dataset 	 img_bert_b1_smote_20200412-1126
BERT Finetuning Model Using 	 Baseline 1 SMOTE Dataset 	 img_bert_b1_smote_20200412-1126

BERT Finetuning Model Using 	 Baseline 2 Dataset 	 img_bert_b2_20200410-1211
BERT Finetuning Model Using 	 Baseline 2 Dataset 	 img_bert_b2_20200410-1211

BERT Finetuning Model Using 	 Baseline 2 Downsampled Dataset 	 img_bert_b2_downsampled_20200410-2208
BERT Finetuning Model Using 	 Baseline 2 Downsampled Dataset 	 img_bert_b2_downsampled_20200410-2208

BERT Finetuning Model Using 	 Baseline 2 SMOTE Dataset 	 img_bert_b2_smote_20200411-2154
BERT Finetuning

In [0]:
#

In [0]:
#import warnings
#warnings.filterwarnings('always')

import sys

# Index labels of the `df` rows to write a classification report for.
# NOTE: the summary file is opened in 'w' mode, so every run REPLACES its
# contents with the reports produced here. Use `range(len(df))` to regenerate
# the report for every model instead of only the row(s) listed.
report_rows = [15]

orig_stdout = sys.stdout
# The summary file has its own name and its own `with` block so the handle used
# for the JSON files below can never shadow it; it is closed on every path.
with open(pred_path + 'all_summary_results.txt', 'w') as summary_file:
  sys.stdout = summary_file
  try:
    for i in report_rows:
      title_l1 = df.title_l1[i]
      title_l2 = df.title_l2[i]
      pred_file = df.pred_file[i]
      img_filepath = pred_path + df.img_filename[i]

      with open(pred_path + pred_file, 'r') as pred_fh:
        json_data = json.load(pred_fh)

      y_pred = np.asarray(json_data['predictions'])
      y_true = np.asarray(json_data['true_labels'])

      # Column with the largest value per row -> class index.
      y_pred = y_pred.argmax(axis=1)
      y_true = y_true.argmax(axis=1)

      print(title_l1)
      print(title_l2)
      print(classification_report(y_true, y_pred, digits=4))
      print()
      print("-"*80)
  finally:
    # Always give the notebook its stdout back, even if a report fails.
    sys.stdout = orig_stdout

  _warn_prf(average, modifier, msg_start, len(result))
