In [2]:
import torch
import numpy as np
import os
import glob
import re
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import precision_recall_curve, auc
import warnings
from PIL import Image
import json

from pyjarowinkler import distance as jw_dist

1. Creating the training and testing learning curves over epoch:

In [3]:
dir_name_list = glob.glob("results")

for dir_name in dir_name_list:
  train_dir = os.path.join(dir_name, "train")
  test_dir = os.path.join(dir_name, "val")

  epoch_numbers = list()
  train_f_scores = list()
  test_f_scores = list()

  train_csv_list = [f for f in os.listdir(os.path.join(train_dir, str(0))) if os.path.isfile(os.path.join(train_dir, str(0), f))]
  test_csv_list = [f for f in os.listdir(os.path.join(train_dir, str(0))) if os.path.isfile(os.path.join(test_dir, str(0), f))]

  for f in train_csv_list:
    epoch_num = int(f[-7:-4])
    epoch_numbers.append(epoch_num)

  for csv_train in train_csv_list:
    df = pd.DataFrame()
    for i in range(5):
      curr_file = os.path.join(train_dir, str(i), csv_train)
      if not os.path.isfile(curr_file):
        continue
      df_curr = pd.read_csv(curr_file)
      df = pd.concat([df, df_curr])

    label = df['label'].to_numpy()
    model_score = df['model_score'].to_numpy()

    precision, recall, thresholds = precision_recall_curve(label, model_score)

    f_score_list = list()
    for pre, rec in zip(precision, recall):
      lower = pre + rec
      if lower == 0.0:
        lower = 0.00000001
      
      f_score_list.append(2 * (pre * rec) / lower)

    f_score = max(f_score_list)

    train_f_scores.append(f_score)
  
  highest_f_score = 0
  highest_f_score_idx = 0

  for csv_test, e in zip(test_csv_list, epoch_numbers):
    df = pd.DataFrame()
    for i in range(5):
      curr_file = os.path.join(test_dir, str(i), csv_test)
      df_curr = pd.read_csv(curr_file)
      df = pd.concat([df, df_curr])

    label = df['label'].to_numpy().astype(np.int8)
    model_score = df['model_score'].to_numpy()

    precision, recall, thresholds = precision_recall_curve(label, model_score)

    f_score_list = list()
    for pre, rec in zip(precision, recall):
      lower = pre + rec
      if lower == 0.0:
        lower = 0.00000001
      
      f_score_list.append(2 * (pre * rec) / lower)

    f_score = max(f_score_list)

    if f_score > highest_f_score:
      highest_f_score = f_score
      highest_f_score_idx = e

    test_f_scores.append(f_score)

  teststr = f"Highest Val Score: {str(highest_f_score)[0:5]} @ epoch {highest_f_score_idx}"
  

  print(epoch_numbers)
  plt.plot(epoch_numbers, train_f_scores, color = "red", label = "Train")
  plt.plot(epoch_numbers, test_f_scores, color = "blue", label = "Val")
  plt.legend(loc="upper left")
  plt.title(f'Run Name: {dir_name[:-1]}\n{teststr}')
  plt.ylabel('F-Score')
  plt.xlabel('Epoch Number')
  plt.savefig(os.path.join(dir_name, "train_test_curve.png"))
  plt.save("test")

FileNotFoundError: [Errno 2] No such file or directory: '__pycache__/train/0'

2. Creating a set of F-score curves

In [None]:
warnings.simplefilter('error')

dir_name_list = glob("*/")
if os.path.isfile("results.zip"):
  dir_name_list.remove("results.zip")

for dir_name in dir_name_list:
  train_dir = os.path.join(dir_name, "train")
  test_dir = os.path.join(dir_name, "test")

  train_csv_list = [f for f in os.listdir(train_dir) if os.path.isfile(os.path.join(train_dir, f))]
  test_csv_list = [f for f in os.listdir(test_dir) if os.path.isfile(os.path.join(test_dir, f))]
    
  #TEST JW
  jw_test = list()
  df = pd.read_csv(os.path.join(test_dir, test_csv_list[0]))
  name1 = df['name1']
  name2 = df['name2']
  label = df['label']

  for n1, n2 in zip(name1, name2):
    if isinstance(n1, float) or isinstance(n2, float):
      jw_test.append(0.8)
      continue
    dist = jw_dist.get_jaro_distance(n1, n2, winkler = True, scaling = 0.1)
    jw_test.append(dist)

  precision_jw, recall_jw, jw_thresholds = precision_recall_curve(label, jw_test)
  f_score_jw_list = list()
  for pre, rec in zip(precision_jw, recall_jw):
    lower = pre + rec
    if lower == 0.0:
      lower = 0.00000001
    
    f_score_jw_list.append(2 * (pre * rec) / lower)

  highest_f_score = 0
  highest_f_score_idx = 0
  highest_f_score_scores = None
  highest_f_score_precision = None
  highest_f_score_recall = None

  for csv_test in test_csv_list:
    for i in range(5):
    curr_file = os.path.join(test_dir, csv_test)
    df = pd.read_csv(curr_file)

    label = df['label']
    model_score = df['model_score']

    epoch_num = int(curr_file[-7:-4])

    df = df.to_numpy()

    model_score = model_score.to_numpy()
    label = label.to_numpy().astype(np.int8)

    precision_mod, recall_mod, thresholds = precision_recall_curve(label, model_score)

    f_score_test_list = list()
    for pre, rec in zip(precision_mod, recall_mod):
      lower = pre + rec
      if lower == 0.0:
        lower = 0.00000001
      
      f_score_test_list.append(2 * (pre * rec) / lower)

    f_score = max(f_score_test_list)

    if f_score > highest_f_score:
      highest_f_score = f_score
      highest_f_score_idx = epoch_num
      highest_f_score_scores = f_score_test_list
      highest_f_score_precision = precision_mod
      highest_f_score_recall = recall_mod

    auc_score = auc(recall_mod, precision_mod)

    teststr = f"Highest Test Score: {str(highest_f_score)[0:5]} @ epoch {highest_f_score_idx}"
    aucstr = f"AUC: {str(auc_score)[0:7]}"

    save_dir = os.path.join(dir_name, "f_curves")
    if not os.path.isdir(save_dir):
      os.mkdir(save_dir)

    plt.plot(highest_f_score_recall, highest_f_score_precision, color = "lightcoral", label = "Sia (Best)")
    plt.plot(recall_mod, precision_mod, color = "blue", label = "Sia")
    plt.plot(recall_jw, precision_jw, color = "red", label = "JW")
    plt.xlim((-0.05, 1.05))
    plt.ylim((-0.05, 1.05))
    plt.legend(loc="lower left")
    plt.title(f'Run Name: {dir_name[:-1]}\n{teststr}\n{aucstr}')
    plt.ylabel('F-Score')
    plt.xlabel('Thresholds')
    plt.savefig(os.path.join(save_dir, f"curve{str(epoch_num).zfill(3)}.png"))
    plt.figure().clear()
    plt.close()
    plt.cla()
    plt.clf()

3. Creating gifs for the changing F-score curves over epochs

In [None]:
%cd /content/drive/MyDrive/7Channel/siamese_vinden/results
warnings.simplefilter('error')

'''
dir_name_list = glob("*/")
dir_name_list = glob("*/")
if 'init_11_25/' in dir_name_list:
  dir_name_list.remove('init_11_25/')
'''
dir_name_list = ['init_41_50/', ]

for dir_name in dir_name_list:
  train_dir = os.path.join(dir_name, "train")
  test_dir = os.path.join(dir_name, "test")

  # filepaths
  fp_in = os.path.join(dir_name, "f_curves", "curve*.png")
  fp_out = os.path.join(dir_name, "F_curve.gif")

  # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#gif
  img, *imgs = [Image.open(f) for f in sorted(glob.glob(fp_in))]
  img.save(fp=fp_out, format='GIF', append_images=imgs,
          save_all=True, duration=200, loop=0)

/content/drive/MyDrive/7Channel/siamese_vinden/results


4. Creating CSV files for positive, negative and jeremy for a initalized JSON file.

In [None]:
%cd /content/drive/MyDrive/7Channel/siamese_vinden/results
f = open('../data/init_41_50.json', 'r')
 
# returns JSON object as
# a dictionary
data = json.load(f)

jer = np.concatenate(data['jeremy'])
ran = np.concatenate(data['random'])

def emb2str(emb):
    word = ""
    for char in emb:
        char = char.item()
        if char >= 30:
            continue
        word = word + chr(char + 97)
    return word

jer_list = pd.DataFrame(columns = ["name_a", "name_b"])
ran_list = pd.DataFrame(columns = ["name_a", "name_b"])

for name_a, name_b in jer:
  name_a = emb2str(name_a)
  name_b = emb2str(name_b)
  jer_list = jer_list.append({"name_a": name_a, "name_b": name_b}, ignore_index = True)

for name_a, name_b in ran:
  name_a = emb2str(name_a)
  name_b = emb2str(name_b)
  ran_list = ran_list.append({"name_a": name_a, "name_b": name_b}, ignore_index = True)

jer_list.to_csv("jeremy_pairs.csv")
ran_list.to_csv("random_pairs.csv")

# Closing file
f.close()

/content/drive/MyDrive/7Channel/siamese_vinden/results
