<a href="https://colab.research.google.com/github/mihayy/review_aspect_score_prediction/blob/master/models/ulmfit_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ULMFiT model

### Load fast.ai

In [0]:
# Set up fast.ai on Colab by piping the remote setup script straight into bash.
# NOTE(review): the script is fetched unpinned and executed unverified — consider a pinned `%pip install fastai==<version>` instead.
!curl https://course.fast.ai/setup/colab | bash

In [0]:
from fastai import *
from fastai.train import *

from fastai.text import *

from functools import partial
from pathlib import Path

### Mount Google Drive

In [0]:
# Mount Google Drive at ./gdrive; Config builds its working directory from this relative mount point.
from google.colab import drive
drive.mount('gdrive')

### Config

In [0]:
class Config:
  """Paths, file names and aspect labels shared by every cell of the notebook.

  Every location hangs off ``gdrive_working_dir``. The active dataset variant is a
  sub-folder of ``<working dir>/data`` and is switched with
  ``set_dataset_intermediate_path``; the active train/dev split files are switched
  with ``set_ds_fname``.
  """

  def __init__(self):
    self.gdrive_working_dir = Path("gdrive/My Drive/thesis")
    self.embeddings_path = self.gdrive_working_dir / "embeddings"
    self.stats_path = self.gdrive_working_dir / "ulmfit_stats_random"

    # Start on the root data folder; sets dataset_intermediate_path and dataset_path.
    self.set_dataset_intermediate_path(Path(""))

    # Default split files, used until set_ds_fname() selects a numbered split.
    self.train_dataset_filename = "train_dataset.csv"
    self.dev_dataset_filename = "dev_dataset.csv"
    self.test_dataset_filename = "acl_dev_test.csv"

    # Prefixes of the numbered split files ("<prefix><iteration>.csv").
    self.train_ds = "train_ds_"
    self.test_ds = "test_ds_"

    self.aspects = [
      "RECOMMENDATION",
      "REVIEWER_CONFIDENCE",
      "SOUNDNESS_CORRECTNESS",
      "IMPACT",
      "SUBSTANCE",
      "CLARITY",
      "ORIGINALITY",
    ]

  def set_dataset_intermediate_path(self, intermediate_path):
    """Select a dataset variant and recompute ``dataset_path`` for it."""
    data_root = self.gdrive_working_dir / "data"
    self.dataset_intermediate_path = intermediate_path
    self.dataset_path = data_root / intermediate_path

  def set_ds_fname(self, iteration):
    """Point the train/dev file names at the numbered split ``iteration``.

    Note that the dev file name is built from the ``test_ds`` prefix.
    """
    self.train_dataset_filename = f"{self.train_ds}{iteration}.csv"
    self.dev_dataset_filename = f"{self.test_ds}{iteration}.csv"

In [0]:
# Single shared configuration object; later cells mutate it in place to switch dataset folder and split files.
env_var = Config()

In [0]:
# Names used when saving/loading the fine-tuned LM encoder and the LM DataBunch pickle.
lm_encoder = "lm_encoder"
lm_ds_pkl = "lm_ds.pkl"

# Dataset variants (sub-folders of the data directory) that the experiment loops iterate over.
# The LM training cell treats positions 1 and 11 of this list specially, so the order matters.
# NOTE(review): "aug_stren_weak_sections_len_limit" below is spelled "aug_strength_weak_sections_len_limit"
# in dataset_intermediate_paths_grouped — confirm which folder name actually exists.
dataset_intermediate_paths =[Path(""),
                             Path("strength_weak_sections"),
                             Path("manual_labeled_strength_weak_sections"),
                             Path("manual_labeled_strength_weak_sections/first_section"),
                             Path("manual_labeled_strength_weak_sections/strength_weak_sections"),
                             Path("manual_labeled_strength_weak_sections/strength_weak_sections_len_limit"),
                             Path("manual_labeled_strength_weak_sections/summary_strength_weak_sections"),
                             Path("manual_labeled_strength_weak_sections/summary_strength_weak_sections/clarity_sentences"),
                             Path("manual_labeled_strength_weak_sections/weak_section"),
                             Path("manual_labeled_strength_weak_sections/weak_strength_sections_len_limit"),
                             Path("manual_labeled_strength_weak_sections/aug_stren_weak_sections_len_limit"),
                             Path("manual_labeled_strength_weak_sections/strength_section"),
                             Path("acl_abstracts")
                            ]

# dataset_intermediate_paths =[Path("acl_abstracts")]

## Dataset splits

In [0]:
# from sklearn.model_selection import train_test_split

# for path_idx, ds_inter_path in enumerate(dataset_intermediate_paths):
#   print(ds_inter_path)
#   env_var.set_dataset_intermediate_path(ds_inter_path)

#   df = pd.read_csv(env_var.dataset_path / "acl_reviews.csv")
  
#   for run_idx in range(10):
#     train_fname = "train_ds_" + str(run_idx) + ".csv"
#     test_fname = "test_ds_" + str(run_idx) + ".csv"

#     train_set, test_set = train_test_split(df, test_size=0.2, random_state=run_idx)
#     train_set.to_csv(env_var.dataset_path / train_fname, index=False)
#     test_set.to_csv(env_var.dataset_path / test_fname, index=False)

In [0]:
# for path_idx, ds_inter_path in enumerate(dataset_intermediate_paths):
#   print(ds_inter_path)
#   env_var.set_dataset_intermediate_path(ds_inter_path)
  
#   lm_df = get_lm_df(env_var)
# #   lm_df.to_csv(env_var.dataset_path / "acl_reviews.csv", index=False)

## Language Model

### Create LM DS

In [0]:
def get_lm_df(env_var):
  """Return the train, dev and test CSVs of the active dataset stacked into one DataFrame.

  Args:
    env_var: object exposing ``dataset_path`` plus the ``train_dataset_filename``,
      ``dev_dataset_filename`` and ``test_dataset_filename`` attributes.

  Returns:
    The three frames concatenated in train, dev, test order; each frame keeps
    its own row index.
  """
  split_filenames = (
    env_var.train_dataset_filename,
    env_var.dev_dataset_filename,
    env_var.test_dataset_filename,
  )
  frames = [pd.read_csv(env_var.dataset_path / fname) for fname in split_filenames]
  return pd.concat(frames)

In [0]:
# Build the language-model DataBunch for every dataset variant and cache it as `lm_ds_pkl`
# in the dataset folder. The training cell below reads that file with load_data(), so a
# fresh "Restart & Run All" needs this cell to write it.
for path_idx, ds_inter_path in enumerate(dataset_intermediate_paths):
  print(ds_inter_path)
  env_var.set_dataset_intermediate_path(ds_inter_path)

  if (env_var.dataset_path / lm_ds_pkl).is_file():
    # Keep an existing cache: rebuilding draws a new random train/valid split
    # (and presumably a new vocab), which may no longer match an encoder saved earlier.
    continue

  lm_df = get_lm_df(env_var)

  data_lm = (TextList.from_df(lm_df, env_var.dataset_path, cols='comments')
                     .split_by_rand_pct()
                     .label_for_lm()
                     .databunch())

  data_lm.save(lm_ds_pkl)

### Train LM

In [0]:
# Fine-tune the AWD-LSTM language model on every dataset variant and save its encoder.
# After unfreezing, each variant runs a list of (epochs, learning rate) one-cycle stages;
# the table is keyed by the variant's position in dataset_intermediate_paths and any
# position not listed falls back to the default schedule.
lm_unfrozen_cycles = {
  11: [(10, 2e-3), (5, 2e-4)],
  1: [(7, 2e-3), (5, 2e-4)],
}
lm_default_unfrozen_cycles = [(5, 2e-3)]

for ds_idx, ds_inter_path in enumerate(dataset_intermediate_paths):
  env_var.set_dataset_intermediate_path(ds_inter_path)
  print(env_var.dataset_path)

  data_lm = load_data(env_var.dataset_path, lm_ds_pkl)
  learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.5, path=env_var.dataset_path)
  lm_acc_cb = callbacks.SaveModelCallback(learn, every="improvement", monitor='accuracy', name='lm_acc_cb')

  # One cycle at a high learning rate before unfreeze(); this cycle is not checkpointed.
  learn.fit_one_cycle(1, 5e-2, moms=(0.8,0.7))
  learn.unfreeze()

  for cycle_epochs, cycle_lr in lm_unfrozen_cycles.get(ds_idx, lm_default_unfrozen_cycles):
    learn.fit_one_cycle(cycle_epochs, cycle_lr, moms=(0.8,0.7), callbacks=[lm_acc_cb])

  learn.save_encoder(lm_encoder)

In [0]:
# Re-create a learner per dataset variant, load the fine-tuned encoder saved above, and
# collect the output of learn.validate() on the LM validation loader (one entry per variant,
# in dataset_intermediate_paths order).
lm_results = []

for ds_inter_path in dataset_intermediate_paths:
  print(ds_inter_path)
  env_var.set_dataset_intermediate_path(ds_inter_path)
  data_lm = load_data(env_var.dataset_path, lm_ds_pkl)  
  learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.5, path=env_var.dataset_path)
  learn.load_encoder(lm_encoder)
  lm_results.append(learn.validate(learn.data.valid_dl))

In [0]:
# Display the collected LM validation results (same order as dataset_intermediate_paths).
lm_results

## Classifier

In [0]:
def prepare_for_train_valid_split(env_var):
  """Load the active train and dev CSVs and tag each row with an ``is_valid`` flag.

  Args:
    env_var: object exposing ``dataset_path``, ``train_dataset_filename`` and
      ``dev_dataset_filename``.

  Returns:
    One DataFrame with the train rows (``is_valid`` False) followed by the dev
    rows (``is_valid`` True); each part keeps its own row index.
  """
  data_dir = env_var.dataset_path
  train_part = pd.read_csv(data_dir / env_var.train_dataset_filename).assign(is_valid=False)
  dev_part = pd.read_csv(data_dir / env_var.dev_dataset_filename).assign(is_valid=True)
  return pd.concat([train_part, dev_part])

In [0]:
class Preproc_config:
  """Batch sizes and label set used when building the classifier data."""

  def __init__(self):
    # Label classes handed to label_from_df(): the integer scores 1 through 5.
    self.classes = list(range(1, 6))
    # Batch size for the combined train/dev DataBunch.
    self.train_dev_bs = 40
    self.test_bs = 27

In [0]:
class Trian_config:
  """Training-loop settings: number of runs, checkpoint names and the stats pickle path.

  Each checkpoint has a bare name (passed to SaveModelCallback) and the matching
  ``.pth`` file name (used when the temporary checkpoint files are removed).
  """

  def __init__(self):
    # Number of train/dev splits trained per (dataset variant, aspect) pair.
    self.runs = 10

    self.best_loss_cb_name = "best_loss_cb"
    self.best_loss_cb_1_name = "best_loss_cb_1"
    self.best_loss_cb_2_name = "best_loss_cb_2"

    self.best_loss_cb_name_pth = f"{self.best_loss_cb_name}.pth"
    self.best_loss_cb_1_name_pth = f"{self.best_loss_cb_1_name}.pth"
    self.best_loss_cb_2_name_pth = f"{self.best_loss_cb_2_name}.pth"

    # Read from the notebook-level `env_var` at construction time.
    self.stats_runs_path = env_var.stats_path / "ulmfit_stats.p"

In [0]:
def preprocessing_pipeline(aspect_idx, preproc_config):
  """Build the classifier DataBunch for one aspect of the active dataset split.

  Reads the notebook-level `env_var` for the dataset folder and split files, and
  reuses the vocabulary of the saved language-model data (`lm_ds_pkl`).

  Args:
    aspect_idx: position of the label column in the train/dev DataFrame.
    preproc_config: provides `classes` (label set) and `train_dev_bs` (batch size).

  Returns:
    The DataBunch produced by the fastai data-block chain.
  """
  data_lm = load_data(env_var.dataset_path, lm_ds_pkl)

  train_dev_ds_df = prepare_for_train_valid_split(env_var)

  # Split on the 'is_valid' column by name rather than by a fixed position, so the split
  # stays correct if a CSV has a different number of columns.
  data = (TextList.from_df(path=env_var.dataset_path, df=train_dev_ds_df, cols='comments', vocab=data_lm.vocab)
                  .split_from_df(col='is_valid')
                  .label_from_df(cols=aspect_idx, classes=preproc_config.classes)
                  .databunch(bs=preproc_config.train_dev_bs))

  return data

In [0]:
def get_current_epoch(learn):
  """Return the 1-based position of the lowest validation loss in `learn.recorder.val_losses`."""
  recorded_val_losses = learn.recorder.val_losses
  best_zero_based = np.argmin(recorded_val_losses)
  return best_zero_based + 1

def trainer(data, model_stats, trian_config, iteration):
  """Train one classifier run with a fixed multi-stage schedule and return its best `Stat`.

  Stages (all one-cycle, all after `unfreeze()`):
    1. 2 epochs at 1e-2, checkpointed under `trian_config.best_loss_cb_name`.
    2. 10 epochs at 5e-3, checkpointed under `trian_config.best_loss_cb_1_name`.
    3. Reload the stage-1 checkpoint, then 10 epochs at 8e-3, checkpointed under
       `trian_config.best_loss_cb_2_name`. Whichever of stages 2 and 3 reached the
       lower best validation loss is kept.
    4. 3 epochs at 1e-3; undone (the kept checkpoint is reloaded) when the kept stage
       had a strictly lower best validation loss.
    5. 6 epochs at 1e-4.

  Args:
    data: classifier DataBunch.
    model_stats: object with `aspect_label_name` and `check_for_best_model`
      (called indirectly through `get_best_cycle_stats`).
    trian_config: provides the three checkpoint names.
    iteration: index of the current run; stored on the returned `Stat`.

  Returns:
    The `Stat` with the highest `dev_accuracy` among the recorded stages.

  Relies on the notebook-level `env_var` and `lm_encoder`.
  """
  moms = (0.8,0.7)

  learn = text_classifier_learner(data, AWD_LSTM, drop_mult=0.5, path=env_var.dataset_path, silent=True)
  save_best_loss_cb = callbacks.SaveModelCallback(learn, every="improvement", monitor='valid_loss', name=trian_config.best_loss_cb_name)
  save_best_loss_cb_1 = callbacks.SaveModelCallback(learn, every="improvement", monitor='valid_loss', name=trian_config.best_loss_cb_1_name)
  save_best_loss_cb_2 = callbacks.SaveModelCallback(learn, every="improvement", monitor='valid_loss', name=trian_config.best_loss_cb_2_name)

  learn.load_encoder(lm_encoder)

  learn.unfreeze()

  # Stage 1.
  learn.fit_one_cycle(2, 1e-2, moms=moms, callbacks=[save_best_loss_cb])

  # Stages 2 and 3 both start right after the two stage-1 epochs, so their first epoch is number 3.
  start_epoch = 3

  # Stage 2.
  learn.fit_one_cycle(10, 5e-3, moms=moms, callbacks=[save_best_loss_cb_1])

  stat = get_best_cycle_stats(learn, model_stats, start_epoch, iteration)
  # Capture this cycle's best epoch now: the recorder history is replaced by the next
  # fit call, so it cannot be recovered later if this cycle turns out to be the winner.
  first_cycle_best_epoch = get_current_epoch(learn)

  # Stage 3: restart from the stage-1 checkpoint with a higher learning rate.
  learn.load(save_best_loss_cb.name)

  learn.fit_one_cycle(10, 8e-3, moms=moms, callbacks=[save_best_loss_cb_2])

  curr_best_cb = save_best_loss_cb_2
  kept_cycle_best_epoch = get_current_epoch(learn)

  if save_best_loss_cb_1.best < save_best_loss_cb_2.best:
    print("First is better")
    learn.load(save_best_loss_cb_1.name)
    curr_best_cb = save_best_loss_cb_1
    kept_cycle_best_epoch = first_cycle_best_epoch
  else:
    stat = get_best_cycle_stats(learn, model_stats, start_epoch, iteration)

  print(env_var.dataset_path, model_stats.aspect_label_name)

  print(stat)

  stats = [stat]

  # Later stages continue from the kept cycle's best epoch.
  start_epoch += kept_cycle_best_epoch

  print(save_best_loss_cb_1.best, save_best_loss_cb_2.best)

  # Stage 4. The stage-1 callback is reused; its `best` is presumably reset when
  # training begins — TODO confirm against the installed fastai version.
  learn.fit_one_cycle(3, 1e-3, moms=moms, callbacks=[save_best_loss_cb])

  if curr_best_cb.best < save_best_loss_cb.best:
    print(curr_best_cb.best, save_best_loss_cb.best)
    print("undo 1e-3 cycle")
    learn.load(curr_best_cb.name)
  else:
    stat_1 = get_best_cycle_stats(learn, model_stats, start_epoch, iteration)
    stats.append(stat_1)
    start_epoch += get_current_epoch(learn)
    print(stat_1)

  # Stage 5: its stats are always recorded, whether or not it improved on earlier stages.
  learn.fit_one_cycle(6, 1e-4, moms=moms, callbacks=[save_best_loss_cb])

  stat_2 = get_best_cycle_stats(learn, model_stats, start_epoch, iteration)
  stats.append(stat_2)

  best_stat = max(stats, key=operator.attrgetter('dev_accuracy'))
  print("BEST THING")
  print(iteration, best_stat)

  learn.destroy()
  return best_stat

In [0]:
class Stat:
  """Metrics of one selected epoch of a training cycle.

  Holds the reported epoch number, dev accuracy, dev loss, dev confusion matrix
  and the index of the run (``iteration``) the epoch belongs to.
  """

  def __init__(self, epoch, dev_accuracy, dev_loss, dev_conf_matrix, iteration):
    self.iteration = iteration
    self.epoch = epoch
    self.dev_loss = dev_loss
    self.dev_accuracy = dev_accuracy
    self.dev_conf_matrix = dev_conf_matrix

  def __str__(self):
    # The printed labels say "Test" although the values are the dev metrics.
    return '{} Test accuracy: {} Test loss: {}'.format(self.epoch, self.dev_accuracy, self.dev_loss)

# Number of epochs on each side of the best-loss epoch that get_best_cycle_stats searches for a higher accuracy.
nr_epoch_neigh = 3

def get_conf_matrices(learn, test_dataset):
  """Return the confusion matrices of `learn` on its validation data and on `test_dataset`.

  The second matrix is obtained by temporarily replacing `learn.data.valid_dl` with
  `test_dataset.train_dl`; the original loader is restored even if interpretation raises.

  Args:
    learn: learner exposing `interpret()` and `data.valid_dl`.
    test_dataset: object whose `train_dl` holds the test examples
      (NOTE(review): the train loader is used as-is — confirm it is unshuffled and complete).

  Returns:
    Tuple `(dev_conf_matrix, test_conf_matrix)`.
  """
  dev_conf_matrix = learn.interpret().confusion_matrix()

  initial_valid_dl = learn.data.valid_dl
  learn.data.valid_dl = test_dataset.train_dl
  try:
    test_conf_matrix = learn.interpret().confusion_matrix()
  finally:
    # Never leave the learner pointing at the test loader.
    learn.data.valid_dl = initial_valid_dl

  return dev_conf_matrix, test_conf_matrix

def get_best_cycle_stats(learn, model_stats, start_epoch=0, iteration=0):
  """Summarise the training cycle currently held in `learn.recorder` as a `Stat`.

  The epoch with the lowest validation loss is located first and reported to
  `model_stats.check_for_best_model`. The reported epoch is then the one with the highest
  accuracy within `nr_epoch_neigh` epochs of it; if that accuracy merely equals the
  best-loss epoch's accuracy, the best-loss epoch itself is reported.

  Args:
    learn: learner whose recorder holds `val_losses` and `metrics` for the cycle.
    model_stats: object exposing `check_for_best_model(learn, loss, accuracy, iteration)`.
    start_epoch: offset added to the 0-based epoch index of the cycle.
    iteration: run index stored on the returned `Stat`.

  Returns:
    A `Stat`. Its confusion matrix is computed from `learn` only when the reported epoch is
    the best-loss epoch; otherwise it is an empty list.
  """
  dev_loss_stats = learn.recorder.val_losses
  # np.squeeze turns a one-epoch, one-metric history into a 0-d array that cannot be
  # indexed; atleast_1d keeps single-epoch cycles working.
  dev_acc_stats = np.atleast_1d(np.squeeze(learn.recorder.metrics))

  best_loss_idx = np.argmin(dev_loss_stats)
  best_loss = dev_loss_stats[best_loss_idx]
  best_loss_acc = dev_acc_stats[best_loss_idx]

  model_stats.check_for_best_model(learn, best_loss, best_loss_acc, iteration)

  # Window of neighbouring epochs, clipped at the start of the cycle
  # (slicing clips the right edge).
  left_window = max(best_loss_idx - nr_epoch_neigh, 0)
  right_window = best_loss_idx + 1 + nr_epoch_neigh

  best_epoch_idx = left_window + np.argmax(dev_acc_stats[left_window:right_window])

  # A neighbour that only ties on accuracy does not displace the best-loss epoch.
  if best_loss_acc == dev_acc_stats[best_epoch_idx]:
    best_epoch_idx = best_loss_idx

  dev_conf_matrix = []
  if best_epoch_idx == best_loss_idx:
    dev_conf_matrix = learn.interpret().confusion_matrix()

  dev_loss = dev_loss_stats[best_epoch_idx]
  dev_acc = dev_acc_stats[best_epoch_idx]

  return Stat(best_epoch_idx + start_epoch, dev_acc, dev_loss, dev_conf_matrix, iteration)

In [0]:
import operator

class Model_stats(object):
  """Per-(dataset variant, aspect) record of run statistics and of the best saved model.

  `stats` collects one `Stat`-like object per run (appended by the caller);
  `compute_mean_metrics` aggregates them. `check_for_best_model` saves the learner
  whenever it beats the best model seen so far.
  """

  def __init__(self, dataset_intermediate_path, aspect_label_name):
    self.dataset_intermediate_path = dataset_intermediate_path
    self.aspect_label_name = aspect_label_name
    self.stats = []

    # Name under which the best model for this aspect is saved via learn.save().
    self.best_model_path = "random_best_" + aspect_label_name
    self.best_saved_model_dev_acc = 0
    # No model saved yet, so any real loss must compare as better.
    self.best_saved_model_dev_loss = float("inf")
    self.best_iteration = 0

  def check_for_best_model(self, learn, dev_loss, dev_accuracy, iteration):
    """Save `learn` if it beats the best model so far.

    A candidate wins with a strictly higher dev accuracy, or with an equal dev
    accuracy and a strictly lower dev loss.
    """
    higher_accuracy = dev_accuracy > self.best_saved_model_dev_acc
    same_accuracy_lower_loss = (dev_accuracy == self.best_saved_model_dev_acc
                                and dev_loss < self.best_saved_model_dev_loss)

    if higher_accuracy or same_accuracy_lower_loss:
      learn.save(self.best_model_path)
      self.best_saved_model_dev_acc = dev_accuracy
      self.best_saved_model_dev_loss = dev_loss
      self.best_iteration = iteration

  def compute_mean_metrics(self):
    """Aggregate `self.stats` into mean/std/top-3 accuracy and the best run's confusion matrix.

    Requires at least one entry in `self.stats`.
    """
    top_3_dev_acc_stats = sorted(self.stats, key=operator.attrgetter('dev_accuracy'), reverse=True)[:3]
    dev_accuracies = [stat.dev_accuracy for stat in self.stats]

    self.mean_convergence_epoch = np.mean([stat.epoch for stat in self.stats])

    self.mean_dev_accuracy = np.mean(dev_accuracies)

    self.std_dev_accuracy = np.std(dev_accuracies)

    self.top_3_mean_dev_accuracy = np.mean([stat.dev_accuracy for stat in top_3_dev_acc_stats])

    self.best_dev_acc_stats = top_3_dev_acc_stats[0]
    self.best_epoch_dev_conf_matrix = self.best_dev_acc_stats.dev_conf_matrix

In [0]:
import pickle

class Stats_Manager(object):
  """Persists the list of finished `Model_stats` objects and supports resuming.

  The list is pickled to `stats_runs_path` after every finished (dataset, aspect)
  pair; on restart the last entry tells which pairs can be skipped.
  """

  def __init__(self, stats_runs_path):
    self.stats_runs_path = stats_runs_path

  def create_or_restore_runs_stats_obj(self, dataset_intermediate_paths, aspect_label_names):
    """Load previously saved stats, if any, and locate the last finished pair.

    Sets `models_stats`, `should_restore_to_last_run` (number of restored entries,
    used as a truth value) and, when restoring, `last_path_idx` / `last_aspect_idx`.

    Raises:
      ValueError: if the last saved entry refers to a path or aspect that is not in
        the given lists.
    """
    self.models_stats = []

    if self.stats_runs_path.is_file():
      # NOTE: pickle.load can execute arbitrary code — only point this at stats files
      # written by this notebook.
      with open(self.stats_runs_path, "rb") as input_file:
        self.models_stats = pickle.load(input_file)

    self.should_restore_to_last_run = len(self.models_stats)

    if self.should_restore_to_last_run:
      last_run = self.models_stats[-1]

      self.last_path_idx = dataset_intermediate_paths.index(last_run.dataset_intermediate_path)
      self.last_aspect_idx = aspect_label_names.index(last_run.aspect_label_name)

  def should_skip_current_run(self, path_idx=None, aspect_idx=None):
    """Tell whether the given (dataset, aspect) pair was already finished in a restored session.

    Args:
      path_idx: position of the dataset variant; when omitted, the module-level
        `path_idx` loop variable is used (the original behaviour).
      aspect_idx: position of the aspect; when omitted, the module-level `aspect_idx`.

    Returns:
      True when the pair is at or before the last restored pair, False otherwise
      (always False when nothing was restored).
    """
    if not self.should_restore_to_last_run:
      return False

    if path_idx is None:
      path_idx = globals()["path_idx"]
    if aspect_idx is None:
      aspect_idx = globals()["aspect_idx"]

    if path_idx < self.last_path_idx:
      return True
    return path_idx == self.last_path_idx and aspect_idx <= self.last_aspect_idx

  def update(self, model_stats):
    """Append `model_stats` and rewrite the stats pickle."""
    self.models_stats.append(model_stats)
    # A missing stats folder must not lose the results of a finished training run.
    self.stats_runs_path.parent.mkdir(parents=True, exist_ok=True)
    with open(self.stats_runs_path, "wb") as output_file:
      pickle.dump(self.models_stats, output_file)

In [0]:
import torch

def regulate_cuda_memory():
  """Run Python garbage collection, then release PyTorch's unused cached CUDA memory."""
  # Imported here so the function does not depend on `gc` leaking in through a star import.
  import gc

  # Collect first: tensors that are only kept alive by reference cycles must be freed
  # before empty_cache() can hand their blocks back.
  gc.collect()
  torch.cuda.empty_cache()

## Main

In [0]:
import os

def remove_tmp_cb(path):
  """Delete the temporary checkpoint at `path` (a `pathlib.Path`) if it is an existing file."""
  if not path.is_file():
    return
  path.unlink()

In [0]:
# Main experiment driver: for every dataset variant and every aspect, train
# `trian_config.runs` classifiers (one per numbered train/dev split), aggregate their
# statistics and persist them so an interrupted session can resume.
trian_config = Trian_config()

stats_manager = Stats_Manager(trian_config.stats_runs_path)
stats_manager.create_or_restore_runs_stats_obj(dataset_intermediate_paths, env_var.aspects)

preproc_config = Preproc_config()
  
# NOTE: the loop variables `path_idx` and `aspect_idx` must keep these exact names at
# notebook top level — Stats_Manager.should_skip_current_run() reads them as globals.
for path_idx, ds_inter_path in enumerate(dataset_intermediate_paths):
  
  env_var.set_dataset_intermediate_path(ds_inter_path)
  
  print("DS_PATH:", env_var.dataset_path)
    
  for aspect_idx, aspect_label_name in enumerate(env_var.aspects):
    print(env_var.dataset_path, aspect_label_name)

    # Pairs already present in the restored stats file are not retrained.
    if stats_manager.should_skip_current_run():
      continue
      
    model_stats = Model_stats(ds_inter_path, aspect_label_name)

    for iteration in range(trian_config.runs):
      # Select the train/dev split files numbered `iteration`.
      env_var.set_ds_fname(iteration)
      
      print(env_var.train_dataset_filename, env_var.dev_dataset_filename)
      
      # The label column is addressed by position; presumably the aspect columns follow
      # the text column in env_var.aspects order — TODO confirm against the CSV schema.
      data = preprocessing_pipeline(aspect_idx + 1, preproc_config)
      
      best_epoch_stats = trainer(data, model_stats, trian_config, iteration)
      
      model_stats.stats.append(best_epoch_stats)

    # Aggregate the runs of this aspect and write progress to disk before moving on.
    model_stats.compute_mean_metrics()
    stats_manager.update(model_stats)
    regulate_cuda_memory()
    
  # Once all aspects of this dataset variant are done, delete the temporary checkpoints
  # from its "models" folder.
  remove_tmp_cb(env_var.dataset_path / "models" / trian_config.best_loss_cb_name_pth)  
  remove_tmp_cb(env_var.dataset_path / "models" / trian_config.best_loss_cb_1_name_pth)  
  remove_tmp_cb(env_var.dataset_path / "models" / trian_config.best_loss_cb_2_name_pth)

## Simple stats

In [0]:
# Dataset variants in the order used for the length statistics and the bar chart below.
# NOTE(review): 'aug_strength_weak_sections_len_limit' here is spelled
# 'aug_stren_weak_sections_len_limit' in dataset_intermediate_paths — confirm which folder exists.
dataset_intermediate_paths_grouped = [Path('.'),
 Path('manual_labeled_strength_weak_sections'),
 Path('strength_weak_sections'),
 Path('manual_labeled_strength_weak_sections/summary_strength_weak_sections'),
 Path('manual_labeled_strength_weak_sections/strength_weak_sections'),
 Path('manual_labeled_strength_weak_sections/first_section'),
 Path('manual_labeled_strength_weak_sections/weak_section'),
 Path('manual_labeled_strength_weak_sections/strength_section'),
 Path('manual_labeled_strength_weak_sections/strength_weak_sections_len_limit'),
 Path('manual_labeled_strength_weak_sections/weak_strength_sections_len_limit'),
 Path('manual_labeled_strength_weak_sections/aug_strength_weak_sections_len_limit'),
 Path('manual_labeled_strength_weak_sections/summary_strength_weak_sections/clarity_sentences'),
 Path('acl_abstracts')]

# Tick labels for the bar chart; paired with dataset_intermediate_paths_grouped by position,
# so both lists must keep the same length and order.
comments_manipulations = ["comments", 
  "manual", 
  "str_weak",           
  "manual_sum_str_weak", 
  "manual_str_weak",  
  "manual_sum", 
  "manual_weak", 
  "manual_str", 
  "manual_str_weak_limit", 
  "manual_weak_str_limit", 
  "manual_limit_aug", 
  "manual_clarity", 
  "abstract"]

In [0]:
# Word-count statistics of the 'comments' column for every dataset variant, computed on
# the training file of split 0. The per-variant mean and std (truncated to int) feed the
# bar chart in the next cell.
trian_config = Trian_config()
preproc_config = Preproc_config()

env_var.set_ds_fname(0)

venue_len_mean, venue_len_std = [], []

for path_idx, ds_inter_path in enumerate(dataset_intermediate_paths_grouped):
  env_var.set_dataset_intermediate_path(ds_inter_path)

  acl_reviews = pd.read_csv(env_var.dataset_path / env_var.train_dataset_filename)
  text_column = acl_reviews['comments']

  # Length = number of whitespace-separated tokens.
  venue_reviews_len = [len(review_text.split()) for review_text in text_column]

  mean = int(np.mean(venue_reviews_len))
  std = int(np.std(venue_reviews_len))
  min_len, max_len = np.min(venue_reviews_len), np.max(venue_reviews_len)

  venue_len_mean.append(mean)
  venue_len_std.append(std)

  print(env_var.dataset_intermediate_path, ": mean", mean, "std", std, "min_len", min_len, "max_len", max_len)

In [0]:
# Bar chart of the mean content length per dataset variant, with the standard deviation
# as error bars; the figure is also written to 'aclcontentlen.jpg'.
x_pos = np.arange(len(comments_manipulations))
fig, ax = plt.subplots(figsize=(12,5))

ax.bar(
  x_pos,
  venue_len_mean,
  yerr=venue_len_std,
  align='center',
  alpha=0.5,
  ecolor='black',
  capsize=7,
)

ax.set_xticks(x_pos)
ax.set_xticklabels(comments_manipulations)
ax.set_ylabel('Content length')
ax.set_title("ACL content's length")
ax.yaxis.grid(True)

# Slant the tick labels so the variant names do not overlap.
fig.autofmt_xdate()
ax.set_xlabel("Type of content")
fig.tight_layout()
fig.savefig('aclcontentlen.jpg', dpi=300)
plt.show()

## Test

In [0]:
# Qualitative check: show the intrinsic attention over a sample sentence and predict its class.
# NOTE(review): `learn` is not created as a classifier anywhere at notebook top level —
# trainer() builds and destroys its own local learner, and the only top-level `learn` is the
# language-model learner from the LM cells. This cell presumably relied on a classifier left
# in the kernel from a removed cell; load one explicitly before running it.
import matplotlib.cm as cm

txt_ci = TextClassificationInterpretation.from_learner(learn)
test_text = "We achieve the state of the art with our method."
txt_ci.show_intrinsic_attention(test_text,cmap=cm.Purples)
learn.predict(test_text)

(Category 3, tensor(2), tensor([0.1142, 0.0325, 0.6043, 0.2002, 0.0488]))