<a href="https://colab.research.google.com/github/mihayy/review_aspect_score_prediction/blob/master/models/baseline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Models

In [0]:
from google.colab import files
import os
from functools import partial
import numpy as np
from pathlib import Path

# Download NLTK's "punkt" tokenizer package at notebook start-up.
# NOTE(review): presumably needed by nltk.word_tokenize, which the TF-IDF
# vectorizer further down uses as its tokenizer — confirm against the NLTK docs.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [0]:
# Mount Google Drive at the relative folder "gdrive"; Config builds all of its
# data/stats paths underneath "gdrive/My Drive/thesis".
from google.colab import drive
drive.mount('gdrive')

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


In [0]:
class Config(object):
  """Paths, file names and label columns shared by the experiments.

  Everything lives under ``gdrive_working_dir``. The datasets are read from
  ``<gdrive_working_dir>/data/<dataset_intermediate_path>``; the intermediate
  path defaults to the empty path and can be switched with
  ``set_dataset_intermediate_path`` (e.g. to
  ``manual_labeled_strength_weak_sections`` or ``acl_abstracts``).
  """

  def __init__(self):
    root = Path("gdrive/My Drive/thesis")
    self.gdrive_working_dir = root

    self.embeddings_path = root / "embeddings"

    # Empty intermediate path: dataset_path is "<root>/data" itself.
    self.set_dataset_intermediate_path(Path(""))

    # Default file names; set_ds_fname() replaces the train/dev ones.
    self.train_dataset_filename = "train_dataset.csv"
    self.dev_dataset_filename = "dev_dataset.csv"
    self.test_dataset_filename = "acl_dev_test.csv"

    # Prefixes of the numbered split files ("<prefix><iteration>.csv").
    self.train_ds = "train_ds_"
    self.test_ds = "test_ds_"

    # Label columns the experiments iterate over.
    self.aspects_no_com_approp = [
      'RECOMMENDATION',
      'REVIEWER_CONFIDENCE',
      'SOUNDNESS_CORRECTNESS',
      'IMPACT',
      'SUBSTANCE',
      'CLARITY',
      'ORIGINALITY',
    ]

    self.stats_path = root / "simple_stats_random"

  def set_dataset_intermediate_path(self, intermediate_path):
    """Select a dataset sub-folder and recompute ``dataset_path`` from it."""
    self.dataset_intermediate_path = intermediate_path
    data_root = self.gdrive_working_dir / "data"
    self.dataset_path = data_root / self.dataset_intermediate_path

  def set_ds_fname(self, iteration):
    """Point the train/dev file names at the split numbered ``iteration``.

    Note that the dev file name is built from the ``test_ds`` prefix.
    """
    suffix = str(iteration) + ".csv"
    self.train_dataset_filename = self.train_ds + suffix
    self.dev_dataset_filename = self.test_ds + suffix

In [0]:
# Single shared configuration object; the experiment loop below mutates it
# (dataset sub-folder and split file names) as it iterates.
env_var = Config()

In [0]:
# Dataset variants to evaluate. Each entry is a sub-folder that Config appends
# to its data directory; the empty entry selects the data directory itself.
dataset_intermediate_paths = [
  Path(sub_folder)
  for sub_folder in (
    "",
    "strength_weak_sections",
    "manual_labeled_strength_weak_sections",
    "manual_labeled_strength_weak_sections/first_section",
    "manual_labeled_strength_weak_sections/strength_weak_sections",
    "manual_labeled_strength_weak_sections/strength_weak_sections_len_limit",
    "manual_labeled_strength_weak_sections/summary_strength_weak_sections",
    "manual_labeled_strength_weak_sections/summary_strength_weak_sections/clarity_sentences",
    "manual_labeled_strength_weak_sections/weak_section",
    "manual_labeled_strength_weak_sections/weak_strength_sections_len_limit",
    "manual_labeled_strength_weak_sections/aug_stren_weak_sections_len_limit",
    "manual_labeled_strength_weak_sections/strength_section",
    "acl_abstracts",
  )
]

In [0]:
import operator

class Stat():
  """Plain record of one evaluation result.

  Stores the epoch, accuracy, loss, confusion matrix and iteration exactly as
  given. The string form reports the accuracy and loss labelled as "Test".
  """

  def __init__(self, epoch, dev_accuracy, dev_loss, dev_conf_matrix, iteration):
    self.epoch, self.dev_accuracy = epoch, dev_accuracy
    self.dev_loss, self.dev_conf_matrix = dev_loss, dev_conf_matrix
    self.iteration = iteration

  def __str__(self):
    """Return a one-line summary: epoch, accuracy and loss."""
    summary = f'{self.epoch} Test accuracy: {self.dev_accuracy} Test loss: {self.dev_loss}'
    return summary

class Model_stats(object):
  """Container pairing a dataset sub-path and an aspect label with its results.

  ``stats`` starts as an empty list owned by this instance; callers append
  result records to it.
  """

  def __init__(self, dataset_intermediate_path, aspect_label_name):
    self.dataset_intermediate_path, self.aspect_label_name = (
      dataset_intermediate_path,
      aspect_label_name,
    )
    self.stats = list()

In [0]:
import pickle

class Stats_Manager(object):
  """Accumulates result objects in memory and pickles them to one file.

  Attributes:
    stats_runs_path: destination of the pickle file (str or path-like).
    models_stats: list of collected result objects; callers extend it directly.
  """

  def __init__(self, stats_runs_path):
    self.stats_runs_path = stats_runs_path
    self.models_stats = []

  def update(self):
    """Write the current ``models_stats`` list to ``stats_runs_path``.

    The file is overwritten on every call. The parent folder is created when
    it is missing, so results gathered during a long run are not lost to a
    FileNotFoundError at save time.
    """
    target = Path(self.stats_runs_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "wb") as output_file:
      pickle.dump(self.models_stats, output_file)

In [0]:
from sklearn.metrics import accuracy_score

import warnings
# Blanket suppression: every warning category is silenced from here on.
warnings.filterwarnings("ignore")

# Not written to in this cell; kept in case cells outside this view read it.
models_stats = []

# Number of numbered splits read per dataset variant
# (files "train_ds_<i>.csv" / "test_ds_<i>.csv", see Config.set_ds_fname).
iterations = 10

# One seed for both the SVD and the random forest so a rerun reproduces the
# same accuracies (the forest was previously unseeded).
RANDOM_SEED = 0

stats_runs_path = env_var.stats_path / "rf_stats.p"
mv_runs_path = env_var.stats_path / "mv_stats.p"

# Separate result stores for the random forest and the majority-vote baseline.
stats_manager = Stats_Manager(stats_runs_path)
stats_manager_mv = Stats_Manager(mv_runs_path)

for ds_inter_path in dataset_intermediate_paths:

  env_var.set_dataset_intermediate_path(ds_inter_path)
  print(env_var.dataset_intermediate_path)

  # One Model_stats per aspect, index-aligned with env_var.aspects_no_com_approp.
  model_stats_for_inter_path = [Model_stats(ds_inter_path, aspect_label_name) for aspect_label_name in env_var.aspects_no_com_approp]
  mv_stats_for_inter_path = [Model_stats(ds_inter_path, aspect_label_name) for aspect_label_name in env_var.aspects_no_com_approp]

  for iteration in range(iterations):
    env_var.set_ds_fname(iteration)

    df_train = pd.read_csv(env_var.dataset_path / env_var.train_dataset_filename)
    df_test = pd.read_csv(env_var.dataset_path / env_var.dev_dataset_filename)

    # Despite the variable name, this is a TF-IDF vectorizer.
    count_vectorizer = TfidfVectorizer(
      analyzer="word", tokenizer=nltk.word_tokenize,
      preprocessor=None, stop_words='english', max_features=None)

    # Project the TF-IDF features down to 25 components.
    svd = TruncatedSVD(n_components=25, n_iter=25, random_state=RANDOM_SEED)

    # Vocabulary and SVD basis are fitted on the training split only and then
    # applied unchanged to the held-out split.
    X_train = count_vectorizer.fit_transform(df_train['comments'])
    X_test = count_vectorizer.transform(df_test['comments'])

    X_train = svd.fit_transform(X_train)
    X_test = svd.transform(X_test)

    # The text features are shared by all aspects; only the label column changes.
    for aspect_idx, aspect in enumerate(env_var.aspects_no_com_approp):

      y_train = df_train[aspect].values.ravel()
      y_test = df_test[aspect].values.ravel()

      rf_clf = RandomForestClassifier(n_estimators=400, max_depth=3, random_state=RANDOM_SEED)
      rf_clf.fit(X_train, y_train)
      rf_acc = rf_clf.score(X_test, y_test)

      # Epoch, loss and confusion matrix are not computed here; 0 / 0 / [] are placeholders.
      stat = Stat(0, rf_acc, 0, [], iteration)
      model_stats_for_inter_path[aspect_idx].stats.append(stat)

      # Majority-vote baseline: always predict the most frequent training label.
      items, counts = np.unique(y_train, return_counts=True)
      y_pred = [items[np.argmax(counts)]] * len(y_test)
      mv_acc = accuracy_score(y_test, y_pred)

      mv_stat = Stat(0, mv_acc, 0, [], iteration)
      mv_stats_for_inter_path[aspect_idx].stats.append(mv_stat)

  stats_manager.models_stats += model_stats_for_inter_path

  stats_manager_mv.models_stats += mv_stats_for_inter_path

  # Checkpoint after every dataset variant so a failure on a later variant
  # does not discard the results already computed.
  stats_manager.update()
  stats_manager_mv.update()

# Final write; also produces the (empty) result files when no variant was processed.
stats_manager.update()
stats_manager_mv.update()

.
strength_weak_sections
manual_labeled_strength_weak_sections
manual_labeled_strength_weak_sections/first_section
manual_labeled_strength_weak_sections/strength_weak_sections
manual_labeled_strength_weak_sections/strength_weak_sections_len_limit
manual_labeled_strength_weak_sections/summary_strength_weak_sections
manual_labeled_strength_weak_sections/summary_strength_weak_sections/clarity_sentences
manual_labeled_strength_weak_sections/weak_section
manual_labeled_strength_weak_sections/weak_strength_sections_len_limit
manual_labeled_strength_weak_sections/aug_stren_weak_sections_len_limit
manual_labeled_strength_weak_sections/strength_section
acl_abstracts
