In [None]:
!pip install wordfreq

In [None]:
import os
import pandas as pd
from sklearn import preprocessing
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

from collections.abc import Iterable
from sklearn.svm import SVR
from collections import Counter
from wordfreq import word_frequency, zipf_frequency
from string import punctuation
import re
from tqdm.notebook import tqdm
from google.colab import drive
import pickle

pd.options.mode.chained_assignment = None 

In [None]:
# Define splits
# The first two entries of each tuple are used further down as the [start:stop)
# row range of `data` that is held out for validation.
# NOTE(review): the last two entries are never read in this notebook —
# presumably the matching sentence-id range; confirm.
# SPLIT_ALL has start == stop, i.e. an empty validation slice.

SPLIT_1 = (7839,10490,400,533)
SPLIT_2 = (10490,13082,533,667)
SPLIT_3 = (13082,15737,667,800)
SPLIT_ALL = (15737,15737,800,800)

In [None]:
# NOTE(review): a later cell reassigns USE_SPLIT = SPLIT_3 before the final
# train/validation split is built, so this choice is overridden on Run All.
USE_SPLIT = SPLIT_2

In [None]:
# Helper Functions


def _shift_within_sentence(values, sentence_ids, offset):
  """Return a copy of `values` in which row i holds values[i + offset].

  Rows whose source row falls outside the array, or carries a different
  sentence id, are filled with a zero of the column's kind: 0 for non-object
  dtypes, np.zeros_like(values[0]).tolist() for object columns (the same fill
  the row-by-row implementation used).
  """
  n_rows = len(values)
  shifted = values.copy()
  if n_rows == 0:
    return shifted

  source = np.arange(n_rows) + offset
  in_range = (source >= 0) & (source < n_rows)
  # keep[i] is True when the neighbour exists and shares row i's sentence id
  keep = np.zeros(n_rows, dtype=bool)
  keep[in_range] = sentence_ids[in_range] == sentence_ids[source[in_range]]

  shifted[keep] = values[source[keep]]
  if values.dtype == object:
    # Build a fresh fill value per row so rows never share one mutable object
    for row in np.flatnonzero(~keep):
      shifted[row] = np.zeros_like(values[0]).tolist()
  else:
    shifted[~keep] = 0
  return shifted


def _add_seq_states_to(frame, n_forward, n_backward, vars):
  """Add NEXT_<k>_<var> / PREV_<k>_<var> columns to `frame` in place.

  For every name in `vars`, NEXT_k holds the value k rows later and PREV_k the
  value k rows earlier, as long as that row has the same 'sentence_id';
  otherwise the cell is zero-filled. Whole columns are assigned at once, which
  avoids the per-row chained assignment (`frame[col][row] = ...`) that is slow
  and does not write through under pandas copy-on-write.

  Returns the list of added column names, ordered per variable as
  NEXT_1..NEXT_n_forward followed by PREV_1..PREV_n_backward.
  """
  sentence_ids = frame['sentence_id'].to_numpy()
  print_vars = []
  for var in tqdm(vars):
    values = frame[var].to_numpy()
    for prefix, n_steps, direction in (('NEXT', n_forward, 1), ('PREV', n_backward, -1)):
      for step in range(1, n_steps + 1):
        built_var = f'{prefix}_{step}_{var}'
        print_vars.append(built_var)
        frame[built_var] = _shift_within_sentence(values, sentence_ids, direction * step)
  return print_vars


def add_seq_states(n_forward, n_backward, vars):
  """Add neighbouring-word feature columns to the global `data` frame.

  Returns the names of the added columns.
  """
  return _add_seq_states_to(data, n_forward, n_backward, vars)


def add_seq_states_test(n_forward, n_backward, vars):
  """Add neighbouring-word feature columns to the global `test_data` frame.

  Returns the names of the added columns.
  """
  return _add_seq_states_to(test_data, n_forward, n_backward, vars)


def flatten(items):
  """Lazily yield the leaves of arbitrarily nested lists, depth first.

  Only `list` instances are descended into; every other value (tuples,
  strings, arrays, ...) is yielded unchanged.
  """
  pending = [iter(items)]
  while pending:
    for element in pending[-1]:
      if isinstance(element, list):
        # Descend into the nested list; the outer iterator resumes afterwards
        pending.append(iter(element))
        break
      yield element
    else:
      # Innermost iterator is exhausted
      pending.pop()

def plot_sent(sent, y, y_hat):
  """Plot gold (solid) and predicted (dashed) measures for one sentence.

  sent:  sequence of x tick labels, one per row of `y`.
  y:     2-D array of gold values, columns ordered nFix, FFD, GPT, TRT, fixProp.
  y_hat: 2-D array of predictions with the same column order.

  The title shows the overall MAE between `y` and `y_hat`, rounded to five decimals.
  """
  target_colors = (('nFix', 'darkblue'), ('FFD', 'darkred'), ('GPT', 'darkorange'),
                   ('TRT', 'darkgreen'), ('fixProp', 'deeppink'))
  positions = list(range(len(sent)))

  fig, ax = plt.subplots()
  ax.set_xticks(positions)
  ax.set_xticklabels(sent, rotation=-90)

  for col, (target, color) in enumerate(target_colors):
    ax.plot(positions, y[:, col], color=color, label=target)
    ax.plot(positions, y_hat[:, col], '--', color=color)

  #ax.set_yscale('symlog')

  # ncol keeps its previous value: one slot per gold and per predicted series
  ax.legend(loc='upper right', ncol=2 * len(target_colors))
  fig.tight_layout()
  ax.set_ylim((0, 110))
  ax.set_title('MAE '+str(float("{0:.5f}".format(mean_absolute_error(y, y_hat)))))
  plt.show()

def val_to_plot(val_df, preds):
  """Split validation rows into sentences and score each sentence.

  val_df: frame with 'sentence_id', 'word' and the five target columns; a new
          sentence starts wherever 'sentence_id' increases from one row to the next.
  preds:  dict mapping each target name to a sequence of predictions aligned
          row-for-row with `val_df`.

  Returns (val_results, val_store): both are keyed by the sentence's position.
  val_results holds the sentence MAE over all targets; val_store holds
  [words, gold array, predicted array].

  The boundaries depend only on `val_df` (not on the global `data` frame), so
  the final sentence keeps its last word, and a one-word final sentence is
  reported on its own.
  """
  targets = ['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']
  sent_ids = val_df['sentence_id'].tolist()

  sent_boundaries = []
  sent_start = 0
  for idx in range(1, len(sent_ids)):
    if sent_ids[idx - 1] < sent_ids[idx]:
      sent_boundaries.append((sent_start, idx))
      sent_start = idx
  if sent_ids:
    # Close the last sentence at the end of the frame
    sent_boundaries.append((sent_start, len(sent_ids)))

  val_results = {}
  val_store = {}

  for idx, (start, stop) in enumerate(sent_boundaries):
    rows = val_df.iloc[start:stop, :]
    x = rows['word'].tolist()
    y = rows[targets].values
    y_hat = np.vstack([preds[target][start:stop] for target in targets]).transpose()
    assert y_hat.shape == y.shape
    val_results[idx] = float(mean_absolute_error(y, y_hat))
    val_store[idx] = [x, y, y_hat]

  return(val_results, val_store)

In [None]:
# Mount Google Drive inside the Colab runtime
drive.mount('/content/drive')

In [None]:
# Directory layout on the shared drive.
# NOTE(review): shared_dir is a relative path while the drive is mounted at the
# absolute '/content/drive' — presumably the working directory is /content; confirm.
shared_dir = 'drive/Shareddrives/CMCL Shared Task - CogNLP@Sheffield/'
local_dir = shared_dir+'Final models/local/'
features_dir = shared_dir+'Feature Enhancement/'

In [None]:
#Load the data with some preprocessed additions

# Training frame, unpickled from the shared drive
data = pd.read_pickle(shared_dir+'Data/CMCL_Proc.pkl')
#data = data.drop('BERT_embed', axis=1)

In [None]:
# Test frame, read from CSV
test_data = pd.read_csv(local_dir+'test_data_with_pos.csv')

In [None]:
test_data

In [None]:
## Example adding extra features from drive

# Conc.M sheet: every missing cell (in any column) is filled with 3
Conc_M = pd.read_excel(features_dir+'Conc.M_draft.xlsx').fillna(3)
Conc_SD = pd.read_excel(features_dir+'Conc.SD_draft.xlsx')
# Missing Conc.SD values are replaced with the mean of the observed ones
Conc_SD['Conc.SD'] = Conc_SD['Conc.SD'].fillna(Conc_SD['Conc.SD'].mean())
# Percent_known sheet: every missing cell (in any column) is filled with 1
Percent_Known = pd.read_excel(features_dir+'Percent_known.xlsx').fillna(1)

In [None]:
# Same loading and imputation for the test-set sheets; the Conc.SD mean is taken
# from the test sheet itself
Conc_M_test = pd.read_excel(features_dir+'Conc.M_test_data.xlsx').fillna(3)
Conc_SD_test = pd.read_excel(features_dir+'Conc.SD_test_data.xlsx')
Conc_SD_test['Conc.SD'] = Conc_SD_test['Conc.SD'].fillna(Conc_SD_test['Conc.SD'].mean())
Percent_Known_test = pd.read_excel(features_dir+'Percent_known_test_data.xlsx').fillna(1)

In [None]:
data.head()

In [None]:
# Series assignment aligns on index labels — assumes each sheet has one row per
# row of `data`, in the same order (TODO confirm)
data['Conc_M'] = Conc_M['Conc.M']
data['Conc_SD'] = Conc_SD['Conc.SD']
data['Percent_Known'] = Percent_Known['Percent_known']

In [None]:
# Same index-aligned assignment for the test frame (same row-order assumption)
test_data['Conc_M'] = Conc_M_test['Conc.M']
test_data['Conc_SD'] = Conc_SD_test['Conc.SD']
test_data['Percent_Known'] = Percent_Known_test['Percent_known']

In [None]:
# Spreadsheets providing the is_mwe / mwe_cat / comp / weights / w2v_embedding
# columns that are copied into the train and test frames below
MWE_Exists = pd.read_excel(local_dir+'mwe_features_with_comp_weights_embeddings_coarse_cats.xlsx')
MWE_Exists_test = pd.read_excel(local_dir+'test_mwe_features_with_comp_weights_embeddings_coarse_cats.xlsx')


In [None]:
# Copied by index alignment — assumes the sheet rows line up with `data` (TODO confirm).
# Note the renames: 'comp' -> 'comp_score', 'weights' -> 'comp_weights'.
data['is_mwe'] = MWE_Exists['is_mwe']
data['mwe_cat'] = MWE_Exists['mwe_cat']
data['comp_score'] = MWE_Exists['comp']
data['comp_weights'] = MWE_Exists['weights']


In [None]:
# Same columns for the test frame, from the test sheet
test_data['is_mwe'] = MWE_Exists_test['is_mwe']
test_data['mwe_cat'] = MWE_Exists_test['mwe_cat']
test_data['comp_score'] = MWE_Exists_test['comp']
test_data['comp_weights'] = MWE_Exists_test['weights']


In [None]:
embed_dict = {} 
for idx, row in MWE_Exists.iterrows():
  embed_string = row['w2v_embedding']
  embed = [float(val) for val in embed_string.strip('[]\n').split()]
  embed_dict[idx] = embed

In [None]:
test_embed_dict = {} 
for idx, row in MWE_Exists_test.iterrows():
  embed_string = row['w2v_embedding']
  embed = [float(val) for val in embed_string.strip('[]\n').split()]
  test_embed_dict[idx] = embed

In [None]:
data['w2v_embedding'] = data.index.map(embed_dict)
test_data['w2v_embedding'] = test_data.index.map(test_embed_dict)

In [None]:
test_data['word'] = test_data['word'].str.replace('<EOS>', '')

In [None]:
data.append(test_data)

In [None]:
# Make some variables categorical

vars_to_cat = ['pos', 'word'] 

for var in vars_to_cat:
  le = preprocessing.LabelEncoder()
  le.fit(data.append(test_data)[var])
  data[f'CAT_{var}'] = le.transform(data[var])
  test_data[f'CAT_{var}'] = le.transform(test_data[var])

In [None]:
test_data.head()

In [None]:
# Add EOS, Word Length, Word Frequency

is_EOS = []
is_SOS = []

for row in data.iterrows():
  idx, row_data = row[0], row[1]

  if idx != len(data)-1 and row_data['word_id'] < data.iloc[idx+1]['word_id']:
    is_EOS.append(0)
  else:
    is_EOS.append(1)
data['Is_EOS']=is_EOS

for row in data.iterrows():
  idx, row_data = row[0], row[1]

  if row_data['word_id'] == 0:
    is_SOS.append(1)
  else:
    is_SOS.append(0)

data['Is_SOS']=is_SOS

data['word_len'] = [len(word) for word in data['word']]

data['zipf_frequency'] = [zipf_frequency(word.rstrip(punctuation), 'en', wordlist='large') for word in data['word']]
data['word_frequency'] =  [word_frequency(word.rstrip(punctuation), 'en') for word in data['word']]

In [None]:
# Add EOS, Word Length, Word Frequency

is_EOS = []
is_SOS = []

for row in test_data.iterrows():
  idx, row_data = row[0], row[1]

  if idx != len(test_data)-1 and row_data['word_id'] < test_data.iloc[idx+1]['word_id']:
    is_EOS.append(0)
  else:
    is_EOS.append(1)
test_data['Is_EOS']=is_EOS

for row in test_data.iterrows():
  idx, row_data = row[0], row[1]

  if row_data['word_id'] == 0:
    is_SOS.append(1)
  else:
    is_SOS.append(0)

test_data['Is_SOS']=is_SOS

test_data['word_len'] = [len(word) for word in test_data['word']]

test_data['zipf_frequency'] = [zipf_frequency(word.rstrip(punctuation), 'en', wordlist='large') for word in test_data['word']]
test_data['word_frequency'] =  [word_frequency(word.rstrip(punctuation), 'en') for word in test_data['word']]

In [None]:
test_data[56:80]

In [None]:
saccade_len_dict = {1 : 0, 2: 0, 3: 0, 4: 1, 5: 1, 6:1, 7:1, 8:2, 9:2, 10:2}
for i in range(11,27): 
  saccade_len_dict[i] = 3

saccade_len_dict_binary = {i : (1 if i > 3 else 0) for i in range(1,data['word_len'].max()+1)}
data['saccade_cat_binary'] = data['word_len'].map(saccade_len_dict_binary)
data['saccade_cat'] = data['word_len'].map(saccade_len_dict)


In [None]:
test_data['saccade_cat_binary'] = test_data['word_len'].map(saccade_len_dict_binary)
test_data['saccade_cat'] = test_data['word_len'].map(saccade_len_dict)

In [None]:
# Pattern for ordinary word tokens: optional leading quotes/brackets, letters with
# at most one internal separator character, optional trailing punctuation up to the
# end of the string; further alternatives accept '&', an en dash, '<digits>-year-old'
# and '<letters>-in-law' forms.
# NOTE(review): inside [\'-.] the hyphen forms a character range from ' to .
# (so it also admits ( ) * + ,) — confirm this is intended.
typical_words = re.compile(r'["\'\(\)\[\]]*[A-Za-z]*[\'-.]?[A-Za-z]*[\.\-\'\"?!\(\)\[\]:;,]*$|&|–|[0-9]+\-year-old|[A-Za-z]*-in-law[\.\-\'\"?!\(\)\[\]:;,%]*')
# Pattern for numeric tokens: optional leading punctuation, digits, optional
# punctuation and more digits, optional ordinal suffix, then optional punctuation.
# NOTE(review): [\'s]? matches a single ' or s character, not the two-character "'s".
typical_nums = re.compile(r'[\.\-\'\"?!\(\)\[\]:;,]*[0-9]+[\.\-\'\"?!\(\)\[\]:;,%]*[0-9]*(th|st|rd|nd)?[\'s]?[\.\-\'\"?!\(\)\[\]:;,%]*')

In [None]:
strange_dict = {} 
for i, row in data.iterrows():
  chars_match = re.match(typical_words, row['word'].strip())
  nums_match = re.match(typical_nums, row['word'].strip())
  if chars_match is not None: 
    if chars_match.group() == row['word']:
      strange_dict[i] = 0 
    else:
      strange_dict[i] = 1
  elif nums_match is not None: 
    if nums_match.group() == row['word']:
      strange_dict[i] = 0 
    else:
      strange_dict[i] = 1
  else:
    strange_dict[i] = 1

In [None]:
data['is_strange'] = data.index.map(strange_dict)

In [None]:
strange_dict = {} 
for i, row in test_data.iterrows():
  chars_match = re.match(typical_words, row['word'].strip())
  nums_match = re.match(typical_nums, row['word'].strip())
  if chars_match is not None: 
    if chars_match.group() == row['word']:
      strange_dict[i] = 0 
    else:
      strange_dict[i] = 1
  elif nums_match is not None: 
    if nums_match.group() == row['word']:
      strange_dict[i] = 0 
    else:
      strange_dict[i] = 1
  else:
    strange_dict[i] = 1

In [None]:
test_data['is_strange'] = test_data.index.map(strange_dict)

In [None]:
for i, row in test_data.iterrows():
  if np.isnan(row['is_strange']):
    print(row['word'])

In [None]:
geco_FFD = pd.read_csv(local_dir+'zuco_featureset-geco_FFD_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
geco_TRT = pd.read_csv(local_dir+'zuco_featureset-geco_TRT_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
geco_GPT = pd.read_csv(local_dir+'zuco_featureset-geco_GPT_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
geco_nFix = pd.read_csv(local_dir+'zuco_featureset-geco_nFix_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
geco_fixProp = pd.read_csv(local_dir+'zuco_featureset-geco_fixProp_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)

In [None]:
test_geco_FFD = pd.read_csv(local_dir+'zuco_testset-geco_FFD_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
test_geco_TRT = pd.read_csv(local_dir+'zuco_testset-geco_TRT_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
test_geco_GPT = pd.read_csv(local_dir+'zuco_testset-geco_GPT_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
test_geco_nFix = pd.read_csv(local_dir+'zuco_testset-geco_nFix_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)
test_geco_fixProp = pd.read_csv(local_dir+'zuco_testset-geco_fixProp_word_level.csv').drop(['sentence_id','word_id', 'word'], axis=1)

In [None]:
data['geco_FFD_mean'] = geco_FFD['mean']
data['geco_FFD_median'] = geco_FFD['median']
data['geco_FFD_std'] = geco_FFD['std']

data['geco_TRT_mean'] = geco_TRT['mean']
data['geco_TRT_median'] = geco_TRT['median']
data['geco_TRT_std'] = geco_TRT['std']

data['geco_GPT_mean'] = geco_GPT['mean']
data['geco_GPT_median'] = geco_GPT['median']
data['geco_GPT_std'] = geco_GPT['std']

data['geco_nFix_mean'] = geco_nFix['mean']
data['geco_nFix_median'] = geco_nFix['median']
data['geco_nFix_std'] = geco_nFix['std']

data['geco_fixProp_mean'] = geco_fixProp['mean']
data['geco_fixProp_median'] = geco_fixProp['median']
data['geco_fixProp_std'] = geco_fixProp['std']


In [None]:
test_data['geco_FFD_mean'] = test_geco_FFD['mean']
test_data['geco_FFD_median'] = test_geco_FFD['median']
test_data['geco_FFD_std'] = test_geco_FFD['std']

test_data['geco_TRT_mean'] = test_geco_TRT['mean']
test_data['geco_TRT_median'] = test_geco_TRT['median']
test_data['geco_TRT_std'] = test_geco_TRT['std']

test_data['geco_GPT_mean'] = test_geco_GPT['mean']
test_data['geco_GPT_median'] = test_geco_GPT['median']
test_data['geco_GPT_std'] = test_geco_GPT['std']

test_data['geco_nFix_mean'] = test_geco_nFix['mean']
test_data['geco_nFix_median'] = test_geco_nFix['median']
test_data['geco_nFix_std'] = test_geco_nFix['std']

test_data['geco_fixProp_mean'] = test_geco_fixProp['mean']
test_data['geco_fixProp_median'] = test_geco_fixProp['median']
test_data['geco_fixProp_std'] = test_geco_fixProp['std']



In [None]:
data.columns

In [None]:
# Add features from previous/subsequent words

print_vars = add_seq_states(n_forward=2, n_backward=2, vars=['Conc_M', 'Conc_SD', 'Percent_Known',
       'is_mwe', 'mwe_cat', 'comp_score', 'comp_weights', 
       'CAT_pos', 'CAT_word', 'Is_EOS', 'Is_SOS', 'word_len', 'zipf_frequency',
       'word_frequency', 'saccade_cat_binary', 'saccade_cat', 'is_strange',
       'geco_FFD_mean', 'geco_FFD_median', 'geco_FFD_std', 'geco_TRT_mean',
       'geco_TRT_median', 'geco_TRT_std', 'geco_GPT_mean', 'geco_GPT_median',
       'geco_GPT_std', 'geco_nFix_mean', 'geco_nFix_median', 'geco_nFix_std',
       'geco_fixProp_mean', 'geco_fixProp_median', 'geco_fixProp_std'])
print('Added:')
print('\'' + '\', \''.join(print_vars) + '\'')

In [None]:
print_vars = add_seq_states_test(n_forward=2, n_backward=2, vars=['Conc_M', 'Conc_SD', 'Percent_Known',
       'is_mwe', 'mwe_cat', 'comp_score', 'comp_weights', 
       'CAT_pos', 'CAT_word', 'Is_EOS', 'Is_SOS', 'word_len', 'zipf_frequency',
       'word_frequency', 'saccade_cat_binary', 'saccade_cat', 'is_strange',
       'geco_FFD_mean', 'geco_FFD_median', 'geco_FFD_std', 'geco_TRT_mean',
       'geco_TRT_median', 'geco_TRT_std', 'geco_GPT_mean', 'geco_GPT_median',
       'geco_GPT_std', 'geco_nFix_mean', 'geco_nFix_median', 'geco_nFix_std',
       'geco_fixProp_mean', 'geco_fixProp_median', 'geco_fixProp_std'])
print('Added:')
print('\'' + '\', \''.join(print_vars) + '\'')

In [None]:
# Feature columns fed to the models: 85 scalar columns followed by 'w2v_embedding',
# whose list values are flattened into additional columns when X is built below.
use_features = ['word_len', 'saccade_cat_binary', 'saccade_cat','geco_FFD_mean',
                'Is_EOS', 'geco_TRT_mean', 'Is_SOS', 'geco_nFix_median', 
                'word_frequency','PREV_2_Is_SOS', 'PREV_1_Is_SOS', 
                'PREV_1_geco_fixProp_median', 'PREV_1_Percent_Known', 
                'NEXT_1_word_len', 'PREV_1_word_len', 'PREV_1_comp_weights', 
                'PREV_2_comp_score', 'geco_GPT_median', 'NEXT_1_Is_EOS',
                'is_mwe', 'NEXT_1_CAT_pos', 'PREV_1_word_frequency', 
                'PREV_1_CAT_word','PREV_1_saccade_cat','Conc_M', 
                'CAT_word', 'PREV_1_geco_nFix_mean', 'NEXT_1_word_frequency',
                'CAT_pos', 'PREV_1_geco_FFD_std','comp_weights',
                'PREV_2_CAT_pos','geco_fixProp_std', 'geco_nFix_std',
                'PREV_2_geco_GPT_std','NEXT_2_saccade_cat_binary', 
                'NEXT_1_Conc_M','Percent_Known','PREV_1_geco_GPT_std', 
                'NEXT_2_geco_nFix_median', 'NEXT_2_word_frequency', 
                'NEXT_2_zipf_frequency', 'PREV_1_CAT_pos', 
                'PREV_2_mwe_cat', 'geco_fixProp_mean', 'PREV_1_is_strange', 
                'NEXT_1_geco_GPT_std', 'mwe_cat', 'PREV_1_is_mwe',
                'geco_FFD_std','NEXT_1_mwe_cat', 'NEXT_1_Percent_Known', 
                'PREV_1_geco_fixProp_std', 'is_strange',
                'PREV_1_geco_nFix_std', 'PREV_1_Conc_M', 
                'NEXT_1_geco_TRT_std', 'geco_GPT_std','NEXT_1_geco_fixProp_std',
                'NEXT_2_geco_fixProp_std', 'PREV_1_geco_nFix_median',
                'NEXT_2_is_strange', 'NEXT_2_Conc_M', 'NEXT_2_geco_GPT_std', 
                'NEXT_1_comp_weights','PREV_2_geco_GPT_mean','PREV_2_is_strange', 
                'PREV_1_geco_TRT_std', 'PREV_1_Conc_SD', 'comp_score', 
                'NEXT_2_comp_weights', 'Conc_SD', 'NEXT_2_comp_score',
                'NEXT_1_geco_nFix_median', 'zipf_frequency', 
                'PREV_2_CAT_word', 'NEXT_1_CAT_word','NEXT_2_Conc_SD', 
                'PREV_2_word_frequency','NEXT_1_Conc_SD','NEXT_1_geco_FFD_std',
                'NEXT_1_is_strange', 'NEXT_1_saccade_cat_binary',
                'PREV_2_geco_fixProp_std', 'NEXT_2_Is_EOS', 'w2v_embedding']

# NOTE(review): with regex=True the pattern '.' is an unescaped regular expression
# that matches any character, not only a literal dot, and the replacement runs over
# every column of both frames (including 'word' and 'pos') — confirm this is intended.
test_data = test_data.replace('.',0, regex=True)
data = data.replace('.', 0, regex=True)
test_data.NEXT_2_comp_weights = test_data.NEXT_2_comp_weights.astype(float)


# One flat row of numbers per word: list-valued cells are expanded in place by flatten()
X = np.vstack([list(flatten(x)) for x in data[use_features].values]).tolist()      

# NOTE(review): X_test is only built by this commented-out line, yet a later cell reads X_test_scale
#X_test =  np.vstack([list(flatten(x)) for x in test_data[use_features].values]).tolist()      

In [None]:
data.tail()

In [None]:
# Build the task for test data

X_train, y_train = X[:USE_SPLIT[0]]+X[USE_SPLIT[1]:], data[['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']].drop(data.index[USE_SPLIT[0]:USE_SPLIT[1]])
X_val, y_val = X[USE_SPLIT[0]:USE_SPLIT[1]], data[['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']][USE_SPLIT[0]:USE_SPLIT[1]]
val_df = data[USE_SPLIT[0]:USE_SPLIT[1]]
#X_train, y_train = X, data[['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']]

In [None]:
# Build the task for paper experiments

# Define splits

SPLIT_1 = (7839,10490,400,533)
SPLIT_2 = (10490,13082,533,667)
SPLIT_3 = (13082,15737,667,800)
SPLIT_ALL = (15737,15737,800,800)

USE_SPLIT = SPLIT_3

#model_dir = shared_dir+'Final models/no_BERT_split_3/'
#model_dir = shared_dir+'Final models/split_2/'
#model_dir = shared_dir+'Final models/split_3/'


X_train, y_train = X[:USE_SPLIT[0]]+X[USE_SPLIT[1]:], data[['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']].drop(data.index[USE_SPLIT[0]:USE_SPLIT[1]])
X_val, y_val = X[USE_SPLIT[0]:USE_SPLIT[1]], data[['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']][USE_SPLIT[0]:USE_SPLIT[1]]
val_df = data[USE_SPLIT[0]:USE_SPLIT[1]]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned from the training rows only,
# then apply the same transform to the training and validation rows
scaler = StandardScaler().fit(X_train)

X_train_scale, X_val_scale = (scaler.transform(rows) for rows in (X_train, X_val))
#X_test_scale = scaler.transform(X_test)

#X_train_scale = X_train
#X_val_scale = X_val

In [None]:
f = np.array(X_test_scale)

f[np.argwhere(np.isnan(f))] = 0
X_test_scale = f.tolist()
print(np.argwhere(np.isinf(f)))

In [None]:
f = np.array(X_val_scale)

f[np.argwhere(np.isnan(f))] = 0
X_val_scale = f.tolist()
print(np.argwhere(np.isinf(f)))

In [None]:
from sklearn.linear_model import ElasticNetCV

# Fit a cross-validated ElasticNet for the nFix target only and print the selected
# hyper-parameters. No predictions are made here: `preds` stays empty and
# `total_mae` stays 0; the fitted `regr` is what the following cells inspect.
preds = {}
total_mae = 0

for target in tqdm(['nFix']):
    print('fitting', target)
    target_y_train = y_train[target]
    target_y_val = y_val[target] 
    # l1_ratio candidates are 0.999999 and 1, i.e. an (almost) pure L1 penalty
    regr = ElasticNetCV(max_iter=200, l1_ratio=[0.999999,1])
    regr.fit(X_train_scale, target_y_train)
    print('alpha', regr.alpha_)
    print('l1_ratio', regr.l1_ratio_)
    #y_hat = regr.predict(X_val_scale)
    #preds[target] = y_hat
    #print(target, 'MAE', mean_absolute_error(y_val[target], preds[target]))  
    #filename = model_dir+(target)+'.pkl'
    #pickle.dump(regr, open(filename,'wb'))

#total_mae /= 5
#print(f'Total MAE {total_mae}')

In [None]:
coefs_abs = np.abs(regr.coef_)
coefs_normed = 100*coefs_abs/np.sum(coefs_abs)

In [None]:
sum(coefs_normed[85:])

In [None]:
for name, val in zip(feature_locs[:85], coefs_normed[:85]):
  print(name.replace('_','-'),val, '\\\\') 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance


# Fit one random forest per target, report its validation MAE and the
# permutation importance of each feature on the validation rows.
preds = {}
total_mae = 0

for target in tqdm(['nFix','FFD','GPT', 'TRT', 'fixProp']):
    print('fitting', target)
    target_y_train = y_train[target]
    target_y_val = y_val[target] 
    regr = RandomForestRegressor(max_depth=7, n_estimators=100, max_features=None)
    regr.fit(X_train_scale, target_y_train)
    y_hat = regr.predict(X_val_scale)
    preds[target] = y_hat
    mae = mean_absolute_error(target_y_val, y_hat)
    print(target, 'MAE', mae)
    # Accumulate so the average printed after the loop is not always 0
    total_mae += mae

    r = permutation_importance(regr, X_val_scale, y_val[target], n_repeats=10, random_state=0)
    print('permuatations calculated')
    # Importances as a percentage of their sum
    normed_importances_mean = 100*r.importances_mean/np.sum(r.importances_mean)

    # NOTE(review): assumes the embedding occupies columns 85..384 — confirm against feature_locs
    print(f'w2v-embedding-all', np.sum(normed_importances_mean[85:385]))
    
    # Remaining features with positive importance, most important first
    for i in r.importances_mean.argsort()[::-1]:
      if r.importances_mean[i] > 0:
         if i < 85 or i >= 385:
           print(feature_locs[i].replace('_','-'), f"{normed_importances_mean[i]:.5f}")

    print('____________________________\n\n\n')

total_mae /= 5
print(f'Total MAE {total_mae}')

#WITH SPLIT = 3

In [None]:
# Clamp every prediction into [0, 100] in place, then re-score each target
total_mae = 0
for target in ['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']:
      target_y_val = y_val[target] 

      np.clip(preds[target], 0, 100, out=preds[target])
      clipped_mae = mean_absolute_error(y_val[target], preds[target])
      print(target, 'MAE', clipped_mae)  
      total_mae += clipped_mae

total_mae /= 5
print(f'Total MAE {total_mae}')

In [None]:
regr.get_params()

In [None]:
#obtain list of which feature is where for discovering elasticnet zeros 
feature_locs = [] 
for feat in use_features: 
    if type(data[feat][0]) is list:
        print('start', len(feature_locs))
        for i, x in enumerate(data[feat][0]):
            feature_locs.append(str(feat)+'_'+str(i))
        print('end', len(feature_locs))
    else:
        feature_locs.append(str(feat))

In [None]:
feature_locs

In [None]:
filename = 'no_BERT_split1.pkl'
pickle.dump(regr, open(filename,'wb'))

In [None]:
model = pickle.load(open(model_dir+'TRT.pkl', 'rb'))
temp = model.predict(X_val_scale)

In [None]:
importance_dict = {feat:val for feat, val in zip(feature_locs,model.feature_importances_)}

In [None]:
from collections import Counter
features_ordered = Counter(importance_dict).most_common()
for i, (k,v) in enumerate(features_ordered):
  print(k, v)
  if i >20:
    break


In [None]:
mean_absolute_error(y_val['fixProp'], temp)

In [None]:
clean_test_data = pd.read_csv('test_data_with_pos.csv')


submit_data=clean_test_data[['sentence_id','word_id', 'word']]
for pred_target in ['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']:
    submit_data[pred_target] = safe_preds[pred_target]
    
    
submit_data.to_csv('answer.txt')

In [None]:
answers = pd.read_csv('answer.txt', index_col=0)
answers.to_csv('answer2.txt', index=False)

In [None]:
from copy import deepcopy

In [None]:
safe_preds = deepcopy(preds)

In [None]:
for pred_target in ['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']:
    safe_preds[pred_target][safe_preds[pred_target]>100] = 100
    safe_preds[pred_target][safe_preds[pred_target]<0] = 0

In [None]:
from sklearn.ensemble import RandomForestRegressor
total_mae = 0
for target in tqdm(['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']):
    print('fitting', target)
    target_y_train = y_train[target]
    target_y_val = y_val[target] 
    regr = RandomForestRegressor(max_depth=7, random_state=0, verbose=2)
    regr.fit(X_train_scale, target_y_train)
    y_hat = regr.predict(X_val_scale)

    preds[target] = y_hat
    print(target, 'MAE', mean_absolute_error(y_val[target], preds[target]))  
    total_mae += mean_absolute_error(y_val[target], preds[target])
#total_mae /= 5
#print(f'Total MAE {total_mae}')

In [None]:
regr.predict(X_val_scale)

In [None]:
y_hat = regr.predict(X_val_scale)

mean_absolute_error(y_hat, y_val['fixProp'])

In [None]:
model = ElasticNetCV

preds = {}
total_mae = 0
for target in tqdm(['nFix','FFD','GPT', 'TRT', 'fixProp']):
    print('fitting', target)
    target_y_train = y_train[target]
    target_y_val = y_val[target] 

    reg = model(max_iter=2000, l1_ratio=[.1, .5, .6, .7, .9, .95, .99, 1]).fit(X_train_scale, target_y_train)
    print('alpha', reg.alpha_)
    print('l1_ratio',reg.l1_ratio_)
    y_hat = reg.predict(X_val_scale)

    preds[target] = y_hat
    print(target, 'MAE', mean_absolute_error(y_val[target], preds[target]))  
    total_mae += mean_absolute_error(y_val[target], preds[target])
total_mae /= 5
print(f'Total MAE {total_mae}')

In [None]:
print('percentage of features used by ElasticNet:', 100 * np.count_nonzero(reg.coef_)/len(reg.coef_))

In [None]:
w2v_dims_prev_1 = [] 
w2v_dims_prev_2 = []
w2v_dims_current = []
w2v_dims_next_1 = []
w2v_dims_next_2 = [] 
w2v_dims = 0 
from collections import Counter 

coef_dict = {} 
total_coef = np.sum(np.abs(reg.coef_))
for loc in np.argwhere(reg.coef_):
    current = feature_locs[int(loc)]
#  if current[:10] == 'NEXT_2_w2v':
  #      w2v_dims_next_2.append(current.split('_')[-1])
 #   elif current[:10] == 'NEXT_1_w2v':
  #      w2v_dims_next_1.append(current.split('_')[-1])
    if current[:3] == 'w2v':
        w2v_dims+=1
  #  elif current[:10] == 'PREV_1_w2v':
  #      w2v_dims_prev_1.append(current.split('_')[-1])
  #  elif current[:10] == 'PREV_2_w2v':
   #     w2v_dims_prev_2.append(current.split('_')[-1])
    else: 
       # print(feature_locs[int(loc)], reg.coef_[int(loc)]*100/total_coef)
        coef_dict[feature_locs[int(loc)]] = np.abs(reg.coef_[int(loc)]*100/total_coef)

print('w2v dims used:', w2v_dims)
print(Counter(coef_dict).most_common())

In [None]:
new_feats = [feat for (feat, weight) in Counter(coef_dict).most_common() if weight > 0.1]

In [None]:
' '.join(new_feats)

In [None]:
# Clamp every prediction into [0, 100] in place and report the MAE after clamping
total_mae = 0
for target in ['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']:
      target_y_train = y_train[target]
      target_y_val = y_val[target] 

      np.clip(preds[target], 0, 100, out=preds[target])
      clipped_mae = mean_absolute_error(y_val[target], preds[target])
      print(target, 'MAE with cutoffs', clipped_mae)  
      total_mae += clipped_mae

total_mae /= 5
print(f'Total MAE with cutoffs {total_mae}')

In [None]:
from sklearn.linear_model import BayesianRidge
def train_from_feature_list(feature_columns):
    """Fit one BayesianRidge per target on `feature_columns` and report validation MAE.

    feature_columns: names of columns of the global `data` frame; list-valued
        cells are flattened into several feature columns.

    Rows [USE_SPLIT[0], USE_SPLIT[1]) of `data` form the validation set and all
    other rows the training set. Features are standardised with statistics from
    the training rows, and predictions are clamped to [0, 100] before scoring.
    Relies on StandardScaler having been imported by the scaling cell.

    Prints the per-target and average MAE and returns (preds, total_mae), where
    preds maps each target name to its clamped validation predictions.
    """
    print(feature_columns)
    targets = ['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']
    X = np.vstack([list(flatten(x)) for x in data[feature_columns].values]).tolist()

    # Build the task
    val_start, val_stop = USE_SPLIT[0], USE_SPLIT[1]
    X_train = X[:val_start] + X[val_stop:]
    X_val = X[val_start:val_stop]
    y_train = data[targets].drop(data.index[val_start:val_stop])
    y_val = data[targets][val_start:val_stop]

    scaler = StandardScaler().fit(X_train)
    X_train_scale = scaler.transform(X_train)
    X_val_scale = scaler.transform(X_val)

    preds = {}
    total_mae = 0
    for target in targets:
        print('fitting', target)
        reg = BayesianRidge().fit(X_train_scale, y_train[target])

        y_hat = np.clip(reg.predict(X_val_scale), 0, 100)
        preds[target] = y_hat

        mae = mean_absolute_error(y_val[target], y_hat)
        print(target, 'MAE', mae)
        total_mae += mae

    total_mae /= len(targets)
    print(f'Total MAE {total_mae}')
    return preds, total_mae



In [None]:
# Run the task with the model specified
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNetCV


model = BayesianRidge

# One default-configured model per target; unclipped validation predictions go into `preds`
preds = {}
total_mae = 0
for target in ['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']:
  print('fitting', target)
  target_y_train = y_train[target]
  target_y_val = y_val[target] 

  reg = model().fit(X_train_scale, target_y_train)  
  y_hat = reg.predict(X_val_scale)

  preds[target] = y_hat
  mae = mean_absolute_error(target_y_val, y_hat)
  print(target, 'MAE', mae)  
  total_mae += mae

total_mae /= 5
print(f'Total MAE {total_mae}')

In [None]:
# Call the method: a bare `data.head` only displays the bound-method repr
data.head()

In [None]:
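# Output pasted from an earlier run (kept for reference):
# fitting nFix
# nFix MAE 3.897349478994872
# fitting FFD
# FFD MAE 0.6865561010487296
# fitting GPT
# GPT MAE 2.2355537777376155
# fitting TRT
# TRT MAE 1.5097279541778272
# fitting fixProp
# fixProp MAE 11.277121487697162
# Total MAE 3.921261759931241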
Total MAE 3.921261759931241

In [None]:
# Run the task with the model specified
from sklearn.neural_network import MLPRegressor


model = MLPRegressor


# One MLP per target with a single 500-unit hidden layer and a very small constant learning rate
preds = {}
total_mae = 0
for target in ['nFix',	'FFD',	'GPT', 'TRT', 'fixProp']:
  print('fitting', target)
  target_y_train = y_train[target]
  target_y_val = y_val[target] 

  reg = model(
      hidden_layer_sizes=500,
      alpha=1.00E-12,
      learning_rate='constant',
      batch_size=400,
      learning_rate_init=1.00E-06,
      max_iter=10000,
  )
  reg.fit(X_train_scale, target_y_train)
  y_hat = reg.predict(X_val_scale)

  preds[target] = y_hat
  mae = mean_absolute_error(target_y_val, y_hat)
  print(target, 'MAE', mae)  
  total_mae += mae

total_mae /= 5
print(f'Total MAE {total_mae}')


In [None]:
# Fit a single multi-output elastic net on all targets jointly, then score it
# the same way as the per-target models so `preds` / `total_mae` are populated.
from sklearn.linear_model import MultiTaskElasticNetCV

model = MultiTaskElasticNetCV

# Column order here defines the column order of the prediction matrix below.
targets = ['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']

preds = {}
total_mae = 0
print('fitting', targets)
target_y_train = y_train[targets].values
target_y_val = y_val[targets]


reg = model(max_iter=2000, l1_ratio=0.5).fit(X_train_scale, target_y_train)

# The fitted model predicts one column per target; split the matrix back into
# the per-target dict that the later cells (pickle dump, histograms) read.
# Without this step `preds` would stay empty after being reset above.
y_hat = reg.predict(X_val_scale)
for col, target in enumerate(targets):
  preds[target] = y_hat[:, col]
  target_mae = mean_absolute_error(target_y_val[target], preds[target])
  print(target, 'MAE', target_mae)
  total_mae += target_mae

total_mae /= len(targets)
print(f'Total MAE {total_mae}')

In [None]:
# Persist this split's validation frame and predictions so the splits can be
# merged later. `pickle` is already imported in the notebook's import cell.
# NOTE(review): the file name hard-codes "Split 3" -- confirm it matches the
# active USE_SPLIT before running, or another split's file gets overwritten.
# The context manager guarantees the file is flushed and closed.
with open('drive/Shareddrives/CMCL Shared Task - CogNLP@Sheffield/Data/CMCL XLNET Preds Split 3.pkl', 'wb') as pkl_file:
    pickle.dump([val_df, preds], pkl_file)

In [None]:
def _load_split_preds(split_no):
    """Load the pickled [val_df, preds] pair saved for one split.

    split_no: the split number that appears in the file name (1, 2 or 3).
    Returns whatever object was pickled; the callers below unpack it as
    (val_df, preds).
    """
    path = f'drive/Shareddrives/CMCL Shared Task - CogNLP@Sheffield/Data/CMCL XLNET Preds Split {split_no}.pkl'
    # pickle.load can execute arbitrary code: only load files written by this
    # notebook. The context manager closes the handle after reading.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


val_df_1, preds_1 = _load_split_preds(1)
val_df_2, preds_2 = _load_split_preds(2)
val_df_3, preds_3 = _load_split_preds(3)

In [None]:
# Stack the three splits' validation frames into one frame with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0.
val_df_all = pd.concat([val_df_1, val_df_2, val_df_3], ignore_index=True)

In [None]:
# For every key in the first split's predictions, join the three splits'
# arrays end to end (split 1, then 2, then 3).
preds_all = {
    key: np.hstack((preds_1[key], preds_2[key], preds_3[key]))
    for key in preds_1
}

In [None]:
# Format validation results.
# Later cells rank `val_results` with Counter(...).most_common() and unpack
# each `val_store[val_idx]` entry as (x, y, y_hat).
val_results, val_store = val_to_plot(val_df_all, preds_all)

In [None]:
pred_review = pd.DataFrame()

# Validation sentences ordered from worst to best (highest score first).
# Ranked once and reused by both passes below.
ranked_sentences = Counter(val_results).most_common()


def _fmt3(value):
    """Render a number with three decimal places, as stored in the review lists."""
    return "{:.3f}".format(value)


def _word_key(val_idx, idx):
    """Build the '<val_idx>_<word position>' key shared by sent_word2mae and worst_words."""
    return str(val_idx) + '_' + str(idx)


# Pass 1: MAE of every word across its target columns, keyed by sentence and position.
sent_word2mae = {}
for val_idx, score in ranked_sentences:
    x, y, y_hat = val_store[val_idx]
    for idx, word in enumerate(x):
        sent_word2mae[_word_key(val_idx, idx)] = mean_absolute_error(y[idx, :], y_hat[idx, :])

# Rank of each word by MAE: 0 is the word with the largest error.
worst_words = {k: e for e, (k, v) in enumerate(Counter(sent_word2mae).most_common())}


# Parallel column lists, one entry per word, consumed when the table is built.
words = []
word_idx = []
word_badness = []
sent_badness = []
sent_mae = []
sent_nFix = []
sent_FFD = []
sent_GPT = []
sent_TRT = []
sent_fixProp = []
word_mae = []
word_nFix = []
word_FFD = []
word_GPT = []
word_TRT = []
word_fixProp = []

# Position in each tuple is the column index into y / y_hat:
# 0 -> nFix, 1 -> FFD, 2 -> GPT, 3 -> TRT, 4 -> fixProp.
sent_metric_lists = (sent_nFix, sent_FFD, sent_GPT, sent_TRT, sent_fixProp)
word_metric_lists = (word_nFix, word_FFD, word_GPT, word_TRT, word_fixProp)

# Pass 2: fill the lists. Sentence-level values are repeated once per word so
# every list ends up with the same length.
for sent_b, (val_idx, score) in enumerate(ranked_sentences):
    x, y, y_hat = val_store[val_idx]
    n_words = len(x)

    sent_badness.extend([sent_b] * n_words)
    sent_mae.extend([_fmt3(score)] * n_words)
    for col, sent_list in enumerate(sent_metric_lists):
        sent_list.extend([_fmt3(mean_absolute_error(y[:, col], y_hat[:, col]))] * n_words)

    for idx, word in enumerate(x):
        key = _word_key(val_idx, idx)
        word_idx.append(idx)
        words.append(word)
        # Reuse the per-word MAE computed in pass 1 instead of recomputing it.
        word_mae.append(_fmt3(sent_word2mae[key]))
        # Per-target columns hold the signed error (prediction minus truth).
        for col, word_list in enumerate(word_metric_lists):
            word_list.append(_fmt3(y_hat[idx, col] - y[idx, col]))
        word_badness.append(worst_words[key])



In [None]:
# Display the per-word review table.
# Source lists: words,word_idx,word_badness,sent_mae,sent_nFix,sent_FFD,sent_GPT,sent_TRT,sent_fixProp,word_mae,word_nFix,word_FFD,word_GPT,word_TRT,word_fixProp
# NOTE(review): `df` is assigned in the cell that follows this one, so on a
# fresh kernel this cell must be run after it (or moved below it).
df

In [None]:
# Assemble the review table: one row per word, word-level columns first,
# then the sentence-level columns repeated for every word of that sentence.
review_columns = {
    'word_idx': word_idx,
    'words': words,
    'word_badness': word_badness,
    'word_mae': word_mae,
    'word_nFix': word_nFix,
    'word_FFD': word_FFD,
    'word_GPT': word_GPT,
    'word_TRT': word_TRT,
    'word_fixProp': word_fixProp,
    'sent_badness': sent_badness,
    'sent_mae': sent_mae,
    'sent_nFix': sent_nFix,
    'sent_FFD': sent_FFD,
    'sent_GPT': sent_GPT,
    'sent_TRT': sent_TRT,
    'sent_fixProp': sent_fixProp,
}
# Rows are built by zipping the lists, exactly as before; the dict only keeps
# each column name next to the list that feeds it.
df = pd.DataFrame(list(zip(*review_columns.values())), columns=list(review_columns))

In [None]:
# Write the review table to the shared drive as an Excel workbook.
all_preds_path = "drive/Shareddrives/CMCL Shared Task - CogNLP@Sheffield/Data/All Preds.xlsx"
df.to_excel(all_preds_path)

In [None]:
# Histogram of the predicted fixProp values. Labelled so it can be told apart
# from the histogram of the true values; the trailing ';' hides the repr.
fig, ax = plt.subplots()
ax.hist(preds['fixProp'])
ax.set(title='Predicted fixProp (preds)', xlabel='fixProp', ylabel='count');

In [None]:
# Histogram of the true fixProp values from y_val. Labelled so it can be told
# apart from the histogram of the predictions; the trailing ';' hides the repr.
fig, ax = plt.subplots()
ax.hist(y_val['fixProp'])
ax.set(title='Observed fixProp (y_val)', xlabel='fixProp', ylabel='count');