In [None]:
# Install wordfreq, which provides the zipf_frequency function imported in the next cell.
!pip install wordfreq

In [None]:
import os
from google.colab import drive
import pandas as pd
from sklearn import preprocessing
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

from collections.abc import Iterable
from sklearn.svm import SVR
from collections import Counter
from wordfreq import zipf_frequency
from string import punctuation

import spacy
from spacy.tokens import Doc

nlp = spacy.load('en_core_web_sm')
# The input is already tokenized, so replace spaCy's tokenizer with one that wraps a
# list of words in a Doc. `Tokenizer.tokens_from_list` was deprecated in spaCy v2 and
# removed in v3; constructing the Doc from `words` is the documented replacement.
nlp.tokenizer = lambda words: Doc(nlp.vocab, words=words)

# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Define splits
# NOTE(review): each tuple presumably holds (row_start, row_end, sentence_id_start,
# sentence_id_end) for one validation fold - TODO confirm; no visible cell reads USE_SPLIT.

SPLIT_1 = (7839,10490,400,533)
SPLIT_2 = (10490,13082,533,667)
SPLIT_3 = (13082,15737,667,800)
SPLIT_ALL = (15737,15737,800,800)

# Split selected for this run.
USE_SPLIT=SPLIT_1

In [None]:
# Helper Functions

def pos_from_tokensized(sent):
  """Return one spaCy POS tag (`token.pos_`) per word of a pre-tokenized sentence.

  Args:
    sent: list of word strings making up one sentence. The list is not modified.

  Returns:
    A list of POS tag strings with the same length as `sent` (empty for empty input).
  """
  if len(sent) == 0:
    return []

  # Work on a copy so the caller's list is left untouched.
  words = list(sent)
  words[-1] = words[-1][:-1]    # Remove full-stop (drops the final character of the last word)
  words = [word.replace(',', '') for word in words]

  # Build the Doc from the given words and run each pipeline component on it, so the
  # result does not depend on which tokenizer is installed on `nlp` or on `nlp(...)`
  # accepting a list.
  doc = Doc(nlp.vocab, words=words)
  for _, component in nlp.pipeline:
    doc = component(doc)
  return [token.pos_ for token in doc]


def add_seq_states(n_forward, n_backward, vars, df=None):
  """Add columns holding the values of neighbouring words within the same sentence.

  For every name in `vars`, a column `NEXT_<k>_<var>` is added for k = 1..n_forward and a
  column `PREV_<k>_<var>` for k = 1..n_backward. Row i receives the value of `var` from
  row i+k (or i-k) when that row exists and has the same `sentence_id`; otherwise it
  receives a zero of the same shape as the first value of `var` (0 for scalars, a list
  of zeros for list-valued columns).

  Args:
    n_forward: number of following words to look at.
    n_backward: number of preceding words to look at.
    vars: names of the source columns.
    df: DataFrame to extend in place; defaults to the notebook-level `data` frame.

  Returns:
    The names of the added columns, in creation order.
  """
  if df is None:
    df = data

  # Plain Python lists give positional access that does not depend on the frame's index labels.
  sentence_ids = df['sentence_id'].tolist()
  n_rows = len(sentence_ids)

  print_vars = []
  for var in vars:
    values = df[var].tolist()
    steps = [('NEXT', step, step) for step in range(1, n_forward+1)]
    steps += [('PREV', step, -step) for step in range(1, n_backward+1)]

    for prefix, step, offset in steps:
      built_var = f'{prefix}_{step}_{var}'
      print_vars.append(built_var)

      shifted = []
      for sample in range(n_rows):
        neighbour = sample + offset
        if 0 <= neighbour < n_rows and sentence_ids[sample] == sentence_ids[neighbour]:
          shifted.append(values[neighbour])
        else:
          # Fresh zero per row so list-valued cells never share one list object.
          shifted.append(np.zeros_like(values[0]).tolist())

      # Assign the whole column at once; element-wise chained assignment
      # (df[col][i] = ...) does not write through under pandas Copy-on-Write.
      df[built_var] = pd.Series(shifted, index=df.index)

  return(print_vars)

def flatten(items):
  """Lazily yield the elements of `items`, descending into nested lists depth-first.

  Only `list` instances are expanded; every other value (tuples and strings included)
  is yielded unchanged.
  """
  # Stack of iterators: the top one is the list currently being walked.
  pending = [iter(items)]
  while pending:
    for element in pending[-1]:
      if isinstance(element, list):
        # Descend into the nested list; the outer iterator resumes afterwards.
        pending.append(iter(element))
        break
      yield element
    else:
      # Current iterator exhausted - go back to its parent.
      pending.pop()

def plot_sent(sent, y, y_hat):
  """Plot true vs. predicted values of the five targets for one sentence.

  Args:
    sent: list of the sentence's words, used as x tick labels.
    y: 2-D array indexed [word, target] with the reference values; columns are read
      in the order nFix, FFD, GPT, TRT, fixProp.
    y_hat: array with the same layout as `y` holding the predictions.

  Reference values are drawn as solid lines, predictions as dashed lines in the same
  colour. The title shows the mean absolute error between `y` and `y_hat`.
  """
  # (target name, line colour); the position in this list is the column index in y / y_hat.
  target_colors = [
    ('nFix', 'darkblue'),
    ('FFD', 'darkred'),
    ('GPT', 'darkorange'),
    ('TRT', 'darkgreen'),
    ('fixProp', 'deeppink'),
  ]
  positions = list(range(len(sent)))

  plt.subplots()  # start a new figure for this sentence
  plt.xticks(positions, sent, rotation=-90)  # Set text labels and properties.

  for idx, (target, color) in enumerate(target_colors):
    plt.plot(positions, y[:,idx], color=color, label=target)
    plt.plot(positions, y_hat[:,idx], '--', color=color)

  # One legend column per labelled series (only the solid lines carry labels).
  plt.legend(loc='upper right', ncol=len(target_colors))
  plt.tight_layout()
  plt.ylim((0,110))
  plt.title('MAE '+str(float("{0:.5f}".format(mean_absolute_error(y, y_hat)))))
  plt.grid()
  plt.show()

def val_to_plot(val_df, preds):
  """Split a frame into sentences and score the predictions sentence by sentence.

  Args:
    val_df: DataFrame with columns 'sentence_id', 'word' and the five target columns
      nFix, FFD, GPT, TRT, fixProp. Rows of one sentence must be contiguous.
    preds: mapping from target name to a sequence of predictions, positionally aligned
      with the rows of `val_df`.

  Returns:
    (val_results, val_store): `val_results[i]` is the mean absolute error of the i-th
    sentence, `val_store[i]` is `[words, y, y_hat]` for that sentence.
  """
  targets = ['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']
  sent_ids = val_df['sentence_id'].tolist()
  n_rows = len(sent_ids)

  # Half-open (start, end) row spans, one per run of equal sentence ids. The end of the
  # last span is derived from val_df itself, not from any other frame.
  sent_boundaries = []
  sent_start = 0
  for idx in range(1, n_rows):
    if sent_ids[idx] != sent_ids[idx - 1]:
      sent_boundaries.append((sent_start, idx))
      sent_start = idx
  if n_rows > 0:
    sent_boundaries.append((sent_start, n_rows))

  val_results = {}
  val_store = {}

  for idx, (start, end) in enumerate(sent_boundaries):
    sent_rows = val_df.iloc[start:end]
    x = sent_rows['word'].tolist()
    y = sent_rows[targets].values
    # Stack the per-target prediction slices into the same [word, target] layout as y.
    y_hat = np.vstack([preds[target][start:end] for target in targets]).transpose()
    assert y_hat.shape == y.shape
    val_results[idx] = float(mean_absolute_error(y, y_hat))
    val_store[idx] = [x, y, y_hat]

  return(val_results, val_store)

In [None]:
# Download the archive from CodaLab as CMCL2021_train.zip and unpack it into the working directory.
# NOTE(review): later cells read training_data/training_data.csv and test_data/test_data.csv;
# presumably both come out of this archive - TODO confirm.
!wget -O CMCL2021_train.zip  https://competitions.codalab.org/my/datasets/download/79675cf7-c344-478d-97f6-d4cfadb4a54a
!unzip CMCL2021_train.zip

In [None]:
# Load the training data with some preprocessed additions
# NOTE(review): later cells operate on a frame called `data`, which no visible cell defines;
# presumably it is derived from `train_data` - TODO confirm and add the missing assignment.

train_data = pd.read_csv('training_data/training_data.csv')
# Remove EOS as redundant by word_id
train_data['word'] = train_data['word'].str.replace('<EOS>', '')

In [None]:
# Load the test data and strip the '<EOS>' marker from the words, as done for the training data.
test_data = pd.read_csv('test_data/test_data.csv')
test_data['word'] = test_data['word'].str.replace('<EOS>', '')

In [None]:
# Load NN Features
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling - only load
# embedding files that come from a trusted source.
# The training path is an f-string so that FOLD is actually substituted into the file name;
# FOLD must be defined in an earlier cell.
embs = pd.read_pickle(f'Feature Enhancement/CMCL_XLNET_Fold_{str(FOLD)}_Embeddings.pkl')
# One flattened embedding per row, taken in ascending key order of the pickled dict
# (assumes the sorted keys follow the row order of `data` - TODO confirm).
data['BERT_embed'] = [v[0].reshape(-1).tolist() for k, v in sorted(embs.items(), key=lambda item: item[0])]

test_embs = pd.read_pickle('Feature Enhancement/CMCL_XLNET_Test_Embeddings.pkl')
test_data['BERT_embed'] = [v[0].reshape(-1).tolist() for k, v in sorted(test_embs.items(), key=lambda item: item[0])]

In [None]:
# POS-tag the test words one sentence at a time.
# Materialise the columns once; positions in these lists are row positions in test_data.
words = test_data['word'].tolist()
word_ids = test_data['word_id'].tolist()

# A new sentence starts wherever word_id drops below the previous row's word_id.
sent_starts = [0] + [pos for pos in range(1, len(word_ids)) if word_ids[pos - 1] > word_ids[pos]]
sent_ends = sent_starts[1:] + [len(words)]

all_poses = []
for sent_start, sent_end in zip(sent_starts, sent_ends):
  # Slicing hands the tagger its own copy of the sentence.
  sentence = words[sent_start:sent_end]
  sent_poses = pos_from_tokensized(sentence)

  assert len(sent_poses) == len(sentence)
  all_poses.extend(sent_poses)

test_data['pos'] = all_poses

In [None]:
# Make some variables categorical

vars_to_cat = ['word', 'pos']

for var in vars_to_cat:
  # Fit one encoder per variable on the training column only.
  le = preprocessing.LabelEncoder()
  data[f'CAT_{var}'] = le.fit_transform(data[var])

  # Encode the test column with the SAME label -> code mapping, so that a given word or
  # POS tag gets the same integer in train and test. Labels never seen in training are
  # encoded as -1.
  code_by_label = {label: code for code, label in enumerate(le.classes_)}
  test_data[f'CAT_{var}'] = test_data[var].map(code_by_label).fillna(-1).astype(int)

In [None]:
# Add EOS, Word Length, Word Frequency for Train

train_word_ids = data['word_id']

# A word is end-of-sentence unless the next row continues with a larger word_id. The
# last row compares against NaN, which is False, so it is flagged as EOS as well.
# shift() works on row positions, so this does not depend on the frame's index labels.
data['Is_EOS'] = (~(train_word_ids < train_word_ids.shift(-1))).astype(int)
# A word is start-of-sentence when its word_id is 0.
data['Is_SOS'] = (train_word_ids == 0).astype(int)
data['word_len'] = [len(word) for word in data['word']]
# Zipf frequency of the word with trailing punctuation stripped.
data['zipf_frequency'] = [zipf_frequency(word.rstrip(punctuation), 'en', wordlist='large') for word in data['word']]

In [None]:
# Add EOS, Word Length, Word Frequency for Test

test_word_ids = test_data['word_id']

# A word is end-of-sentence unless the next row continues with a larger word_id. The
# last row compares against NaN, which is False, so it is flagged as EOS as well.
# shift() works on row positions, so this does not depend on the frame's index labels.
test_data['Is_EOS'] = (~(test_word_ids < test_word_ids.shift(-1))).astype(int)
# A word is start-of-sentence when its word_id is 0.
test_data['Is_SOS'] = (test_word_ids == 0).astype(int)
test_data['word_len'] = [len(word) for word in test_data['word']]
# Zipf frequency of the word with trailing punctuation stripped.
test_data['zipf_frequency'] = [zipf_frequency(word.rstrip(punctuation), 'en', wordlist='large') for word in test_data['word']]

In [None]:
# Show the last ten rows to eyeball the newly added feature columns.
data.tail(10)

In [None]:
# Add features from previous/subsequent words

# Source columns whose neighbouring-word values are copied into NEXT_/PREV_ columns.
seq_source_vars = ['BERT_embed', 'Is_EOS', 'Is_SOS', 'word_len', 'zipf_frequency', 'CAT_word']

print_vars = add_seq_states(n_forward=1, n_backward=1, vars=seq_source_vars)
print('Added:')
# Print the new column names as a quoted, comma-separated list.
print("'" + "', '".join(print_vars) + "'")

In [None]:
# List every column now available on the training frame.
data.columns

In [None]:
# Pick features to train with in 'feature_columns'

feature_columns = ['word_len', 'zipf_frequency', 'CAT_pos', 'Is_EOS', 'Is_SOS', 'BERT_embed']

# Every row becomes one flat feature vector: list-valued cells are expanded in place by flatten().
train_rows = [list(flatten(row)) for row in data[feature_columns].values]
X_train = np.vstack(train_rows).tolist()
y_train = data[['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']]

test_rows = [list(flatten(row)) for row in test_data[feature_columns].values]
X_test = np.vstack(test_rows).tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the same
# transformation to the test features.
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [None]:
# Run the task with the model specified
from sklearn.linear_model import ElasticNetCV

model = ElasticNetCV

# Candidate L1/L2 mixing ratios handed to the cross-validated search.
l1_ratio_grid = [0.2,0.4,0.5,0.6,0.7,0.8]

preds = {}
total_mae = 0
for target in ['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']:
  print('fitting', target)

  # One independent regressor per target column.
  reg = model(l1_ratio=l1_ratio_grid, max_iter=20000)
  reg.fit(X_train_scale, y_train[target])

  # Clamp the predictions to the range [0, 100].
  preds[target] = np.clip(reg.predict(X_test_scale), 0, 100)

In [None]:
# Store each target's predictions as a column on the test frame.
prediction_targets = ('nFix', 'FFD', 'GPT', 'TRT', 'fixProp')
for target in prediction_targets:
    test_data[target] = preds[target]

In [None]:
# Keep only the identifier columns and the five predicted targets, in this column order.
submission_columns = ['sentence_id', 'word_id', 'word', 'nFix', 'FFD', 'GPT', 'TRT', 'fixProp']
submit_test_df = test_data[submission_columns]

In [None]:
# Display the submission frame for a visual check before it is written out.
submit_test_df

In [None]:
# Write the submission CSV (without the index column) to the shared Drive folder.
# NOTE(review): `drive` is imported from google.colab but no visible cell mounts it, so the
# 'drive/...' path presumably relies on a mount done elsewhere - TODO confirm.
# NOTE(review): the file name says "alpha=0.6", while the model cell searches a grid of
# l1_ratio values - confirm the name still describes the run.
submit_test_df.to_csv('drive/Shareddrives/CMCL Shared Task - CogNLP@Sheffield/Predicitons/ElasticNetCV-alpha=0.6_word_len+zipf_freq+CAT_pos+Is_EOS+Is_SOS+XLNET_A.csv', index=False)

In [None]:
# Format validation results 
# Splits the frame into sentences and computes a per-sentence MAE.
# NOTE(review): the target columns of submit_test_df were filled from `preds` in an earlier
# cell, so the reference values and the predictions compared here are the same numbers.
val_results, val_store = val_to_plot(submit_test_df, preds)

In [None]:
# Plot every sentence, highest MAE first.
ranked_sentences = sorted(val_results.items(), key=lambda pair: pair[1], reverse=True)
for val_idx, score in ranked_sentences:
    x, y, y_hat = val_store[val_idx]
    plot_sent(x, y, y_hat)
    print()