In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import random
import csv
import json
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC 
import scipy

In [2]:
# Seed every random number generator this notebook imports so that repeated
# runs draw the same random numbers.
SEED = 123
random.seed(SEED)                 # Python stdlib RNG (imported above, previously unseeded)
np.random.seed(SEED)              # NumPy global RNG
torch.manual_seed(SEED)           # PyTorch RNG
torch.cuda.manual_seed_all(SEED)  # every CUDA device; one call is enough
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [3]:
# Complexity bin edges (0, 0.25, ..., 1) mapped to integer class labels 0..4.
original_to_standard = {0: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1: 4}
# Inverse lookup: integer class label -> complexity bin edge.
standard_to_original = {label: edge for edge, label in original_to_standard.items()}

In [4]:
def get_train_data_electra(tokens, embeddings_dict, complexity):
    """Build the feature matrix for ``tokens`` and pair it with the targets.

    Despite the name, nothing here is specific to one embedding type: the
    function only looks keys up in whatever dictionary it is given.

    Args:
        tokens: Indexable sequence of keys into ``embeddings_dict``.
        embeddings_dict: Mapping from key to a numeric vector.
        complexity: Target values; returned unchanged.

    Returns:
        Tuple ``(x_train, y_train)`` where ``x_train`` is a float32 array with
        one row per token and ``y_train`` is ``complexity`` itself.
    """
    rows = [np.array(embeddings_dict[tokens[pos]]) for pos in range(len(tokens))]
    return np.array(rows, dtype=np.float32), complexity

In [5]:
def mae(actual, pred):
    """Return the mean absolute error between ``actual`` and ``pred``.

    Both arguments are converted to NumPy arrays before the element-wise
    difference is taken.
    """
    actual_arr = np.array(actual)
    pred_arr = np.array(pred)
    return np.abs(actual_arr - pred_arr).mean()

In [6]:
def training(x_train, y_train):
    """Fit a ``LinearRegression`` model on the given features and targets.

    Returns:
        The fitted estimator.
    """
    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    return regressor

def predict(model, x):
    """Return whatever ``model.predict`` produces for the inputs ``x``."""
    return model.predict(x)

In [7]:
# Load the BERT embeddings file; json.load returns the JSON object as a dict.
# The context manager closes the file handle once parsing has finished.
with open('Embeddings/BERT_embeddings_sum.json') as f:
    embeddings_dict1 = json.load(f)

In [8]:
# Load the Electra embeddings file; json.load returns the JSON object as a dict.
# The context manager closes the file handle once parsing has finished.
with open('Embeddings/Electra_embeddings.json') as f:
    embeddings_dict2 = json.load(f)

In [9]:
# Load the GloVe embeddings file; json.load returns the JSON object as a dict.
# The context manager closes the file handle once parsing has finished.
with open('Embeddings/glove_embeddings.json') as f:
    embeddings_dict3 = json.load(f)

In [10]:
# Load the hand-crafted features file; json.load returns the JSON object as a dict.
# The context manager closes the file handle once parsing has finished.
with open('Embeddings/hand_crafted_features.json') as f:
    embeddings_dict4 = json.load(f)

In [19]:
# Rebind the BERT (dict1) and hand-crafted (dict4) names to the GloVe
# dictionary: after this line all three names refer to the same object, and
# the data loaded into dict1/dict4 above is no longer reachable through them.
embeddings_dict1 = embeddings_dict4 = embeddings_dict3

In [11]:
def ensemble(predictions):
    """Average the member predictions with equal weights.

    The previous version indexed exactly ``predictions[0]`` .. ``predictions[4]``
    and therefore raised ``IndexError`` for any ensemble with fewer than five
    members (and silently ignored members beyond the fifth). All members are
    now averaged; for exactly five members the result is unchanged.

    Args:
        predictions: Non-empty sequence of equally shaped prediction arrays
            (or scalars).

    Returns:
        The element-wise mean of all members.

    Raises:
        ValueError: If ``predictions`` is empty.
    """
    if len(predictions) == 0:
        raise ValueError("ensemble() needs at least one prediction")
    # Built-in sum adds the members left to right, the same order the
    # original hand-written expression used.
    return sum(predictions) / len(predictions)

In [12]:
def training_util(df, embeddings_dict):
    """Train the regression model on one data frame / embedding pair.

    Args:
        df: Frame with an ``'id'`` column (keys into ``embeddings_dict``) and
            a ``'complexity'`` column (targets).
        embeddings_dict: Mapping from id to feature vector.

    Returns:
        The model returned by ``training``.
    """
    features, targets = get_train_data_electra(
        df['id'].values, embeddings_dict, df['complexity'].values
    )
    return training(features, targets)

In [13]:
def testing_util(df, model, embeddings_dict):
    """Predict with ``model`` for every row of ``df``.

    Args:
        df: Frame with an ``'id'`` column (keys into ``embeddings_dict``) and
            a ``'complexity'`` column. The latter must exist because it is
            read here, but its values do not influence the prediction.
        model: Fitted model accepted by ``predict``.
        embeddings_dict: Mapping from id to feature vector.

    Returns:
        The output of ``predict(model, features)``.
    """
    ids = df['id'].values
    targets = df['complexity'].values
    features, _ = get_train_data_electra(ids, embeddings_dict, targets)
    return predict(model, features)

In [14]:
def generate_annotations(df,n=20):
    """Expand every complexity score into ``n`` integer pseudo-annotations.

    The range [0, 1] is split into four bins of width 0.25. A score lying in
    the bin ``[low, high)`` receives ``floor(alpha * n)`` copies of the label
    of ``low`` followed by the label of ``high`` for the remaining slots,
    where ``alpha = (high - score) / 0.25``; the closer a score is to the
    lower edge, the more low labels it gets. Labels are the integers 0..4,
    the same values ``original_to_standard`` assigns to the edges
    0, 0.25, 0.5, 0.75 and 1.

    Fixes over the previous version:
      * a score of exactly 1.0 matched no branch and silently reused the bin
        of the previous row (or failed on the first row); it is now placed in
        the top bin and receives only the highest label;
      * scores outside [0, 1] (including NaN) are rejected instead of being
        given a stale bin;
      * scores are read by position, so a non-default index cannot misalign rows.

    Args:
        df: Object whose ``'complexity'`` entry holds the scores.
        n: Number of pseudo-annotations per score.

    Returns:
        Integer array of shape ``(len(scores), n)``.

    Raises:
        ValueError: If a score is not within [0, 1].
    """
    scores = np.asarray(df['complexity'], dtype=float)
    annotations = np.empty((len(scores), n), dtype=int)
    for row, score in enumerate(scores):
        if not 0 <= score <= 1:
            raise ValueError(
                "complexity score at position %d is outside [0, 1]: %r" % (row, score)
            )
        # score * 4 is exact in floating point, so truncation gives the bin
        # index; min() folds score == 1.0 into the top bin.
        low_label = min(int(score * 4), 3)
        high = 0.25 * (low_label + 1)
        alpha = (high - score) / 0.25
        num_low = int(np.floor(alpha * n))
        annotations[row, :num_low] = low_label
        annotations[row, num_low:] = low_label + 1
    return annotations

def zscore(x, m, s):
    """Standardise the columns of ``x`` in place.

    Column ``c`` becomes ``(x[:, c] - m[c]) / s[c]``.

    Args:
        x: 2-D array; it is modified in place.
        m: Per-column value to subtract.
        s: Per-column value to divide by.

    Returns:
        The same array object ``x``.
    """
    column_count = len(x[0, :])
    column = 0
    while column < column_count:
        centered = x[:, column] - m[column]
        x[:, column] = centered / s[column]
        column += 1
    return x

def getx_train(tokens, embeddings_dict):
    """Stack the embeddings of ``tokens`` into a float32 feature matrix.

    Args:
        tokens: Indexable sequence of keys into ``embeddings_dict``; element
            ``i`` is read as ``tokens[i]``.
        embeddings_dict: Mapping from key to a numeric vector.

    Returns:
        float32 array with one row per token.
    """
    vectors = [np.array(embeddings_dict[tokens[pos]]) for pos in range(len(tokens))]
    return np.array(vectors, dtype = np.float32)

def classification(x, y):
    """Fit an RBF-kernel ``SVC`` (``C=1``, verbose output on) on ``(x, y)``.

    Returns:
        The fitted classifier.
    """
    clf = SVC(kernel = 'rbf', verbose=1, C = 1)
    clf.fit(x, y)
    return clf

def predict(clf, x):
    """Return the predictions of the fitted estimator ``clf`` for ``x``.

    This redefines the ``predict`` helper declared earlier in the notebook;
    both versions simply delegate to the estimator's ``predict`` method.
    """
    predicted = clf.predict(x)
    return predicted

def mae(actual, pred):
    """Mean absolute error of ``pred`` against ``actual``.

    This redefines the ``mae`` helper declared earlier in the notebook with
    the same computation.
    """
    abs_errors = np.abs(np.array(actual) - np.array(pred))
    return np.mean(abs_errors)

def train_test2(df_train,df_test, embeddings_dict1, n=20):
    """Predict complexity by averaging ``n`` classifiers.

    ``generate_annotations`` turns each training score into ``n`` integer
    labels. One classifier is fitted per label column; every classifier then
    labels the test rows, the labels are mapped back to complexity values via
    ``standard_to_original`` and averaged per test row.

    Args:
        df_train: Training frame with ``'id'`` and ``'complexity'`` columns.
        df_test: Test frame with ``'id'`` and ``'complexity'`` columns; only
            the shape/length of the latter is used.
        embeddings_dict1: Mapping from id to feature vector.
        n: Number of label columns / classifiers.

    Returns:
        Float array with one averaged prediction per test row.
    """
    annotations = generate_annotations(df_train,n)
    x_train = getx_train(df_train['id'], embeddings_dict1)
    x_test = getx_train(df_test['id'], embeddings_dict1)

    # Fit one classifier per annotation column and collect its test labels.
    label_votes = [
        predict(classification(x_train, annotations[:, annotator]), x_test)
        for annotator in range(n)
    ]
    y_test = df_test['complexity']

    # Translate every predicted class label back to its complexity value.
    votes = np.array(label_votes, dtype = float)
    for position, label in np.ndenumerate(votes):
        votes[position] = standard_to_original[int(label)]

    # Average the classifiers' votes for each test row.
    y_pred = np.zeros(y_test.shape,dtype=float)
    for item in range(len(y_test)):
        y_pred[item] = votes[:, item].mean()
    return y_pred

In [22]:
def train_test(df_train,df_test, embeddings_dict1, embeddings_dict2, embeddings_dict3):
    """Train the two active ensemble members and predict on ``df_test``.

    Members, in the order they appear in the returned list:
      0. the regression model (``training_util`` / ``testing_util``) on
         ``embeddings_dict2``;
      1. the averaged-classifier model (``train_test2`` with 50 label
         columns) on ``embeddings_dict3``.

    The previous version additionally trained three regression models whose
    predictions were never used, one of them on the module-level
    ``embeddings_dict4`` that was not a parameter, and built an unused test
    matrix. That dead work has been removed; the returned values are the same.

    Args:
        df_train: Training frame with ``'id'`` and ``'complexity'`` columns.
        df_test: Test frame with ``'id'`` and ``'complexity'`` columns.
        embeddings_dict1: Not used by the active members; kept so existing
            calls keep working.
        embeddings_dict2: Features for the regression member.
        embeddings_dict3: Features for the classification member.

    Returns:
        Tuple ``(predictions, y_test)``: the list of the two members'
        predictions and ``df_test['complexity']``.
    """
    # Read the target column first so a missing column fails before any training.
    y_test = df_test['complexity']

    regression_model = training_util(df_train, embeddings_dict2)
    predictions = [
        testing_util(df_test, regression_model, embeddings_dict2),
        train_test2(df_train,df_test, embeddings_dict3, 50),
    ]
    return predictions, y_test

In [23]:
# # Single Words

# df_train = pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_train.tsv",
#                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
# df1 = df_train.sample(frac=1,random_state=2)
# df2 = df1[df1['corpus']!='europarl']
# df3 = df1[df1['corpus']=='europarl']
# df_train = df2
# df_train = df_train.append(df3.sample(frac=0.5,random_state=2))

# # df_train = df_train.append(pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
# #                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8'))
# df_curr = pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_train.tsv",
#                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8').sample(frac=1,random_state=2)
# df_train = df_train.append(df_curr[df_curr['corpus']!='europarl'])

# df_train = df_train.reset_index(drop=True)

# df_test = pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
#                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')


# # df_test = pd.read_csv("Group 1 Dataset/LCP_Single/test1.csv")
# df_test['complexity'] = 0.00000

# predictions, y_test = train_test(df_train,df_test, embeddings_dict1, embeddings_dict2, embeddings_dict3)

# # y_pred = (predictions[0]+predictions[1]+predictions[2]+predictions[3]+predictions[4])/5

# # y_pred = (predictions[0]+predictions[1])/2

# y = pd.read_csv('Group 1 Dataset/Sagnik/1_layer_trainable96.csv',header = None)

# # y_pred = 0.6*x[1]+0.4*y[1]
# y_pred = 0.5*predictions[1]+0.5*y[1]
# y_pred = list(y_pred)

# print(scipy.stats.pearsonr(pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
#                                   delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')['complexity'],y_pred))

# df_test['Predicted Complexity'] = 0.00000
# for i in range(len(df_test)):
#     df_test['Predicted Complexity'][i] = y_pred[i]

# # df_final = df_test.drop(columns=['complexity','subcorpus','sentence','token'])

# # print(mae(pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
# #                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')['complexity'],y_pred))

# # print(scipy.stats.pearsonr(pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
# #                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')['complexity'],y_pred))

# df_final = df_test.drop(columns=['complexity','subcorpus','sentence','token'])
# df_final.to_csv('single_trial_predictions.csv',index=False, header=None)

# Single Words
# Training pool: single-word train + single-word trial + shuffled multi-word train.
df_single_train = pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_train.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_single_trial = pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_curr = pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_train.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8').sample(frac=1,random_state=2)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order the chained appends did.
df_train = pd.concat([df_single_train, df_single_trial, df_curr]).reset_index(drop=True)

df_test = pd.read_csv("Group 1 Dataset/Test/lcp_single_test.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

# train_test and its helpers read df_test['complexity'], so fill the column with zeros.
df_test['complexity'] = 0.00000

predictions, y_test = train_test(df_train,df_test, embeddings_dict1, embeddings_dict2, embeddings_dict3)

# Header-less CSV; column 1 is blended in below (presumably predictions
# produced outside this notebook - confirm the file's origin).
y = pd.read_csv('Group 1 Dataset/Sagnik/1_layer_trainable_trainandtrial_test_96.csv',header = None)

# Equal-weight blend of the classification member (predictions[1]) with column 1 of that file.
y_pred = 0.5*predictions[1]+0.5*y[1]
y_pred = list(y_pred)

# Assign the whole column at once; the former element-wise chained assignment
# (df_test[col][i] = ...) raised SettingWithCopyWarning and is not guaranteed
# to write into df_test.
df_test['Predicted Complexity'] = y_pred

df_final = df_test.drop(columns=['complexity','corpus','sentence','token'])
df_final.to_csv('single_test_predictions_3.csv',index=False,header=None)
df_final = df_test.drop(columns=['complexity'])
df_final.to_csv('single_test_predictions_temp_3.csv',index=False,header=None)

# Compare against an earlier prediction file; it must already exist on disk.
print(mae(pd.read_csv('single_test_predictions_1.csv',header = None)[1],pd.read_csv('single_test_predictions_3.csv',header = None)[1]))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]0.015045486590789477


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
# # MWEs

# df_train = pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_train.tsv",
#                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
# # df_train = df_train.append(pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_trial.tsv",
# #                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8'))
# df_curr = pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_train.tsv",
#                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8').sample(frac=1,random_state=2)[:2073]
# df_train = df_train.append(df_curr[df_curr['corpus']!='europarl'])
# # df_train = df_train.append(pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_trial.tsv",
# #                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8'))
# # df_train = df_train.append(df_temp[df_temp['corpus']=='biomed'])
# df_train = df_train.reset_index(drop=True)

# # df_train = pd.read_csv("Group 1 Dataset/LCP_Multi/trainfinal.csv")
# df_test = pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_trial.tsv",
#                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
# df_test['complexity'] = 0.00000

# predictions, y_test = train_test(df_train,df_test, embeddings_dict1, embeddings_dict2, embeddings_dict3)

# # y_pred = (predictions[0]+predictions[1]+predictions[2]+predictions[3]+predictions[4]+predictions[5]+predictions[6])/7
# # y_pred = (predictions[0]+predictions[1]+predictions[2]+predictions[3]+predictions[4])/5

# y_pred = (predictions[0]+predictions[1])/2


# df_test['Predicted Complexity'] = 0.00000

# for i in range(len(df_test)):
#     df_test['Predicted Complexity'][i] = y_pred[i]

# df_final = df_test.drop(columns=['complexity','subcorpus','sentence','token'])

# # print(mae(pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_trial.tsv",
# #                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')['complexity'],y_pred))

# print(scipy.stats.pearsonr(pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_trial.tsv",
#                               delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')['complexity'],y_pred))
# df_final = df_test.drop(columns=['complexity','subcorpus','sentence','token'])
# df_final.to_csv('multi_trial_predictions.csv',index=False,header=None)

# MWEs
# Training pool: multi-word train + multi-word trial + the first 1900 rows of
# the shuffled single-word train set.
df_multi_train = pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_train.tsv", 
                       delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_multi_trial = pd.read_csv("Group 1 Dataset/Dataset 2/lcp_multi_trial.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_curr = pd.read_csv("Group 1 Dataset/Dataset 1/lcp_single_train.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8').sample(frac=1,random_state=2)[:1900]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order the chained appends did.
df_train = pd.concat([df_multi_train, df_multi_trial, df_curr]).reset_index(drop=True)

df_test = pd.read_csv("Group 1 Dataset/Test/lcp_multi_test.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
# train_test and its helpers read df_test['complexity'], so fill the column with zeros.
df_test['complexity'] = 0.00000

predictions, y_test = train_test(df_train,df_test, embeddings_dict1, embeddings_dict2, embeddings_dict3)

# Equal-weight average of the regression and classification members.
y_pred = (predictions[0]+predictions[1])/2

# Assign the whole column at once; the former element-wise chained assignment
# (df_test[col][i] = ...) raised SettingWithCopyWarning and is not guaranteed
# to write into df_test.
df_test['Predicted Complexity'] = y_pred

df_final = df_test.drop(columns=['complexity','corpus','sentence','token'])
df_final.to_csv('multi_test_predictions_3.csv',index=False,header=None)
df_final = df_test.drop(columns=['complexity'])
df_final.to_csv('multi_test_predictions_temp_3.csv',index=False,header=None)

# Compare against an earlier prediction file; it must already exist on disk.
print(mae(pd.read_csv('multi_test_predictions.csv',header = None)[1],pd.read_csv('multi_test_predictions_3.csv',header = None)[1]))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]0.013234233026919154


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
