### Helper functions 

In [183]:
import pandas as pd
import numpy  as np
import copy
import random
import pickle
import itertools
import math
import sys
import csv

import matplotlib.pyplot as plt 
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms

import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn import metrics

import tensorly as tl
from tensorly.decomposition import parafac
from tensorly.decomposition import tucker
from tensorly import tucker_to_tensor
from tensorly.decomposition import robust_pca

import lightgbm as lgb


* Functions about tensor operations

In [184]:
# FOR F1 TENSOR
def get_dict_f1(path, metric="f1"):
    """Load the pickled list of metric dictionaries and return the F1 one.

    Args:
        path: path of a pickle file holding a sequence of dictionaries; the
            first element is returned for ``metric="f1"``.
        metric: name of the metric to extract. Only ``"f1"`` is supported.

    Returns:
        The first element of the unpickled sequence.

    Raises:
        ValueError: if `metric` is not ``"f1"`` (previously this surfaced as
            an UnboundLocalError on the return statement).
    """
    if metric != "f1":
        raise ValueError(
            "Unsupported metric {!r}; only 'f1' is available".format(metric)
        )
    # NOTE: pickle.load can execute arbitrary code - only open trusted files.
    with open(path, "rb") as f:
        tensorDict_list = pickle.load(f)
    return tensorDict_list[0]
     

def iter_dict_f1(tensor_dict, mode_dict, dept=0):
    """Walk a nested dict and record the labels seen at every nesting level.

    Each key found at depth `dept` is added to ``mode_dict[dept]``; each leaf
    (non-dict) value is added to ``mode_dict[dept + 1]``. `mode_dict` is
    mutated in place and nothing is returned.
    """
    for key, child in tensor_dict.items():
        mode_dict[dept].add(key)
        if not isinstance(child, dict):
            # Leaf: its value belongs to the level just below its key.
            mode_dict[dept + 1].add(child)
        else:
            iter_dict_f1(child, mode_dict, dept + 1)
            
    
def get_modes_f1(tensorDict, numMode):
    """Collect the distinct labels found at each nesting level of `tensorDict`.

    Returns a dict ``{level: set}`` with ``numMode + 1`` entries: one per key
    level, plus one extra level that receives the leaf values.
    """
    # One set per key level and one more for the values stored at the leaves.
    levels = dict((depth, set()) for depth in range(numMode + 1))
    iter_dict_f1(tensorDict, levels)
    return levels
    
def order_buckets_f1(d):
    """Return a deep copy of `d` whose bucket dicts are re-keyed 0, 1, 2, ...

    `d` is nested as model -> dataset -> attribute -> {bucket_key: value}.
    The values keep their existing iteration order; only the bucket keys are
    replaced by their position. The input is not modified.
    """
    renumbered = copy.deepcopy(d)
    for model, datasets in d.items():
        for dataset, attributes in datasets.items():
            for attribute, buckets in attributes.items():
                renumbered[model][dataset][attribute] = dict(enumerate(buckets.values()))
    return renumbered

# Directions: model -> dataset -> attribute -> bucket
def get_index_f1(mode_dict):
    """Build, for every mode, a lookup from label to position on that axis.

    Returns a list with one dict per entry of `mode_dict` (in `mode_dict`
    order); each dict maps a label to its enumeration index within that mode's
    collection.
    """
    return [
        {label: pos for pos, label in enumerate(list(mode_dict[mode]))}
        for mode in mode_dict
    ]


def read_dict_f1(dataDict, maplist):
    """Follow the key path `maplist` through the nested dict `dataDict`.

    Args:
        dataDict: nested dictionary.
        maplist: non-empty sequence of keys, outermost first.

    Returns:
        The value at the end of the path, or None when a key along the path
        is absent (or is explicitly mapped to None).
    """
    first, rest = maplist[0], maplist[1:]
    value = dataDict.get(first)
    # Identity test on purpose: `== None` would call the value's own __eq__,
    # which for array-like leaves yields an array rather than a bool.
    if value is None:
        return None
    if rest:
        return read_dict_f1(value, rest)
    return value

    
def read_tensor_f1(tensor, maplist):
    """Return the entry of `tensor` addressed by the index path `maplist`.

    Indices are applied one axis at a time, i.e. the result is
    ``tensor[maplist[0]][maplist[1]]...``. `maplist` must be non-empty.
    """
    node = tensor[maplist[0]]
    for idx in maplist[1:]:
        node = node[idx]
    return node


def write_tensor_f1(tensor, maplist, val):
    """Store `val` in the cell of `tensor` addressed by the index path `maplist`.

    The path is followed one axis at a time; every intermediate step must
    yield a numpy array (enforced with an assert). If the final position still
    holds an array, i.e. the path is shorter than the tensor's rank, a warning
    is printed and the assignment is carried out anyway. Returns None.
    """
    node = tensor
    last = maplist[0]
    for nxt in maplist[1:]:
        child = node[last]
        assert(isinstance(child, np.ndarray))
        node, last = child, nxt

    if isinstance(node[last], np.ndarray):
        print("Maplist length not correct!")
    node[last] = val
    
    
# Construct a tensor of dimensions *argv    
def init_tensor_f1(*argv):
    """Allocate a float32 tensor with one axis per positional argument.

    The cells are pre-filled with the running numbers 0, 1, 2, ... (row-major)
    as placeholders.
    """
    n_cells = np.prod(argv, dtype=int)
    placeholder = np.arange(n_cells).reshape(argv)
    return tl.tensor(placeholder, dtype=tl.float32)


def convert_index_f1(p, indDict):
    """Translate a path of labels into the matching list of axis positions.

    The i-th label of `p` is looked up in ``indDict[i]``.
    """
    return [indDict[axis][label] for axis, label in enumerate(p)]

    
def construct_tensor_f1(modeDict, tensorDict, indexDict):
    """Build a dense tensor from the nested result dictionary.

    Args:
        modeDict: ``{level: labels}`` as produced by ``get_modes_f1``. Its last
            entry (the leaf values) is ignored; the other entries define the
            tensor axes.
        tensorDict: nested dict of scores, one nesting level per axis.
        indexDict: per-axis label -> position lookups (see ``get_index_f1``).

    Returns:
        A float32 tensor with one axis per mode. Every label combination is
        written; combinations absent from `tensorDict` are reported on stdout
        and stored as NaN.
    """
    # Drop the last level: it holds the leaf values, not an axis.
    axes = [list(labels) for labels in modeDict.values()][:-1]
    X = init_tensor_f1(*[len(axis) for axis in axes])

    # Every combination of labels is a candidate cell.
    for labels in itertools.product(*axes):
        cell = convert_index_f1(labels, indexDict)
        val = read_dict_f1(tensorDict, labels)
        if val is None:
            print("MISSING DATA AT "+ str(labels))
            # Explicit NaN instead of relying on None being coerced on write.
            val = np.nan
        write_tensor_f1(X, cell, val)
    return X
        
def all_paths_f1(modeDict):
    """List every label combination across all modes except the last one.

    The last entry of `modeDict` (the leaf values) is skipped; the result is
    the Cartesian product of the remaining modes, as a list of tuples.
    """
    axes = [list(labels) for labels in modeDict.values()]
    return list(itertools.product(*axes[:-1]))  # all possible entries


def convert_to_df_f1(tensor, mode, index):
    """Flatten the 4-mode F1 tensor into a long-format DataFrame.

    Args:
        tensor: tensor indexed by (model, dataset, attribute, bucket).
        mode: mode dictionary whose last entry is ignored (see ``all_paths_f1``).
        index: per-axis label -> position lookups.

    Returns:
        A DataFrame with one row per tensor cell and the columns ``model``,
        ``dataset``, ``attribute``, ``bucket`` and ``f1``. The four label
        columns hold the ``repr`` of each label, so string labels keep their
        surrounding quotes (``categorize_df_f1`` and ``mask_out_test_y_f1``
        strip them again) and buckets are digit strings.
    """
    columns = ['model', 'dataset', "attribute", "bucket"]
    paths = all_paths_f1(mode)

    # Build the label columns directly. The previous version assigned through
    # df.iloc[i, :]['features'], a chained assignment that writes to a
    # temporary under pandas copy-on-write, and then re-split the stringified
    # tuple on ', ', which breaks for labels containing that separator.
    labels = [[repr(part) for part in path] for path in paths]
    df = pd.DataFrame(labels, index=range(len(paths)), columns=columns)
    df['f1'] = [read_tensor_f1(tensor, convert_index_f1(path, index)) for path in paths]
    return df

def categorize_df_f1(df, ind_lst):
    """Replace the quoted label columns of `df` by their integer tensor indices.

    Args:
        df: frame whose last two columns are ``bucket`` and the score; every
            earlier column holds quoted labels such as ``"'bert'"``.
        ind_lst: per-column label -> index lookups, in column order.

    Returns:
        A new DataFrame (the input is left untouched) in which the label
        columns are integer-encoded and ``bucket`` is numeric.

    Raises:
        KeyError: if a label is missing from the corresponding lookup.
    """
    df_new = copy.deepcopy(df)
    for i, col in enumerate(df.columns[:-2]):
        lookup = ind_lst[i]
        # Cells look like "'label'": drop the first and last character to get
        # the raw label. Assigning the whole column at once also gives it a
        # numeric dtype instead of leaving ints inside an object column.
        df_new[col] = [lookup[str(cell)[1:-1]] for cell in df[col]]
    df_new['bucket'] = pd.to_numeric(df_new['bucket'])
    return df_new
        

def scale_f1(tensor, mode_dict, index_dict):
    """Standardize all entries of `tensor` to zero mean and unit variance.

    The mean and the (population) standard deviation are computed over every
    cell of the tensor.

    Args:
        tensor: numeric array of any shape.
        mode_dict: unused; kept so existing callers keep working.
        index_dict: unused; kept so existing callers keep working.

    Returns:
        ``(scaled, mean, sd)`` where ``scaled`` is a new array holding
        ``(tensor - mean) / sd``; the input is not modified. A constant tensor
        has ``sd == 0`` and yields non-finite values.
    """
    values = np.asarray(tensor)
    m, sd = np.mean(values), np.std(values)
    # One vectorised expression replaces the per-cell read/write loop, which
    # walked every index path and truncated the result to the input dtype.
    tensor2 = (values - m) / sd
    return tensor2, m, sd

def scale_back_f1(val, mean, sd):
    """Undo a standardization: map `val` back to the original scale."""
    return mean + sd * val

In [185]:
def get_modes(df, col_indices):
    """Collect the unique values of the selected columns, one mode per column.

    Args:
        df: source DataFrame.
        col_indices: positional column indices; the k-th entry becomes mode k.

    Returns:
        ``(modeDict, colInd)``: ``modeDict[k]`` is the set of unique values in
        column ``col_indices[k]`` and ``colInd`` maps each column position back
        to its mode number k.
    """
    modeDict = {}
    colInd = {}
    for mode, c in enumerate(col_indices):
        modeDict[mode] = set(pd.unique(df.iloc[:, c]))
        colInd[c] = mode
    return modeDict, colInd

def get_index(mode_dict):
    """Return one label -> position dict per mode, in `mode_dict` order.

    Positions follow the enumeration order of each mode's collection.
    """
    lookups = []
    for mode in mode_dict:
        labels = list(mode_dict[mode])
        lookups.append(dict(zip(labels, range(len(labels)))))
    return lookups


def all_paths(mode_dict):
    """Return the Cartesian product of all modes as a list of label tuples."""
    axes = (list(labels) for labels in mode_dict.values())
    return list(itertools.product(*axes))  # all possible entries


def init_tensor(modeDict):
    """Create an all-zero tensor with one axis per mode.

    The length of each axis is the number of labels in the corresponding
    entry of `modeDict`, taken in `modeDict` order.
    """
    shape = [len(modeDict[mode]) for mode in modeDict]
    return tl.zeros(shape)

def read_tensor(tensor, maplist):
    """Return ``tensor[maplist[0]][maplist[1]]...`` for a non-empty index path."""
    head, tail = maplist[0], maplist[1:]
    item = tensor[head]
    return read_tensor(item, tail) if tail else item

def scale(df, cols):
    """Return a copy of `df` in which the selected columns are standardized.

    Each entry of `cols` is used as a positional column selector for
    ``DataFrame.iloc``; the selected data is replaced by the result of
    ``preprocessing.scale`` applied to the original values. `df` itself is not
    modified.
    """
    scaled = df.copy()
    for selector in cols:
        standardized = preprocessing.scale(df.iloc[:, selector])
        scaled.iloc[:, selector] = standardized
    return scaled

def path_to_row(df, source='Source', target='Target'):
    """Map every (source, target) pair in `df` to its row position.

    Positions are 0-based row numbers. If a pair occurs more than once, the
    last occurrence wins.
    """
    pairs = zip(df[source], df[target])
    return {pair: position for position, pair in enumerate(pairs)}
        
def get_val(df, path, rows):
    """Look up the value for a (source, target, column) path.

    Args:
        df: DataFrame holding the data.
        path: sequence ``(source, target, column_label)``.
        rows: mapping from ``(source, target)`` to the row *position* in `df`,
            as produced by ``path_to_row``.

    Returns:
        The cell value, or None when the (source, target) pair is not in `rows`.
    """
    s, t, b = path[0], path[1], path[2]
    r = rows.get((s, t))
    if r is None:
        return None
    # `rows` stores positional row numbers, so the row must be selected by
    # position; a label-based .loc lookup is only right for a default RangeIndex.
    return df[b].iloc[r]
    
def convert_index(p, indDict):
    """Map each label of path `p` to its position via ``indDict[axis]``."""
    positions = []
    for axis in range(len(p)):
        positions.append(indDict[axis][p[axis]])
    return positions

def write_tensor(tensor, maplist, val):
    """Assign `val` to the cell of `tensor` addressed by the index path `maplist`.

    Intermediate steps must yield numpy arrays (asserted). When the final
    position still holds an array, a warning is printed and the assignment is
    performed anyway. Returns None.
    """
    first, rest = maplist[0], maplist[1:]

    if not rest:
        # Last index of the path: this is where the value goes.
        if isinstance(tensor[first], np.ndarray):
            print("Maplist length not correct!")
        tensor[first] = val
        return None

    assert(isinstance(tensor[first], np.ndarray))
    return write_tensor(tensor[first], rest, val)
    
def convert_paths(from_names, indexDict):
    """Convert every label path in `from_names` to a list of axis positions."""
    return [convert_index(path, indexDict) for path in from_names]

    
def fill_tensor(tensor, df, path_lst, index_lst, source='Source', target='Target'):
    """Copy values from `df` into `tensor` and report the cells without data.

    Args:
        tensor: tensor to fill; it is modified in place.
        df: DataFrame holding one row per (source, target) pair.
        path_lst: iterable of ``(source, target, column_label)`` paths.
        index_lst: per-axis label -> position lookups.
        source: name of the column holding the first path element.
        target: name of the column holding the second path element.

    Returns:
        ``(tensor, missing)`` where ``missing`` lists the paths whose pair is
        absent from `df` or whose value is NaN; those cells are set to 0.
    """
    row_language = path_to_row(df, source=source, target=target)
    missing = []
    for p in path_lst:
        val = get_val(df, p, row_language)
        index = convert_index(p, index_lst)
        # `is None` rather than `!= None`, so the check never goes through the
        # value's own comparison operator.
        if val is None or math.isnan(val):
            write_tensor(tensor, index, 0)
            missing.append(p)
        else:
            write_tensor(tensor, index, val)
    return tensor, missing

                    
def create_mask(missing, tensor):
    """Build a float32 mask shaped like `tensor`: 0 at `missing` cells, 1 elsewhere.

    `missing` is an iterable of index paths (one position per axis).
    """
    mask = tl.ones(tensor.shape, dtype=tl.float32)
    for cell in missing:
        write_tensor(mask, cell, 0)
    return mask



In [186]:
def convert_to_df_cws(tensor, mode, index):
    """Flatten the 4-mode CWS tensor into a long-format DataFrame.

    Same as ``convert_to_df_f1`` except for the axis order, which here is
    (dataset, model, attribute, bucket).

    Args:
        tensor: tensor indexed by (dataset, model, attribute, bucket).
        mode: mode dictionary whose last entry is ignored (see ``all_paths_f1``).
        index: per-axis label -> position lookups.

    Returns:
        A DataFrame with one row per tensor cell and the columns ``dataset``,
        ``model``, ``attribute``, ``bucket`` and ``f1``. The four label columns
        hold the ``repr`` of each label, so string labels keep their
        surrounding quotes (``categorize_df_cws`` and ``mask_out_test_y_cws``
        strip them again) and buckets are digit strings.
    """
    columns = ['dataset', 'model', "attribute", "bucket"]
    paths = all_paths_f1(mode)

    # Build the label columns directly. The previous version assigned through
    # df.iloc[i, :]['features'], a chained assignment that writes to a
    # temporary under pandas copy-on-write, and then re-split the stringified
    # tuple on ', ', which breaks for labels containing that separator.
    labels = [[repr(part) for part in path] for path in paths]
    df = pd.DataFrame(labels, index=range(len(paths)), columns=columns)
    df['f1'] = [read_tensor_f1(tensor, convert_index_f1(path, index)) for path in paths]
    return df

def categorize_df_cws(df, ind_lst):
    """Replace the quoted label columns of `df` by their integer tensor indices.

    Args:
        df: frame whose last two columns are ``bucket`` and the score; every
            earlier column holds quoted labels such as ``"'ctb'"``.
        ind_lst: per-column label -> index lookups, in column order.

    Returns:
        A new DataFrame (the input is left untouched) in which the label
        columns are integer-encoded and ``bucket`` is numeric.

    Raises:
        KeyError: if a label is missing from the corresponding lookup.
    """
    df_new = copy.deepcopy(df)
    for i, col in enumerate(df.columns[:-2]):
        lookup = ind_lst[i]
        # Cells look like "'label'": drop the first and last character to get
        # the raw label. Assigning the whole column at once also gives it a
        # numeric dtype instead of leaving ints inside an object column.
        df_new[col] = [lookup[str(cell)[1:-1]] for cell in df[col]]
    df_new['bucket'] = pd.to_numeric(df_new['bucket'])
    return df_new
        

In [187]:
def reconstruct(tensor, mask_tensor, decomp ='pca', tucker_rank=(39, 44, 22), cp_rank=5):
    """Reconstruct a masked tensor with the chosen decomposition.

    Args:
        tensor: data tensor.
        mask_tensor: tensor of the same shape, multiplied element-wise with
            `tensor` before decomposing (0 hides a cell).
        decomp: ``'pca'`` (robust PCA, low-rank part), ``'tucker_decomp'`` or
            ``'cp'`` (PARAFAC).
        tucker_rank: per-mode ranks for the Tucker decomposition; the default
            is the value that used to be hard-coded.
        cp_rank: rank of the CP decomposition; the default is the value that
            used to be hard-coded.

    Returns:
        The reconstructed dense tensor.

    Raises:
        ValueError: if `decomp` is not one of the supported names (previously
            an UnboundLocalError on the return statement).
    """
    if decomp not in ('pca', 'tucker_decomp', 'cp'):
        raise ValueError(
            "Unknown decomp {!r}; expected 'pca', 'tucker_decomp' or 'cp'".format(decomp)
        )

    masked_tensor = tensor * mask_tensor

    if decomp == 'pca':
        # robust_pca returns (low_rank, sparse); only the low-rank part is kept.
        return robust_pca(masked_tensor, mask=mask_tensor)[0]

    if decomp == 'tucker_decomp':
        # The mask is not passed to tucker here; hidden cells enter as zeros.
        core, factors = tucker(masked_tensor, rank=list(tucker_rank))
        return tucker_to_tensor((core, factors))

    (w, f), _ = parafac(masked_tensor, rank=cp_rank, mask=mask_tensor, return_errors=True)
    # Newer tensorly releases call this cp_to_tensor; fall back to the old name.
    to_tensor = getattr(tl, "cp_to_tensor", None) or tl.kruskal_to_tensor
    return to_tensor((w, f))

def read_val(ind_list, tensor, s, t, f):
    """Read the cell of a 3-mode `tensor` addressed by the labels (s, t, f).

    ``ind_list[k]`` is the label -> position lookup for axis k.
    """
    first_axis, second_axis, third_axis = ind_list[0], ind_list[1], ind_list[2]
    return tensor[first_axis[s]][second_axis[t]][third_axis[f]]

def transform(orig_col, val):
    """Map a standardized value back to the scale of `orig_col`.

    Args:
        orig_col: the unscaled column (pandas Series or numpy array).
        val: value on the standardized scale.

    Returns:
        ``val * sd + mean`` with the mean and population standard deviation of
        `orig_col`.
    """
    m = orig_col.mean()
    # ddof=0 on purpose: the forward scaling in this file uses
    # sklearn.preprocessing.scale, which divides by the population standard
    # deviation, whereas pandas' .std() defaults to the sample one (ddof=1).
    sd = orig_col.std(ddof=0)
    return val * sd + m

# Construct order 3 tensor
def construct_tensor_from_df(df, lang_modes=None, need_scale=True,
                            source='Source', target='Target', feat_mode=None):
    """Build an order-3 tensor (mode 0 x mode 1 x feature column) from `df`.

    Args:
        df: DataFrame with one row per (source, target) pair.
        lang_modes: positional indices of the two columns that define modes 0
            and 1. Defaults to ``[0, 1]``.
        need_scale: when True the feature columns are standardized with
            ``scale`` before being written into the tensor.
        source: name of the column used as first path element.
        target: name of the column used as second path element.
        feat_mode: positional index, or list of indices, of the columns that
            form the third mode. Required.

    Returns:
        ``(tensor, missing, modeDict, indLst, all_p)``: the filled tensor, the
        paths without data (stored as 0), the labels of each mode, the
        label -> position lookups, and the list of all paths.

    Raises:
        ValueError: if `feat_mode` is not given.
    """
    if feat_mode is None:
        raise ValueError("feat_mode must give the positional indices of the feature columns")
    if lang_modes is None:
        lang_modes = [0, 1]
    # Accept a single column index as well: indexing df.columns with a scalar
    # returns one label, and set() of a string would split it into characters.
    if np.ndim(feat_mode) == 0:
        feat_mode = [feat_mode]
    feat_mode = list(feat_mode)

    # Get first 2 lang modes
    modeDict, _ = get_modes(df, lang_modes)

    # Specify axis 3
    modeDict[2] = set(df.columns[feat_mode])

    # Get all possible indices for the tensor
    all_p = all_paths(modeDict)
    indLst = get_index(modeDict)

    # Init empty tensors
    t = init_tensor(modeDict)

    # Scaled or plain copy of the data; df itself is never modified.
    df2 = scale(df, [feat_mode]) if need_scale else copy.deepcopy(df)

    # Create a tensor
    t_bleu, missing = fill_tensor(t, df2, all_p, indLst, source=source, target=target)
    return t_bleu, missing, modeDict, indLst, all_p

* Functions to perform k-fold cross-validation

In [188]:
def train_test_split(df, fold=5, random_state=None):
    """Shuffle `df` and cut it into `fold` train/test splits.

    NOTE: this definition shadows the ``train_test_split`` imported from
    ``sklearn.model_selection`` at the top of the file.

    Args:
        df: DataFrame to split.
        fold: number of folds passed to ``KFold``.
        random_state: optional seed for the shuffle, to make the folds
            reproducible. The default (None) keeps the shuffle unseeded.

    Returns:
        ``(folds, shuffled_df)`` where ``folds`` is a list of ``(train, test)``
        DataFrames taken from ``shuffled_df``, the row-shuffled copy of `df`
        with a fresh 0..n-1 index.
    """
    shuffled_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # KFold without shuffling yields contiguous blocks of row positions; the
    # randomness comes from shuffling the rows above.
    kf = KFold(n_splits=fold)
    folds = [
        (shuffled_df.iloc[list(train_idx), :], shuffled_df.iloc[list(test_idx), :])
        for train_idx, test_idx in kf.split(shuffled_df)
    ]
    return folds, shuffled_df


def mask_out_test_y(test_y_df, missing, tensor, index_dict, data):
    """Build a mask hiding both the missing cells and the test-set targets.

    Args:
        test_y_df: rows of the test fold.
        missing: label paths that have no data in the tensor; not modified.
        tensor: the data tensor (only its shape is used, via ``create_mask``).
        index_dict: per-axis label -> position lookups.
        data: data set name: ``'wiki'``, ``'tsf'`` or ``'tsfmt'``.

    Returns:
        ``(new_mask, test_entries)``: the mask with 0 at every missing or test
        cell, and the list of ``(source, target, metric)`` paths of the test
        rows.

    Raises:
        ValueError: if `data` is not a known data set name.
    """
    # (source column, target column, metric label) for each data set.
    # NOTE(review): ' Source lang' starts with a space; presumably this matches
    # the column header in the tsfmt data - confirm.
    columns = {
        'wiki': ('Source', 'Target', 'BLEU'),
        'tsf': ('Task lang', 'Aux lang', 'Accuracy'),
        'tsfmt': (' Source lang', 'Transfer lang', 'BLEU'),
    }
    if data not in columns:
        raise ValueError(
            "Unknown data set {!r}; expected one of {}".format(data, sorted(columns))
        )
    source_col, target_col, metric = columns[data]

    # Mask out the metric for test entries
    test_entries = [
        (row[source_col], row[target_col], metric)
        for _, row in test_y_df.iterrows()
    ]

    missing_combined = convert_paths(list(missing) + test_entries, index_dict)
    new_mask = create_mask(missing_combined, tensor)
    return new_mask, test_entries


def mask_out_test_y_f1(test_y_df, missing, tensor, index_dict):
    """Build a mask hiding the missing cells and the F1 test-set cells.

    Args:
        test_y_df: rows of the test fold with the columns ``model``,
            ``dataset``, ``attribute`` (quoted labels) and ``bucket``.
        missing: label paths that have no data in the tensor; not modified.
        tensor: the data tensor (only its shape is used, via ``create_mask``).
        index_dict: per-axis label -> position lookups.

    Returns:
        ``(new_mask, test_entries)``: the mask with 0 at every missing or test
        cell, and the ``(model, dataset, attribute, bucket)`` paths of the
        test rows.
    """
    # Mask out f1 for test entries. The label cells carry surrounding quotes,
    # hence the [1:-1]; bucket is stored as text and converted to int.
    test_entries = [
        (row['model'][1:-1], row['dataset'][1:-1], row['attribute'][1:-1], int(row['bucket']))
        for _, row in test_y_df.iterrows()
    ]

    missing_combined = convert_paths(list(missing) + test_entries, index_dict)
    new_mask = create_mask(missing_combined, tensor)
    return new_mask, test_entries

def mask_out_test_y_cws(test_y_df, missing, tensor, index_dict):
    """Build a mask hiding the missing cells and the CWS test-set cells.

    Args:
        test_y_df: rows of the test fold with the columns ``dataset``,
            ``model``, ``attribute`` (quoted labels) and ``bucket``.
        missing: label paths that have no data in the tensor; not modified.
        tensor: the data tensor (only its shape is used, via ``create_mask``).
        index_dict: per-axis label -> position lookups.

    Returns:
        ``(new_mask, test_entries)``: the mask with 0 at every missing or test
        cell, and the ``(dataset, model, attribute, bucket)`` paths of the
        test rows.
    """
    # Mask out f1 for test entries. The label cells carry surrounding quotes,
    # hence the [1:-1]; bucket is stored as text and converted to int.
    test_entries = [
        (row['dataset'][1:-1], row['model'][1:-1], row['attribute'][1:-1], int(row['bucket']))
        for _, row in test_y_df.iterrows()
    ]

    missing_combined = convert_paths(list(missing) + test_entries, index_dict)
    new_mask = create_mask(missing_combined, tensor)
    return new_mask, test_entries

## Comparing performance prediction models for different NLP tasks

In [189]:
# Compare Baseline, XGBoost, LGBM, CP and PCA performance prediction 
# models on the same test sets using k-fold cross-validation 
# Returns a list of RMSE for each fold for each model 

def compare_models(df, tensor, missing, index_lst,
                   fold, scale_back=True,
                   cp_rank=3, data="wiki",
                   non_scaled_tensor=None,
                   non_scaled_mode_dict=None,
                   non_scaled_index=None,
                   reg_E=1, reg_J=1):
    """Compare robust PCA, CP, XGBoost, LightGBM and a mean baseline by k-fold CV.

    For every fold the test targets are hidden from the tensor, the tensor is
    reconstructed with robust PCA and with CP, and the reconstructed cells are
    compared with the cells of `tensor`. The two gradient-boosting models and
    the baseline (mean of the training targets) are fitted on the fold's
    training rows of `df`.

    Args:
        df: data frame of the task. For ``'wiki'``/``'tsf'``/``'tsfmt'`` the
            target is column 2 and the features are columns 3 onwards; for
            ``'f1'``/``'cws'`` the target is the last column and all other
            columns are (quoted) labels.
        tensor: data tensor built from `df`.
        missing: label paths without data in `tensor`.
        index_lst: per-axis label -> position lookups of `tensor`.
        fold: number of cross-validation folds.
        scale_back: when True the tensor predictions and targets are mapped
            back to the original scale before the RMSE is computed. Handled
            for ``'wiki'``, ``'tsf'``, ``'tsfmt'`` (via ``transform``) and
            ``'f1'`` (via ``scale_f1``); for ``'cws'`` it has no effect.
        cp_rank: rank of the CP decomposition.
        data: one of ``'wiki'``, ``'tsf'``, ``'tsfmt'``, ``'f1'``, ``'cws'``.
        non_scaled_tensor, non_scaled_mode_dict, non_scaled_index: unscaled F1
            tensor and its mode/index structures; only used when
            ``data == 'f1'`` and ``scale_back`` is True.
        reg_E, reg_J: regularisation parameters passed to ``robust_pca``.

    Returns:
        Five lists with one RMSE per fold, in the order
        (PCA, CP, XGBoost, LightGBM, baseline).

    Raises:
        ValueError: if `data` is not a known data set name.
    """
    label_column = {'wiki': 'BLEU', 'tsf': 'Accuracy', 'tsfmt': 'BLEU'}
    tensor_only = ('f1', 'cws')
    if data not in label_column and data not in tensor_only:
        raise ValueError(
            "Unknown data set {!r}; expected one of {}".format(
                data, sorted(label_column) + sorted(tensor_only))
        )

    # Inverse of the standardization, resolved once instead of per test entry.
    unscale = None
    if scale_back:
        if data in label_column:
            target_col = df.loc[:, label_column[data]]

            def unscale(v):
                return transform(target_col, v)
        elif data == 'f1':
            # Mean/sd of the unscaled tensor do not change between entries, so
            # compute them a single time.
            _, f1_mean, f1_sd = scale_f1(non_scaled_tensor, non_scaled_mode_dict, non_scaled_index)

            def unscale(v):
                return scale_back_f1(v, f1_mean, f1_sd)
        # data == 'cws': no inverse scaling is defined; values are used as is.

    # Newer tensorly releases call this cp_to_tensor; fall back to the old name.
    cp_to_dense = getattr(tl, "cp_to_tensor", None) or tl.kruskal_to_tensor

    folds, _ = train_test_split(df, fold)
    pca_rmse_lst = []
    cp_rmse_lst = []
    xgb_rmse_lst = []
    lg_rmse_lst = []
    bs_rmse_lst = []

    for train, test in folds:
        # combined mask masks off entries that are missing in the original
        # dataset and entries in test set
        if data == 'f1':
            combined_mask, test_missing_entries = mask_out_test_y_f1(test, missing, tensor, index_lst)
        elif data == 'cws':
            combined_mask, test_missing_entries = mask_out_test_y_cws(test, missing, tensor, index_lst)
        else:
            combined_mask, test_missing_entries = mask_out_test_y(test, missing, tensor, index_lst, data)

        masked_tensor = tensor * combined_mask

        # (1) Robust-PCA
        pca_res = robust_pca(masked_tensor, mask=combined_mask, reg_E=reg_E, reg_J=reg_J)
        # Use only the low rank part, http://jeankossaifi.com/blog/rpca.html
        pca_reconstructed = pca_res[0]

        # (2) CP
        (w, f), _ = parafac(masked_tensor, rank=cp_rank, mask=combined_mask, return_errors=True)
        cp_reconstructed = cp_to_dense((w, f))

        tensor_pred_pca = []
        tensor_pred_cp = []
        true_y = []
        for entry in test_missing_entries:
            blank = convert_index(entry, index_lst)
            pred_pca = read_tensor(pca_reconstructed, blank)
            pred_cp = read_tensor(cp_reconstructed, blank)
            true = read_tensor(tensor, blank)

            # if tensor has been standardized
            if unscale is not None:
                pred_pca = unscale(pred_pca)
                pred_cp = unscale(pred_cp)
                true = unscale(true)

            tensor_pred_pca.append(pred_pca)
            tensor_pred_cp.append(pred_cp)
            true_y.append(true)

        # (3) xgboost
        # take in corresponding feature columns
        if data in tensor_only:
            train = categorize_df_f1(train, index_lst)
            test = categorize_df_f1(test, index_lst)
            fold_train_X = train.iloc[:,:-1]
            fold_train_y = train.iloc[:,-1]
            fold_test_X = test.iloc[:,:-1]
            fold_test_y = test.iloc[:,-1]
        else:
            fold_train_X = train.iloc[:,3:]
            fold_train_y = train.iloc[:,2]
            fold_test_X = test.iloc[:,3:]
            fold_test_y = test.iloc[:,2]

        reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate=0.1,
                                   max_depth=10, n_estimators=100)
        res = reg.fit(fold_train_X, fold_train_y)
        xgb_pred_y = res.predict(fold_test_X)

        # (4) LGBM
        lg = lgb.LGBMRegressor(objective='regression')
        # NOTE(review): the `verbose` keyword of fit() may not be accepted by
        # every LightGBM release - confirm against the installed version.
        lg.fit(fold_train_X, fold_train_y, verbose=False)
        lg_pred_y = lg.predict(fold_test_X)

        # (5) Baseline: always predict the mean of the training targets
        mean_y = np.mean(fold_train_y)
        base_pred_y = [mean_y]*len(lg_pred_y)

        # Calculate RMSE
        pca_rmse_lst.append(np.sqrt(mean_squared_error(tensor_pred_pca, true_y)))
        cp_rmse_lst.append(np.sqrt(mean_squared_error(tensor_pred_cp, true_y)))
        xgb_rmse_lst.append(np.sqrt(mean_squared_error(xgb_pred_y, fold_test_y)))
        lg_rmse_lst.append(np.sqrt(mean_squared_error(lg_pred_y, fold_test_y)))
        bs_rmse_lst.append(np.sqrt(mean_squared_error(base_pred_y, fold_test_y)))

    return pca_rmse_lst, cp_rmse_lst, xgb_rmse_lst, lg_rmse_lst, bs_rmse_lst


## Task 1: NER
### F1 Tensor Construction

In [190]:
# Read in a F1 dictionary (get_dict_f1 returns the first element of the pickled list)
# NOTE: unpickling can execute arbitrary code; only load trusted files.
fileName = "data/model_data_metric_bucket_evals6.pkl"
dict_f1 = get_dict_f1(fileName)
# Re-key the buckets of every attribute as 0, 1, 2, ... in their stored order
# (order_buckets_f1 renumbers the keys; it does not sort by value)
dict_f1 = order_buckets_f1(dict_f1)


# Find out all unique feature names and define the 4 dimensions for a tensor
# 4 modes: 0: models, 1: datasets, 2: attributes, 3: buckets
mdict = get_modes_f1(dict_f1, 4)


# Get a mapping between all unique feature names and tensor indices
index_mode = get_index_f1(mdict)

# Construct a tensor (for PCA and CP models)
# with a mode dict, the original tensor dict, and mapping between mode and indices
t1 = construct_tensor_f1(mdict, dict_f1,index_mode)


# Convert f1 tensor to a dataframe (for XGBoost and LGBM models)
f1_df = convert_to_df_f1(t1, mdict, index_mode)

# Convert the label columns to integers.
# int_f1_df is not referenced again in the cells shown here; compare_models is
# given f1_df and integer-encodes each fold itself.
int_f1_df = categorize_df_f1(f1_df, index_mode)



### F1 Performance Prediction 

In [191]:
# 5-fold comparison on the F1 tensor. "ns" presumably stands for "not scaled":
# scale_back=False, so RMSEs are computed on the tensor values as stored.
# The empty list means no cells are declared missing up front.
ns_pca_result_f1, ns_cp_result_f1, ns_xgb_result_f1, ns_lg_result_f1, ns_bs_result_f1 = compare_models(f1_df, t1, [], \
                                                                     index_mode, 5, data='f1',\
                                                                     scale_back=False,\
                                                                     reg_E=1, reg_J=1)

In [192]:
# mean RMSE for PCA 
np.mean(ns_pca_result_f1)

0.052179318

In [193]:
# mean RMSE for CP 
np.mean(ns_cp_result_f1)

0.06929777

In [194]:
# mean RMSE for XGBoost 
np.mean(ns_xgb_result_f1)

0.055889648180819186

In [195]:
# mean RMSE for LGBM 
np.mean(ns_lg_result_f1)

0.051130917351851636

In [181]:
# mean RMSE for baseline
np.mean(ns_bs_result_f1)

0.20874335857003318

## Task 2: MT
### Wiki Tensor Construction

In [116]:
# Read in wiki data
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("data/dict_file_wiki.pkl", "rb") as f:
    wkdata = pickle.load(f)
    
# The 'BLEU' entry holds the features, the labels and the language pairs.
X = wkdata['BLEU']['feats']
y = wkdata['BLEU']['labels']
langs = wkdata['BLEU']['langs']

In [117]:
# Obtain a wiki dataframe.
# Column order matters: compare_models reads the target from column 2 and the
# features from column 3 onwards (assumes langs contributes exactly two
# columns - TODO confirm).
wiki = pd.concat([langs, y, X], axis=1)

In [118]:
# Construct a tensor with 3 modes (source lang, target lang, features).
# The third mode spans every column from position 2 on, and need_scale=True
# standardizes those columns before they are written into the tensor.
tensor, missing, mode_dict, ind_lst, all_p = construct_tensor_from_df(wiki, lang_modes=[0,1], \
                                           feat_mode=list(range(2,len(wiki.columns))), \
                                           need_scale=True)

In [120]:
# Sparsity: share of tensor cells with no observed value (fill_tensor stores 0 there)
len(missing)/tensor.size

0.3458251150558843

### wikiMT Performance Prediction 

In [121]:
# 5-fold comparison on wikiMT. data defaults to "wiki" and scale_back to True,
# so the PCA/CP predictions are mapped back through the 'BLEU' column before
# the RMSE is computed.
pca_result, cp_result, xgb_result, lg_result, bs_result = compare_models(wiki, tensor, missing, ind_lst, 5, reg_E=0.6, reg_J=1.65)

In [133]:
# mean RMSE for PCA 
np.mean(pca_result)

2.941274638049036

In [134]:
# mean RMSE for CP 
np.mean(cp_result)

4.076549853475644

In [135]:
# mean RMSE for XGBoost 
np.mean(xgb_result)

2.366065267157411

In [136]:
# mean RMSE for PCA 
np.mean(pca_result)

2.941274638049036

In [137]:
# mean RMSE for LGBM 
np.mean(lg_result)

2.2561946304899245

In [138]:
# mean RMSE for baseline
np.mean(bs_result)

6.389119363990967

## Task 3: POS
### tsf Tensor Construction

In [139]:
# Read in tsf data
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("data/dict_file_tsf.pkl", "rb") as f:
    tsfdata = pickle.load(f)
    
# The 'Accuracy' entry holds the features, the labels and the language pairs.
X_tsf = tsfdata['Accuracy']['feats']
y_tsf = tsfdata['Accuracy']['labels']
langs_tsf = tsfdata['Accuracy']['langs']

In [140]:
# Obtain a dataframe.
# Column order matters: compare_models reads the target from column 2 and the
# features from column 3 onwards (assumes langs_tsf contributes exactly two
# columns - TODO confirm).
tsf = pd.concat([langs_tsf, y_tsf, X_tsf], axis=1)


In [141]:
# Construct a tensor with 3 modes (task lang, aux lang, features).
# need_scale=False: the values are written into the tensor unscaled.
tensor_tsf, missing_tsf, mode_dict_tsf, ind_lst_tsf, all_p_tsf = construct_tensor_from_df(tsf, lang_modes=[0,1], \
                                           feat_mode=list(range(2,len(tsf.columns))),\
                                           need_scale=False, source='Task lang', target='Aux lang')

In [142]:
# Sparsity: share of tensor cells with no observed value (fill_tensor stores 0 there)
len(missing_tsf)/tensor_tsf.size

0.01858974358974359

### tsfPOS Performance Prediction 

In [143]:
# 5-fold comparison on tsfPOS. The tensor was built unscaled, so
# scale_back=False and the RMSEs are in the units of the stored values.
pca_result_tsf, cp_result_tsf, xgb_result_tsf,lg_result_tsf, bs_result_tsf = compare_models(tsf, tensor_tsf, missing_tsf, ind_lst_tsf, 5, \
                                                              data='tsf', scale_back=False)

In [144]:
# mean RMSE for PCA 
np.mean(pca_result_tsf)

5.818490958859231

In [145]:
# mean RMSE for CP
np.mean(cp_result_tsf)

34.84026188746132

In [146]:
# mean RMSE for XGBoost 
np.mean(xgb_result_tsf)

7.278306714645728

In [147]:
# mean RMSE for LGBM 
np.mean(lg_result_tsf)

7.245376756573846

In [148]:
# mean RMSE for Baseline 
np.mean(bs_result_tsf)

29.097613010215706

## Task 4: CWS
### Tensor Construction

In [149]:
# Read in the CWS result dictionary
# NOTE: unpickling can execute arbitrary code; only load trusted files.
true_cws =  "data/dict_file_cws.pkl"
with open(true_cws, "rb") as f:
    cws_dict = pickle.load(f)

In [151]:
# Labels of the 4 nesting levels of the dictionary (plus the leaf-value level)
cws_modes = get_modes_f1(cws_dict, 4)
# Label -> tensor position lookups, one per level
cws_index = get_index_f1(cws_modes)
# Dense tensor with one axis per nesting level
cws_t = construct_tensor_f1(cws_modes, cws_dict, cws_index)

In [152]:
# Convert the tensor to a long-format dataframe
# (columns: dataset, model, attribute, bucket, f1)
cws_df = convert_to_df_cws(cws_t, cws_modes, cws_index)

# Convert the label columns to integers.
# int_cws_df is not referenced again in the cells shown here; compare_models is
# given cws_df and integer-encodes each fold itself.
int_cws_df = categorize_df_cws(cws_df, cws_index)

In [154]:
# 5-fold comparison on the CWS tensor. scale_back=False, so RMSEs are computed
# on the tensor values as stored; the empty list means no cells are declared
# missing up front.
pca_result_cws, cp_result_cws, xgb_result_cws,lg_result_cws, bs_result_cws = compare_models(cws_df, cws_t, [], \
                                                                     cws_index, 5, data='cws',\
                                                                     scale_back=False,\
                                                                     reg_E=1, reg_J=1)

### Performance Prediction 

In [155]:
# mean RMSE for PCA 
np.mean(pca_result_cws)

0.02788474

In [156]:
# mean RMSE for CP 
np.mean(cp_result_cws)

0.04636464

In [157]:
# mean RMSE for XGBoost 
np.mean(xgb_result_cws)

0.017748032775031924

In [160]:
# mean RMSE for LGBM 
np.mean(lg_result_cws)

0.03557505761049142

In [161]:
# mean RMSE for Baseline 
np.mean(bs_result_cws)

0.13689612466730292