In [93]:
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt 
import pandas as pd
import numpy  as np
from sklearn import metrics

import tensorly as tl
import copy
from tensorly.decomposition import parafac
import random
import pickle
import sys
import itertools
import math
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import csv
from sklearn import preprocessing
  
from tensorly.decomposition import tucker
from tensorly import tucker_to_tensor
from tensorly.decomposition import robust_pca

from sklearn.gaussian_process.kernels import Kernel, Hyperparameter, Matern, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
import lightgbm as lgb

In [94]:
# FOR F1 TENSOR

def get_dict_f1(path, metric="f1"):
    """Load the F1 results dictionary from a pickled list of dictionaries.

    Parameters
    ----------
    path : str
        Path to a pickle file whose payload is a sequence; the F1 dictionary
        is its first element.
    metric : str, default "f1"
        Which metric to return. Only ``"f1"`` is supported.

    Returns
    -------
    The first element of the unpickled sequence.

    Raises
    ------
    ValueError
        If ``metric`` is anything other than ``"f1"`` (previously this fell
        through to an unbound local variable).
    """
    if metric != "f1":
        raise ValueError("Unsupported metric %r; only 'f1' is available" % (metric,))

    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(path, "rb") as f:
        tensorDict_list = pickle.load(f)
    return tensorDict_list[0]
     

def iter_dict_f1(tensor_dict, mode_dict, dept=0):
    """Walk a nested dict and record the keys seen at every depth.

    Keys found at nesting level ``dept`` are added to ``mode_dict[dept]``.
    When a value is not a dict it is treated as a leaf and added to
    ``mode_dict[dept + 1]``. ``mode_dict`` is mutated in place; nothing is
    returned.
    """
    for key, value in tensor_dict.items():
        mode_dict[dept].add(key)
        if isinstance(value, dict):
            # Descend one level for nested dictionaries.
            iter_dict_f1(value, mode_dict, dept + 1)
            continue
        # Leaf: the value itself belongs to the next level.
        mode_dict[dept + 1].add(value)
            
    
def get_modes_f1(tensorDict, numMode):
    """Collect the distinct keys at each nesting level of ``tensorDict``.

    Returns a dict mapping level number -> set of names. It has
    ``numMode + 1`` entries because the leaf values occupy one extra level
    below the ``numMode`` key levels.
    """
    levels = dict((level, set()) for level in range(numMode + 1))
    iter_dict_f1(tensorDict, levels)
    return levels
    
# NOTE: assumes buckets are ordinal, but some buckets are categorical (e.g. 'loc', 'misc', 'org', 'per').
def order_buckets_f1(d):
    """Return a copy of ``d`` whose bucket keys are replaced by 0, 1, 2, ...

    ``d`` is expected to be nested as model -> dataset -> attribute -> bucket.
    For every attribute the bucket values are kept in iteration order and
    re-keyed by their position. The input dictionary is not modified.
    """
    reindexed = copy.deepcopy(d)
    for model, datasets in d.items():
        for dataset, attributes in datasets.items():
            for attribute, buckets in attributes.items():
                reindexed[model][dataset][attribute] = dict(enumerate(buckets.values()))
    return reindexed

# for now assumes directions go as: model -> dataset -> attribute -> bucket
def get_index_f1(mode_dict):
    """Build one name -> position lookup per mode.

    Returns a list with one dict per key of ``mode_dict`` (in iteration
    order); each dict maps a member of that mode to its enumeration index.
    """
    return [
        {name: position for position, name in enumerate(list(mode_dict[mode]))}
        for mode in mode_dict
    ]

# https://stackoverflow.com/questions/14692690/access-nested-dictionary-items-via-a-list-of-keys
def read_dict_f1(dataDict, maplist):
    """Follow a sequence of keys through a nested dictionary.

    Parameters
    ----------
    dataDict : dict
        Nested dictionary to read from.
    maplist : sequence
        Keys to follow, outermost first. Must not be empty (an empty
        sequence raises ``IndexError``).

    Returns
    -------
    The value stored at the end of the path, or ``None`` as soon as any key
    along the path is missing (or maps to ``None``).
    """
    value = dataDict.get(maplist[0])
    for key in maplist[1:]:
        # Identity test rather than `== None`, so values that overload `==`
        # (e.g. NumPy arrays) are not misread or turned into an ambiguous truth test.
        if value is None:
            return None
        value = value.get(key)
    return value

    
def read_tensor_f1(tensor, maplist):
    """Read the entry of ``tensor`` addressed by the index path ``maplist``.

    The indices are applied one after another (``tensor[i][j]...``), so a
    path shorter than the tensor's depth returns a sub-tensor.
    """
    cell = tensor[maplist[0]]
    for axis_index in maplist[1:]:
        cell = cell[axis_index]
    return cell
   
    
    

def write_tensor_f1(tensor, maplist, val):
    """Store ``val`` in ``tensor`` at the position given by ``maplist``.

    The tensor is modified in place. Every intermediate level must be a
    NumPy array (checked with ``assert``). If the final position still
    holds an array, the path was too short: a warning is printed and the
    whole sub-array is overwritten with ``val``.
    """
    target = tensor
    for axis_index in maplist[:-1]:
        assert(isinstance(target[axis_index], np.ndarray))
        target = target[axis_index]

    final_index = maplist[-1]
    if isinstance(target[final_index], np.ndarray):
        print("Maplist length not correct!")
    target[final_index] = val
    
    
# Construct a tensor of dimensions *argv    
def init_tensor_f1(*argv):
    """Create a float32 tensor whose shape is given by the positional arguments.

    The tensor is pre-filled with the running integers 0, 1, 2, ... so it
    only serves as a placeholder that the caller overwrites.
    """
    n_entries = 1
    for extent in argv:
        n_entries = n_entries * extent
    placeholder = np.arange(n_entries).reshape(argv)
    return tl.tensor(placeholder, dtype=tl.float32)



def convert_index_f1(p, indDict):
    """Translate a path of names into a list of integer tensor indices.

    The name at position ``i`` of ``p`` is looked up in ``indDict[i]``.
    """
    return [indDict[axis][name] for axis, name in enumerate(p)]

    
def construct_tensor_f1(modeDict, tensorDict, indexDict):
    """Build the F1 tensor from a nested results dictionary.

    Parameters
    ----------
    modeDict : dict
        Level -> set of names, as produced by ``get_modes_f1``. The last
        entry holds the leaf values and is not a tensor axis.
    tensorDict : dict
        Nested dictionary holding the scores.
    indexDict : list of dict
        Per-mode name -> index lookups, as produced by ``get_index_f1``.

    Returns
    -------
    A tensor with one axis per mode (excluding the leaf level). Combinations
    that are absent from ``tensorDict`` are reported on stdout and written
    as ``None`` (presumably stored as NaN by the float tensor; verify for the
    tensorly backend in use).
    """
    # Every mode except the last one (the leaf values) defines a tensor axis.
    axis_names = [list(v) for v in modeDict.values()][:-1]
    from_paths = list(itertools.product(*axis_names))  # all possible entries

    # Use the F1-specific helper instead of `convert_index`, which is only
    # defined in a later (Wiki) cell of the notebook.
    to_paths = [convert_index_f1(p, indexDict) for p in from_paths]

    dims = [len(names) for names in axis_names]
    X = init_tensor_f1(*dims)

    for name_path, index_path in zip(from_paths, to_paths):
        val = read_dict_f1(tensorDict, name_path)
        if val is not None:
            write_tensor_f1(X, index_path, val)
        else:
            print("MISSING DATA AT "+ str(name_path))
            write_tensor_f1(X, index_path, None)
    return X
        
def all_paths_f1(modeDict):
    """Enumerate every combination of names across all modes but the last.

    Returns a list of tuples (the Cartesian product); the final entry of
    ``modeDict`` is skipped.
    """
    axes = [list(members) for members in modeDict.values()]
    return list(itertools.product(*axes[:-1]))  # all possible entries



def convert_to_df_f1(tensor, mode, index):
    """Flatten the F1 tensor into a long-format DataFrame.

    Parameters
    ----------
    tensor : tensor holding the scores.
    mode : dict
        Level -> names mapping (see ``get_modes_f1``).
    index : list of dict
        Per-mode name -> index lookups (see ``get_index_f1``).

    Returns
    -------
    pandas.DataFrame with columns ``model``, ``dataset``, ``attribute``,
    ``bucket`` and ``f1``, one row per tensor entry. The first four columns
    are the text of each path element as rendered by ``str(tuple)``, so
    string names keep their surrounding quotes.
    """
    all_p = all_paths_f1(mode)

    # Build the column in one go. The previous per-row chained assignment
    # (`df.iloc[i, :]['features'] = ...`) may write to a temporary copy.
    df = pd.DataFrame({'features': [str(p)[1:-1] for p in all_p]})
    f1_list = [read_tensor_f1(tensor, convert_index_f1(p, index)) for p in all_p]

    df[['model','dataset',"attribute","bucket"]] = df['features'].str.split(', ',expand=True)
    df = df.drop(['features'], axis=1)
    df['f1'] = f1_list
    return df

def categorize_df_f1(df, ind_lst):
    """Replace the name columns of an F1 DataFrame by their integer codes.

    All columns except the last two are encoded: each entry has its first
    and last character stripped and is then looked up in the matching dict
    of ``ind_lst``. The ``bucket`` column is converted to a numeric dtype.
    A new DataFrame is returned; ``df`` is left untouched.
    """
    encoded = copy.deepcopy(df)
    n_name_cols = len(df.columns) - 2
    n_rows = len(df.index)
    for col in range(n_name_cols):
        lookup = ind_lst[col]
        for row in range(n_rows):
            raw = df.iloc[row, col]
            # Drop the first and last character of the entry before the lookup.
            key = str(raw)[1:len(raw) - 1]
            encoded.iloc[row, col] = lookup[str(key)]
    encoded['bucket'] = pd.to_numeric(encoded['bucket'])
    return encoded
        

def scale_f1(tensor, mode_dict, index_dict):
    """Standardize a tensor to zero mean and unit standard deviation.

    Parameters
    ----------
    tensor : array-like tensor
        Values to standardize. It is not modified.
    mode_dict, index_dict :
        Unused; kept so existing calls keep working. Earlier versions used
        them to visit the entries one at a time.

    Returns
    -------
    (scaled, mean, sd) : the standardized copy of ``tensor`` and the mean
    and standard deviation (over all entries) used for the transformation.
    """
    array = tensor.flatten()
    m, sd = np.mean(array), np.std(array)
    # One vectorized expression replaces the per-entry read/write loop; this
    # also removes the dependency on `convert_paths`, defined in a later cell.
    tensor2 = (tensor - m) / sd
    return tensor2, m, sd

def scale_back_f1(val, mean, sd):
    """Undo a standardization: map ``val`` back to the original scale.

    Computes ``val * sd + mean`` for the given mean and standard deviation.
    """
    return val * sd + mean

In [95]:

def train_test_split(df, fold=5, random_state=None):
    """Shuffle ``df`` and cut it into ``fold`` consecutive train/test folds.

    NOTE: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook; the cell defining it must be run
    before any code that expects the k-fold behaviour below.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    fold : int, default 5
        Number of folds handed to ``KFold``.
    random_state : optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle, and therefore
        the folds, can be reproduced. ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    (folds, shuffled_df) : ``folds`` is a list of ``(train, test)`` DataFrame
    pairs taken from ``shuffled_df``, the shuffled copy of ``df`` with a
    fresh 0..n-1 index.
    """
    rows = list(range(len(df)))
    kf = KFold(n_splits=fold)

    shuffled_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    folds = []
    for train_idx, test_idx in kf.split(rows):
        train = shuffled_df.iloc[list(train_idx), :]
        test = shuffled_df.iloc[list(test_idx), :]
        folds.append((train, test))
    return folds, shuffled_df


In [96]:
# For Wiki

def get_modes(df, col_indices):
    """Collect the distinct values of selected DataFrame columns.

    Parameters
    ----------
    df : pandas.DataFrame
    col_indices : sequence of int
        Positional indices of the columns that become tensor modes.

    Returns
    -------
    (modeDict, colInd) : ``modeDict`` maps mode number (0, 1, ...) to the set
    of unique values of the corresponding column; ``colInd`` maps each column
    position to its mode number.
    """
    modeDict = {}
    colInd = {}
    for mode, c in enumerate(col_indices):
        modeDict[mode] = set(pd.unique(df.iloc[:, c]))
        colInd[c] = mode
    # An unreachable second implementation used to follow this return.
    return modeDict, colInd

def get_index(mode_dict):
    """Return, for each mode, a dict mapping every member to its position.

    The result is a list ordered like ``mode_dict``; positions come from
    enumerating the members of each mode.
    """
    lookups = []
    for members in mode_dict.values():
        lookups.append(dict((name, pos) for pos, name in enumerate(list(members))))
    return lookups


def all_paths(mode_dict):
    """List every combination of members across all modes.

    Returns the Cartesian product of the values of ``mode_dict`` as a list
    of tuples.
    """
    axes = [list(members) for members in mode_dict.values()]
    return list(itertools.product(*axes))  # all possible entries


# Put rows of dataframe to a dict
# {(a,b,c,d):BLEU, ...}

# By default, columns 0, 1, 3, 4 and 5 form the key and column 2 holds the value
def df_to_dict(df, key_cols=[0,1,3,4,5], val_col=2):
    """Turn the rows of a DataFrame into a ``{key tuple: value}`` dictionary.

    For every row, the entries at positions ``key_cols`` form the key and
    the entry at ``val_col`` is the value. When a key occurs more than once
    ``"same key"`` is printed and the later row wins.
    """
    mapping = {}
    for row in np.array(df):
        key = tuple(row[key_cols])
        if key in mapping:
            print("same key")
        mapping[key] = row[val_col]
    return mapping


def init_tensor(modeDict):
    """Create an all-zero tensor with one axis per mode.

    The length of each axis is the number of members of the corresponding
    entry of ``modeDict``, in iteration order.
    """
    # The unused running product of the axis sizes has been removed.
    shape = [len(modeDict[mode]) for mode in modeDict]
    return tl.zeros(shape)

def read_tensor(tensor, maplist):
    """Return the element of ``tensor`` reached by indexing with each entry
    of ``maplist`` in turn (``tensor[i][j][k]...``)."""
    head, tail = maplist[0], maplist[1:]
    node = tensor[head]
    while tail:
        node = node[tail[0]]
        tail = tail[1:]
    return node
   


In [97]:
# For Wiki


def scale(df, cols):
    """Return a copy of ``df`` with the selected columns standardized.

    Each entry of ``cols`` is a positional column selector passed to
    ``iloc``; the selection is replaced by ``preprocessing.scale`` of the
    original data. ``df`` itself is not changed.
    """
    scaled = df.copy()
    for selector in cols:
        scaled.iloc[:, selector] = preprocessing.scale(df.iloc[:, selector])
    return scaled

def path_to_row(df, source='Source', target='Target'):
    """Map each (source, target) pair to the position of its row in ``df``.

    If a pair occurs several times the last occurrence is kept.
    """
    records = (df.iloc[position, :] for position in range(len(df)))
    return {
        (record[source], record[target]): position
        for position, record in enumerate(records)
    }
        
def get_val(df, path, rows):
    """Look up the value for a (source, target, column) path.

    Parameters
    ----------
    df : pandas.DataFrame
    path : sequence
        ``(source, target, column_name)``.
    rows : dict
        Maps ``(source, target)`` to the row *position* in ``df`` (as built
        by ``path_to_row``).

    Returns
    -------
    The cell value, or ``None`` when the (source, target) pair has no row.
    """
    pair = (path[0], path[1])
    if pair not in rows:
        return None
    # `rows` stores positions, so index the row positionally; label-based
    # `.loc` only agreed with it for a default 0..n-1 index.
    return df[path[2]].iloc[rows[pair]]
    
def convert_index(p, indDict):
    """Convert a path of names into integer indices.

    The i-th name of ``p`` is translated with the i-th lookup dict of
    ``indDict``; the translated indices are returned as a list.
    """
    converted = []
    axis = 0
    for name in p:
        converted.append(indDict[axis][name])
        axis += 1
    return converted

def write_tensor(tensor, maplist, val):
    """Write ``val`` into ``tensor`` at the index path ``maplist`` (in place).

    Intermediate levels are asserted to be NumPy arrays. If the addressed
    position is itself an array (path too short) a warning is printed
    before it is overwritten.
    """
    head = maplist[0]
    remaining = maplist[1:]

    if not remaining:
        # Last index of the path: this is where the value goes.
        if isinstance(tensor[head], np.ndarray):
            print("Maplist length not correct!")
        tensor[head] = val
        return None

    assert(isinstance(tensor[head], np.ndarray))
    return write_tensor(tensor[head], remaining, val)
    
def convert_paths(from_names, indexDict):
    """Apply ``convert_index`` to every path in ``from_names``.

    Returns the list of converted index paths, in the same order.
    """
    return [convert_index(path, indexDict) for path in from_names]

    
def fill_tensor(tensor, df, path_lst, index_lst, source='Source', target='Target'):
    """Fill ``tensor`` with the DataFrame values addressed by ``path_lst``.

    Parameters
    ----------
    tensor : tensor to fill (modified in place).
    df : pandas.DataFrame holding the values.
    path_lst : iterable of ``(source, target, column)`` paths.
    index_lst : list of dict
        Per-mode name -> index lookups.
    source, target : str
        Names of the columns identifying a row.

    Returns
    -------
    (tensor, missing) : the filled tensor and the list of paths for which no
    row existed or the value was NaN; those entries are set to 0.
    """
    row_language = path_to_row(df, source=source, target=target)
    missing = []
    for p in path_lst:
        val = get_val(df, p, row_language)
        index = convert_index(p, index_lst)
        # Identity test for None (instead of `!= None`), then reject NaN.
        if val is not None and not math.isnan(val):
            write_tensor(tensor, index, val)
        else:
            write_tensor(tensor, index, 0)
            missing.append(p)
    return tensor, missing

                    
def create_mask(missing, tensor):
    """Build a float32 mask shaped like ``tensor``.

    The mask is 1 everywhere except at the index paths listed in
    ``missing``, where it is set to 0.
    """
    mask = tl.ones(tensor.shape, dtype=tl.float32)
    for index_path in missing:
        write_tensor(mask, index_path, 0)
    return mask



In [98]:
def reconstruct(tensor, mask_tensor, decomp ='pca', tucker_rank=(39, 44, 22), cp_rank=5):
    """Reconstruct a masked tensor with one of three decompositions.

    Parameters
    ----------
    tensor : tensor with the observed values.
    mask_tensor : tensor of the same shape; it is multiplied element-wise
        with ``tensor`` and passed as ``mask`` to the PCA and CP solvers.
    decomp : {'pca', 'tucker_decomp', 'cp'}, default 'pca'
        Decomposition to use.
    tucker_rank : sequence of int, default (39, 44, 22)
        Rank passed to ``tucker`` (previously hard-coded).
    cp_rank : int, default 5
        Rank passed to ``parafac`` (previously hard-coded).

    Returns
    -------
    The reconstructed tensor. For ``'pca'`` this is the first element
    returned by ``robust_pca``.

    Raises
    ------
    ValueError
        If ``decomp`` is not one of the supported names (previously this led
        to an unbound local variable).
    """
    if decomp not in ('pca', 'tucker_decomp', 'cp'):
        raise ValueError("Unknown decomp %r; expected 'pca', 'tucker_decomp' or 'cp'" % (decomp,))

    masked_tensor = tensor * mask_tensor

    if decomp == 'pca':
        pca_res = robust_pca(masked_tensor, mask=mask_tensor)
        return pca_res[0]

    if decomp == 'tucker_decomp':
        # Note: the mask is not forwarded to the Tucker solver here.
        core, factors = tucker(masked_tensor, rank = list(tucker_rank))
        return tucker_to_tensor((core, factors))

    (w, f), err = parafac(masked_tensor, rank=cp_rank, mask = mask_tensor, return_errors = True)
    return tl.kruskal_to_tensor((w, f))

In [99]:
def read_val(ind_list, tensor, s, t, f):
    """Read one entry of a 3-mode tensor by name.

    ``s``, ``t`` and ``f`` are translated to indices with the first, second
    and third lookup dict of ``ind_list`` respectively.
    """
    first_map, second_map, third_map = ind_list[0], ind_list[1], ind_list[2]
    return tensor[first_map[s]][second_map[t]][third_map[f]]

def transform(orig_col, val):
    """Map a standardized value back to the scale of ``orig_col``.

    Uses ``orig_col.mean()`` and ``orig_col.std()`` and returns
    ``val * std + mean``.
    """
    center = orig_col.mean()
    spread = orig_col.std()
    return val * spread + center

In [100]:
def construct_tensor_from_df(df, lang_modes=[0,1], need_scale=True,\
                            source='Source', target='Target', feat_mode=None): 
    """Build a 3-mode tensor from a DataFrame.

    Modes 0 and 1 are the unique values of the columns at positions
    ``lang_modes``; mode 2 is the set of column names selected by
    ``feat_mode``. When ``need_scale`` is true the ``feat_mode`` columns are
    standardized with ``scale`` before the tensor is filled.

    Returns
    -------
    (tensor, missing, modeDict, indLst, all_p) : the filled tensor, the
    paths that had no value, the mode dictionary, the per-mode index
    lookups, and the list of all paths.
    """
    # Modes 0 and 1 come from the two language columns.
    modeDict, _ = get_modes(df, lang_modes)

    # Mode 2: the selected column names.
    modeDict[2] = set(df.columns[feat_mode])

    # Every possible index of the tensor, and the name -> index lookups.
    all_p = all_paths(modeDict)
    indLst = get_index(modeDict)

    empty_tensor = init_tensor(modeDict)

    # Either a standardized copy or a plain copy of the input frame.
    values_df = scale(df, [feat_mode]) if need_scale else copy.deepcopy(df)

    t_bleu, missing = fill_tensor(empty_tensor, values_df, all_p, indLst, source=source, target=target)

    return t_bleu, missing, modeDict, indLst, all_p

In [101]:
def mask_out_test_y(test_y_df, missing, tensor, index_dict, data):
    """Build a mask that hides both missing entries and the test targets.

    Parameters
    ----------
    test_y_df : pandas.DataFrame
        Test rows whose target entry must be hidden from the decomposition.
    missing : list
        Paths of entries that are already missing.
    tensor : tensor; only its shape is used (via ``create_mask``).
    index_dict : list of dict
        Per-mode name -> index lookups.
    data : {'wiki', 'tsf', 'tsfmt'}
        Selects which columns identify a row and which column is the target.

    Returns
    -------
    (new_mask, test_entries) : the combined mask and the list of
    ``(source, target, metric)`` paths that were masked for the test rows.

    Raises
    ------
    ValueError
        If ``data`` is not one of the supported names.
    """
    columns_by_data = {
        'wiki': ('Source', 'Target', 'BLEU'),
        'tsf': ('Task lang', 'Aux lang', 'Accuracy'),
        'tsfmt': (' Source lang', 'Transfer lang', 'BLEU'),
    }
    if data not in columns_by_data:
        raise ValueError("Unknown data %r; expected one of %s"
                         % (data, sorted(columns_by_data)))
    source_col, target_col, metric = columns_by_data[data]

    # Copy so the caller's list of missing paths is not extended.
    new_missing = copy.deepcopy(missing)
    test_entries = []
    # Mask out the target value of every test row.
    for _, row in test_y_df.iterrows():
        path = (row[source_col], row[target_col], metric)
        new_missing.append(path)
        test_entries.append(path)

    missing_combined = convert_paths(new_missing, index_dict)
    new_mask = create_mask(missing_combined, tensor)
    return new_mask, test_entries

In [102]:
def mask_out_test_y_f1(test_y_df, missing, tensor, index_dict):
    """Build a mask hiding missing entries and the F1 values of the test rows.

    Each test row contributes the path ``(model, dataset, attribute, bucket)``;
    the first and last character of the three name columns are stripped and
    the bucket is converted to ``int``.

    Returns
    -------
    (new_mask, test_entries) : the combined mask (shaped like ``tensor``) and
    the list of test paths that were masked.
    """
    # Copy so the caller's list is not extended. The unused deep copy of the
    # whole tensor that used to be made here has been dropped.
    new_missing = copy.deepcopy(missing)
    test_entries = []
    # Mask out f1 for test entries
    for _, row in test_y_df.iterrows():
        path = (row['model'][1:-1], row['dataset'][1:-1],
                row['attribute'][1:-1], int(row['bucket']))
        new_missing.append(path)
        test_entries.append(path)

    missing_combined = convert_paths(new_missing, index_dict)
    new_mask = create_mask(missing_combined, tensor)
    return new_mask, test_entries

In [242]:
def mask_out_test_y_cws(test_y_df, missing, tensor, index_dict):
    """Build a mask hiding missing entries and the F1 values of CWS test rows.

    Each test row contributes the path ``(dataset, model, attribute, bucket)``
    (dataset first, unlike ``mask_out_test_y_f1``); the first and last
    character of the three name columns are stripped and the bucket is
    converted to ``int``.

    Returns
    -------
    (new_mask, test_entries) : the combined mask (shaped like ``tensor``) and
    the list of test paths that were masked.
    """
    # Copy so the caller's list is not extended. The unused deep copy of the
    # whole tensor that used to be made here has been dropped.
    new_missing = copy.deepcopy(missing)
    test_entries = []
    # Mask out f1 for test entries
    for _, row in test_y_df.iterrows():
        path = (row['dataset'][1:-1], row['model'][1:-1],
                row['attribute'][1:-1], int(row['bucket']))
        new_missing.append(path)
        test_entries.append(path)

    missing_combined = convert_paths(new_missing, index_dict)
    new_mask = create_mask(missing_combined, tensor)
    return new_mask, test_entries

In [238]:
def convert_to_df_cws(tensor, mode, index):
    """Flatten the CWS tensor into a long-format DataFrame.

    Same as ``convert_to_df_f1`` except for the column order: the result has
    columns ``dataset``, ``model``, ``attribute``, ``bucket`` and ``f1``, one
    row per tensor entry. The first four columns hold the text of each path
    element as rendered by ``str(tuple)``, so string names keep their quotes.
    """
    all_p = all_paths_f1(mode)

    # Build the column in one go. The previous per-row chained assignment
    # (`df.iloc[i, :]['features'] = ...`) may write to a temporary copy.
    df = pd.DataFrame({'features': [str(p)[1:-1] for p in all_p]})
    f1_list = [read_tensor_f1(tensor, convert_index_f1(p, index)) for p in all_p]

    df[['dataset','model',"attribute","bucket"]] = df['features'].str.split(', ',expand=True)
    df = df.drop(['features'], axis=1)
    df['f1'] = f1_list
    return df

def categorize_df_cws(df, ind_lst):
    """Encode the name columns of a CWS DataFrame as integers.

    Every column except the last two is processed: the first and last
    character of each entry are removed and the remainder is looked up in
    the corresponding dict of ``ind_lst``. The ``bucket`` column is made
    numeric. Returns a new DataFrame and leaves ``df`` unchanged.
    """
    result = copy.deepcopy(df)
    for col_pos in range(len(df.columns) - 2):
        codes = ind_lst[col_pos]
        for row_pos in range(len(df.index)):
            cell = df.iloc[row_pos, col_pos]
            stripped = str(cell)[1:len(cell) - 1]
            result.iloc[row_pos, col_pos] = codes[str(stripped)]
    result['bucket'] = pd.to_numeric(result['bucket'])
    return result
        

### Construct F1 Tensor

In [276]:
def compare_models(df, tensor, missing, index_lst,
                   fold, scale_back=True,
                   cp_rank=3, data="wiki",
                   non_scaled_tensor=None,
                   non_scaled_mode_dict=None,
                   non_scaled_index=None,
                   reg_E=1, reg_J=1):
    """Cross-validate tensor completion against tabular regressors.

    For every fold the test targets are masked out of ``tensor``; the tensor
    is then reconstructed with robust PCA and with a CP decomposition, and
    the masked entries are compared with their true values. XGBoost, a
    Gaussian process (Matern kernel), LightGBM and a predict-the-train-mean
    baseline are fitted on the same split.

    Parameters
    ----------
    df : pandas.DataFrame
        Tabular view of the data; split by the notebook's own
        ``train_test_split``.
    tensor : tensor holding the same values as ``df``.
    missing : list
        Paths of entries already missing from ``tensor``.
    index_lst : list of dict
        Per-mode name -> index lookups for ``tensor``.
    fold : int
        Number of cross-validation folds.
    scale_back : bool, default True
        If true, tensor predictions and true values are mapped back to the
        original scale before scoring ('wiki', 'tsf', 'tsfmt': with the
        mean/std of the target column of ``df``; 'f1': with the mean/std of
        ``non_scaled_tensor``; 'cws': no rescaling is applied).
    cp_rank : int, default 3
        Rank of the CP decomposition.
    data : {'wiki', 'tsf', 'tsfmt', 'f1', 'cws'}
        Dataset layout.
    non_scaled_tensor, non_scaled_mode_dict, non_scaled_index :
        Only used for ``data='f1'`` with ``scale_back=True``; passed to
        ``scale_f1`` to recover the original mean and standard deviation.
    reg_E, reg_J : float
        Regularization parameters forwarded to ``robust_pca``.

    Returns
    -------
    Six lists with one RMSE per fold, in this order: robust PCA, CP,
    XGBoost, Gaussian process, LightGBM, mean baseline.

    Raises
    ------
    ValueError
        If ``data`` is not one of the supported names.
    """
    # Target column per language-pair dataset; 'f1' and 'cws' are bucketed tables.
    target_columns = {'wiki': 'BLEU', 'tsf': 'Accuracy', 'tsfmt': 'BLEU'}
    bucket_datasets = ('f1', 'cws')
    if data not in target_columns and data not in bucket_datasets:
        raise ValueError("Unknown data %r; expected one of %s"
                         % (data, sorted(target_columns) + list(bucket_datasets)))

    print("fold",fold)
    folds, _ = train_test_split(df, fold)

    # Loop-invariant: the mean/std of the unscaled F1 tensor used to be
    # recomputed for every single test entry.
    f1_mean = f1_sd = None
    if scale_back and data == 'f1':
        _, f1_mean, f1_sd = scale_f1(non_scaled_tensor, non_scaled_mode_dict, non_scaled_index)

    def _rmse(pred, truth):
        # Root mean squared error between two equally long sequences.
        return np.sqrt(mean_squared_error(pred, truth))

    pca_rmse_lst = []
    cp_rmse_lst = []
    xgb_rmse_lst = []
    gp_rmse_lst = []
    lg_rmse_lst = []
    bs_rmse_lst = []

    for train, test in folds:

        # ---- tensor completion -------------------------------------------
        if data in target_columns:
            combined_mask, test_missing_entries = mask_out_test_y(test, missing, tensor, index_lst, data)
        elif data == 'f1':
            combined_mask, test_missing_entries = mask_out_test_y_f1(test, missing, tensor, index_lst)
        else:
            combined_mask, test_missing_entries = mask_out_test_y_cws(test, missing, tensor, index_lst)

        masked_tensor = tensor * combined_mask

        pca_res = robust_pca(masked_tensor, mask=combined_mask, reg_E=reg_E, reg_J=reg_J)
        # Use only the low rank part, see http://jeankossaifi.com/blog/rpca.html
        pca_reconstructed = pca_res[0]

        (w, f), err = parafac(masked_tensor, rank=cp_rank, mask = combined_mask, return_errors = True)
        cp_reconstructed = tl.kruskal_to_tensor((w, f))

        tensor_pred_pca = []
        tensor_pred_cp = []
        true_y = []

        for entry in test_missing_entries:
            blank = convert_index(entry, index_lst)

            pred_pca = read_tensor(pca_reconstructed, blank)
            pred_cp = read_tensor(cp_reconstructed, blank)
            true = read_tensor(tensor, blank)

            # if tensor has been standardized
            if scale_back:
                if data in target_columns:
                    target_col = df.loc[:, target_columns[data]]
                    pred_pca = transform(target_col, pred_pca)
                    pred_cp = transform(target_col, pred_cp)
                    true = transform(target_col, true)
                elif data == 'f1':
                    pred_pca = scale_back_f1(pred_pca, f1_mean, f1_sd)
                    pred_cp = scale_back_f1(pred_cp, f1_mean, f1_sd)
                    true = scale_back_f1(true, f1_mean, f1_sd)
                # 'cws': values are left on the tensor's scale.

            tensor_pred_pca.append(pred_pca)
            tensor_pred_cp.append(pred_cp)
            true_y.append(true)

        # ---- tabular features --------------------------------------------
        if data in target_columns:
            # Column 2 is the target; the feature columns start at position 3.
            fold_train_X = train.iloc[:,3:]
            fold_train_y = train.iloc[:,2]

            fold_test_X = test.iloc[:,3:]
            fold_test_y = test.iloc[:,2]
        else:
            # Encode the name columns as integers; the last column is the target.
            train = categorize_df_f1(train, index_lst)
            test = categorize_df_f1(test, index_lst)

            fold_train_X = train.iloc[:,:-1]
            fold_train_y = train.iloc[:,-1]

            fold_test_X = test.iloc[:,:-1]
            fold_test_y = test.iloc[:,-1]

        # ---- XGBoost ------------------------------------------------------
        reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate=0.1,
                                   max_depth=10, n_estimators=100)
        res = reg.fit(fold_train_X, fold_train_y)
        xgb_pred_y = res.predict(fold_test_X)

        # ---- Gaussian process ---------------------------------------------
        ker = Matern()
        gpr = GaussianProcessRegressor(kernel=ker).fit(fold_train_X, fold_train_y)
        gp_pred_y = gpr.predict(fold_test_X)

        # ---- LightGBM -----------------------------------------------------
        lg = lgb.LGBMRegressor(objective='regression')
        # NOTE(review): `verbose` as a fit() argument may not be accepted by
        # newer LightGBM releases; confirm against the installed version.
        lg.fit(fold_train_X, fold_train_y, verbose=False)
        lg_pred_y = lg.predict(fold_test_X)

        # ---- baseline: always predict the training mean --------------------
        mean_y = np.mean(fold_train_y)
        base_pred_y = [mean_y]*len(lg_pred_y)

        # ---- scores -------------------------------------------------------
        pca_rmse_lst.append(_rmse(tensor_pred_pca, true_y))
        cp_rmse_lst.append(_rmse(tensor_pred_cp, true_y))
        xgb_rmse_lst.append(_rmse(xgb_pred_y, fold_test_y))
        gp_rmse_lst.append(_rmse(gp_pred_y, fold_test_y))
        lg_rmse_lst.append(_rmse(lg_pred_y, fold_test_y))
        bs_rmse_lst.append(_rmse(base_pred_y, fold_test_y))

    return pca_rmse_lst, cp_rmse_lst, xgb_rmse_lst, gp_rmse_lst, lg_rmse_lst, bs_rmse_lst

In [104]:
# Load the pickled results and keep the F1 dictionary (first element of the pickled list)
fileName = "model_data_metric_bucket_evals6.pkl"
dict_f1 = get_dict_f1(fileName)

# Re-key the buckets of every attribute as 0, 1, 2, ...
dict_f1 = order_buckets_f1(dict_f1)
# NOTE: a bare name in the middle of a cell is not displayed; only a cell's last expression is
dict_f1


# Find out all unique feature names and define the 4 dimensions for a tensor
# 4 modes: 0: models, 1: datasets, 2: attributes, 3: buckets
mdict = get_modes_f1(dict_f1, 4)
mdict

# Get a mapping between all unique feature names and tensor indices
index_mode = get_index_f1(mdict)

# Size of the tensor: numModel*numData*numAttribute*numBucket = 11*6*9*4
# Print the name -> index mapping of each of the four modes
print(index_mode[0])
print(index_mode[1])
print(index_mode[2])
print(index_mode[3])

# Construct a tensor with a mode dict, the original tensor dict, and mapping between mode and indices
t1 = construct_tensor_f1(mdict, dict_f1,index_mode)
t1

# Convert the F1 tensor to a long-format dataframe (one row per tensor entry)
f1_df = convert_to_df_f1(t1, mdict, index_mode)

# Convert the name columns to integer codes
#int_f1_df = categorical_to_int(f1_df)
int_f1_df = categorize_df_f1(f1_df, index_mode)

# save the table
#int_f1_df.to_csv('f1_table.csv') 

# Specify features X and y
#X = f1_df[['model','dataset',"attribute","bucket"]]
#y = f1_df['f1']


{'CbertWnon_snonMlp': 0, 'CcnnWglove_cnnCrf': 1, 'CcnnWnone_lstmCrf': 2, 'CflairWglove_lstmCrf': 3, 'CelmoWnone_lstmCrf': 4, 'CcnnWglove_lstmCrf': 5, 'CcnnWglove_lstmMlp': 6, 'CcnnWrand_lstmCrf': 7, 'CflairWnone_lstmCrf': 8, 'CnoneWrand_lstmCrf': 9, 'CelmoWglove_lstmCrf': 10}
{'notebc': 0, 'conll03': 1, 'notebn': 2, 'notewb': 3, 'wnut16': 4, 'notemz': 5}
{'oDen': 0, 'eDen': 1, 'eLen': 2, 'eFre': 3, 'sLen': 4, 'eCon': 5, 'tag': 6, 'tCon': 7, 'tFre': 8}
{0: 0, 1: 1, 2: 2, 3: 3}


In [105]:
t1.shape

(11, 6, 9, 4)

In [106]:
# # xgboost

# f1X = int_f1_df.iloc[:,:-1]
# f1y = int_f1_df.iloc[:,-1]

# #fold_test_X = test.iloc[:,3:]
# #fold_test_y = test.iloc[:,2]

# reg_f1 = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate=0.1,
#                            max_depth=10, n_estimators=100)
# res_f1 = reg_f1.fit(f1X, f1y)

# xgb_pred_y = res_f1.predict(f1X)
# xgb_pred_y

## NER F1
### not scaled 

In [107]:
compare_models(f1_df, t1, [],index_mode, 5, data='f1',scale_back=False,reg_E=1, reg_J=1)

fold 5


KeyboardInterrupt: 

In [176]:
#ns_gp_result_f1
ns_pca_result_f1, ns_cp_result_f1, ns_xgb_result_f1, ns_gp_result_f1,ns_lg_result_f1, ns_bs_result_f1 = compare_models(f1_df, t1, [], \
                                                                     index_mode, 5, data='f1',\
                                                                     scale_back=False,\
                                                                     reg_E=1, reg_J=1\
                                                                    )

fold 5


In [177]:
np.mean(ns_pca_result_f1)

0.052196927

In [178]:
np.mean(ns_cp_result_f1)

0.06890764

In [179]:
np.mean(ns_xgb_result_f1)

0.05419675368129184

In [180]:
np.mean(ns_gp_result_f1)

0.07680016503647537

In [181]:
np.mean(ns_lg_result_f1)

0.05886706942409233

In [182]:
np.mean(ns_bs_result_f1)

0.20871759077991756

### scaled

In [92]:
scaled_t1,_,_ = scale_f1(t1, mdict, index_mode)
# compare_models returns six RMSE lists (PCA, CP, XGBoost, GP, LightGBM, baseline);
# all six must be unpacked, otherwise the assignment fails with
# "too many values to unpack".
(s_pca_result_f1, s_cp_result_f1, s_xgb_result_f1,
 s_gp_result_f1, s_lg_result_f1, s_bs_result_f1) = compare_models(
    f1_df, scaled_t1, [], index_mode, 5, data='f1',
    scale_back=True,
    non_scaled_tensor=t1,
    non_scaled_mode_dict=mdict,
    non_scaled_index=index_mode,
)

fold 5


TypeError: Singleton array array(5) cannot be considered a valid collection.

In [236]:
#scaled_t1 = scale_f1(t1, mdict, index_mode)
#scaled_t1

In [237]:
np.mean(s_pca_result_f1)

0.058243893

In [238]:
np.mean(s_cp_result_f1)

0.06304242

In [22]:
np.mean(s_xgb_result_f1)

NameError: name 's_xgb_result_f1' is not defined

In [21]:
np.mean(s_gp_result_f1)

NameError: name 's_gp_result_f1' is not defined

In [None]:
scaled_t1,mm,ss= scale_f1(t1, mdict, index_mode)


## Wiki MT
### Compare XGboost with Tensor PCA

In [259]:
# Read in wiki data
with open("dict_file.pkl", "rb") as f:
    wkdata = pickle.load(f)
    
X = wkdata['BLEU']['feats']
y = wkdata['BLEU']['labels']
langs = wkdata['BLEU']['langs']

In [260]:
# Obtain a complete dataframe
wiki = pd.concat([langs, y, X], axis=1)
#wiki.columns

In [261]:
# Construct a tensor with 3 modes, scaled 
tensor, missing, mode_dict, ind_lst, all_p = construct_tensor_from_df(wiki, lang_modes=[0,1], \
                                           feat_mode=list(range(2,len(wiki.columns))), \
                                           need_scale=True)

In [262]:
#tensor
#len(ind_lst[1])

In [263]:
# Sparsity: fraction of tensor entries that are missing
len(missing)/tensor.size
#tensor.size
#tensor.shape

0.3458251150558843

### Do not forget to set REG_E, REG_J terms !

In [264]:

# DO NOT FORGET TO SET THE REG_E, REG_J terms !!!!!!
pca_result, cp_result, xgb_result, gp_result, lg_result, bs_result = compare_models(wiki, tensor, missing, ind_lst, 5, reg_E=0.6, reg_J=1.65)

fold 5


In [265]:
pca_result

[3.5065002847396687,
 2.0757556740884007,
 2.6479330137907273,
 2.9549646410347155,
 3.4725733117494366]

In [275]:
cp_result

[4.5633073504571255,
 3.2495450978397593,
 3.6346073224477298,
 4.12238532160684,
 4.793139240513713]

In [267]:
xgb_result

[2.59784630274561,
 2.141474952802471,
 2.994903010038946,
 2.3822415590608292,
 2.6302115501713765]

In [268]:
np.mean(pca_result)

2.9315453850805895

In [269]:
np.mean(cp_result)

4.072596866573034

In [270]:
np.mean(xgb_result)

2.5493354749638466

In [271]:
np.mean(gp_result)

10.614821602520268

In [273]:
np.mean(lg_result)

2.2876414815595645

In [274]:
np.mean(bs_result)

6.38012240548762

## tsf pos
### Compare XGboost with Tensor PCA

In [300]:
# Read in tsf data
with open("dict_file_tsf.pkl", "rb") as f:
    tsfdata = pickle.load(f)
    
X_tsf = tsfdata['Accuracy']['feats']
y_tsf = tsfdata['Accuracy']['labels']
langs_tsf = tsfdata['Accuracy']['langs']

In [301]:
#tsf

In [302]:
# Obtain a complete dataframe
tsf = pd.concat([langs_tsf, y_tsf, X_tsf], axis=1)


In [303]:
# Construct a tensor with 3 modes, not scaled (need_scale=False)
tensor_tsf, missing_tsf, mode_dict_tsf, ind_lst_tsf, all_p_tsf = construct_tensor_from_df(tsf, lang_modes=[0,1], \
                                           feat_mode=list(range(2,len(tsf.columns))),\
                                           need_scale=False, source='Task lang', target='Aux lang')

In [304]:

# Sparsity: fraction of tensor entries that are missing
len(missing_tsf)/tensor_tsf.size
#tensor.size
#len(missing_tsf)
#missing_tsf
#tensor_tsf.shape


0.01858974358974359

In [305]:
#tensor_tsf

### Do not forget to set REG_E, REG_J terms !

In [306]:

# DO NOT FORGET TO SET THE REG_E, REG_J terms !!!!!!
pca_result_tsf, cp_result_tsf, xgb_result_tsf, gp_result_tsf,lg_result_tsf, bs_result_tsf = compare_models(tsf, tensor_tsf, missing_tsf, ind_lst_tsf, 5, \
                                                              data='tsf', scale_back=False)

fold 5


In [307]:
pca_result_tsf

[6.2992236791204,
 5.220995138959067,
 6.244504347252348,
 5.309591453857032,
 5.661249683280757]

In [308]:
cp_result_tsf

[35.75320383101946,
 35.054940952357185,
 34.06411820079632,
 35.09605539789433,
 34.10117319297286]

In [309]:
xgb_result_tsf

[8.372023745611038,
 6.33853626718467,
 8.747223800968516,
 6.311569132593977,
 7.227456793547817]

In [310]:
np.mean(pca_result_tsf)

5.747112860493921

In [311]:
np.mean(cp_result_tsf)

34.81389831500803

In [312]:
np.mean(xgb_result_tsf)

7.399361947981204

In [313]:
np.mean(gp_result_tsf)

56.5308513830819

In [314]:
np.mean(lg_result_tsf)

7.050069030751123

In [315]:
np.mean(bs_result_tsf)

29.10018021212573

## tsf mt
### Compare XGboost with Tensor PCA

In [60]:
# Read in tsf mt data
with open("dict_file_tsfmt.pkl", "rb") as f:
    tsfmt = pickle.load(f)
    
X_tsfmt = tsfmt['BLEU']['feats']
y_tsfmt = tsfmt['BLEU']['labels']
langs_tsfmt = tsfmt['BLEU']['langs']

In [61]:
#tsfmt

In [62]:
# Obtain a complete dataframe
tsfmt = pd.concat([langs_tsfmt, y_tsfmt, X_tsfmt], axis=1)
tsfmt = tsfmt.drop(columns=['Target lang'])

In [73]:
# Construct a tensor with 3 modes, not scaled (need_scale=False)
tensor_tsfmt, missing_tsfmt, mode_dict_tsfmt, ind_lst_tsfmt, all_p_tsfmt = construct_tensor_from_df(tsfmt, \
                                           lang_modes=[0,1], \
                                           feat_mode=list(range(2,len(tsfmt.columns))),\
                                           need_scale=False, source=' Source lang', target='Transfer lang')

In [74]:
len(missing_tsfmt)/tensor_tsfmt.size

0.018518518518518517

In [75]:
tensor_tsfmt.shape

(54, 54, 21)

In [290]:

# DO NOT FORGET TO SET THE REG_E, REG_J terms !!!!!!
pca_result_tsfmt, cp_result_tsfmt, xgb_result_tsfmt, gp_result_tsfmt, lg_result_tsfmt, bs_result_tsfmt = compare_models(tsfmt, tensor_tsfmt, missing_tsfmt, ind_lst_tsfmt, 5, \
                                                              data='tsfmt', scale_back=False, reg_E=0.6, reg_J=3)

fold 5


In [291]:
pca_result_tsfmt

[1.5382860382122203,
 1.3828968519866267,
 0.8663211394403862,
 1.4093513545570078,
 1.6031248685127348]

In [292]:
cp_result_tsfmt

[8.404722935871503,
 8.505859801163583,
 8.869100714358991,
 8.570899948944483,
 8.978368776632799]

In [293]:
xgb_result_tsfmt

[1.5414878818578572,
 1.455519084439652,
 0.9791849288407051,
 1.5129307068516908,
 1.7787268568990355]

In [294]:
np.mean(pca_result_tsfmt)

1.3599960505417952

In [295]:
np.mean(cp_result_tsfmt)

8.665790435394273

In [296]:
np.mean(xgb_result_tsfmt)

1.4535698917777882

In [297]:
np.mean(gp_result_tsfmt)

23.865616524712625

In [298]:
np.mean(lg_result_tsfmt)

1.3073909851934349

In [299]:
np.mean(bs_result_tsfmt)

10.770827148260167

## Load CWS

In [225]:
# Read in a dictionary 
true_cws =  "true_cws_dict.pkl"
with open(true_cws, "rb") as f:
    cws_dict = pickle.load(f)

In [226]:
cws_modes = get_modes_f1(cws_dict, 4)

In [227]:
cws_index = get_index_f1(cws_modes)

In [230]:
cws_t = construct_tensor_f1(cws_modes, cws_dict, cws_index)


In [239]:
# convert f1 tensor to a dataframe
cws_df = convert_to_df_cws(cws_t, cws_modes, cws_index)

# convert columns to integers
#int_f1_df = categorical_to_int(f1_df)
int_cws_df = categorize_df_cws(cws_df, cws_index)

In [240]:
cws_df

Unnamed: 0,dataset,model,attribute,bucket,f1
0,'sxu','Cw2vBw2vLstmCrf','oDen',0,0.955938
1,'sxu','Cw2vBw2vLstmCrf','oDen',1,0.941879
2,'sxu','Cw2vBw2vLstmCrf','oDen',2,0.929166
3,'sxu','Cw2vBw2vLstmCrf','oDen',3,0.934691
4,'sxu','Cw2vBw2vLstmCrf','cFre',0,0.876042
5,'sxu','Cw2vBw2vLstmCrf','cFre',1,0.891689
6,'sxu','Cw2vBw2vLstmCrf','cFre',2,0.940681
7,'sxu','Cw2vBw2vLstmCrf','cFre',3,0.972888
8,'sxu','Cw2vBw2vLstmCrf','wFre',0,0.849907
9,'sxu','Cw2vBw2vLstmCrf','wFre',1,0.814121


In [277]:
#ns_gp_result_f1
pca_result_cws, cp_result_cws, xgb_result_cws, gp_result_cws,lg_result_cws, bs_result_cws = compare_models(cws_df, cws_t, [], \
                                                                     cws_index, 5, data='cws',\
                                                                     scale_back=False,\
                                                                     reg_E=1, reg_J=1\
                                                                    )

fold 5


In [278]:
pca_result_cws

[0.022234488, 0.025269667, 0.030292174, 0.040998533, 0.026698781]

In [279]:
cp_result_cws

[0.034425203, 0.047388755, 0.04387816, 0.046927255, 0.041738246]

In [280]:
xgb_result_cws

[0.02238433635627803,
 0.016810709704798858,
 0.01885663775968977,
 0.027082089256251975,
 0.021411307921565745]

In [281]:
gp_result_cws

[0.0892792442340237,
 0.07384662082151357,
 0.08242185138624998,
 0.10057866934186983,
 0.10194449660125048]

In [282]:
lg_result_cws

[0.03486975941658747,
 0.03472427003199159,
 0.04159289545887568,
 0.050123173220666596,
 0.04158469596137467]

In [283]:
bs_result_cws

[0.12049468931846594,
 0.13820834763262552,
 0.14978932801740874,
 0.16011736309027552,
 0.11410698791924323]

In [284]:
np.mean(pca_result_cws)

0.029098729

In [285]:
np.mean(cp_result_cws)

0.042871527

In [286]:
np.mean(xgb_result_cws)

0.021309016199716875

In [287]:
np.mean(gp_result_cws)

0.08961417647698151

In [288]:
np.mean(lg_result_cws)

0.040578958817899205

In [289]:
np.mean(bs_result_cws)

0.13654334319560377