# Processing the data

This notebook processes part of the data that was used for my capstone. The code below does the following:
1. Initial processing of the raw OPP-115 data
2. Dividing the data into 5 folds, each with a training, validation and test set
3. Locating the best parameters for each class, based on the grid search that was done in the 'pribert_full_model.ipynb' file

# Dividing the data
Below, the *'op115_processed.csv'* data is used to create 5 different training / validation / test sets.

In [None]:
!pip install verstack
import verstack
import pandas as pd
import numpy as np

In [None]:
# Fold 0 of the previously generated splits: each path is defined right next to
# the frame that is read from it.
TRAIN_PATH = '../input/privbert-data/op115_data/op115_train_k0.csv'
op115_train = pd.read_csv(TRAIN_PATH)

VAL_PATH = '../input/privbert-data/op115_data/op115_val_k0.csv'
op115_val = pd.read_csv(VAL_PATH)

TEST_PATH = '../input/privbert-data/op115_data/op115_test_k0.csv'
op115_test = pd.read_csv(TEST_PATH)

# The complete processed dataset, before any splitting.
ALL_PATH = '../input/privbert-data/op115_processed.csv'
op115_all = pd.read_csv(ALL_PATH)

In [None]:
# Show which policy ids ended up in the train, validation and test frame (in that order).
for split_frame in (op115_train, op115_val, op115_test):
    print(split_frame.policy_uid.unique())

In [None]:
from shutil import copyfile

# Files to stage in the working directory, as (source, destination) pairs.
# The Python modules must keep their .py suffix.
# NOTE(review): two CSVs get a different name at the destination
# (privbert_gridsearches3 -> privbert_gridsearches, tresholds -> treshold) — confirm this is intended.
files_to_stage = [
    ("../input/privbert-data/data_processing.py", "../working/data_processing.py"),
    ("../input/privbert-data/pytorch_classifier.py", "../working/pytorch_classifier.py"),
    ("../input/privbert-data/hierarchical_data.py", "../working/hierarchical_data.py"),
    ("../input/privbert-data/privbert_gridsearches3.csv", "../working/privbert_gridsearches.csv"),
    ("../input/privbert-data/tresholds.csv", "../working/treshold.csv"),
]

for source_path, target_path in files_to_stage:
    copyfile(src = source_path, dst = target_path)

In [None]:
from data_processing import Op115OneHots
# Read the full processed dataset (the same file path as ALL_PATH in the loading cell).
ALL_POLS = '../input/privbert-data/op115_processed.csv'
all_pols_df = pd.read_csv(ALL_POLS)
# Sorted distinct policy ids. NOTE(review): not referenced again in the visible cells.
poll_uids = sorted(all_pols_df.policy_uid.unique())
# NOTE(review): never filled or read in the visible cells.
labels_per_pol = []

# Op115OneHots lives in the project-local data_processing module; what go2,
# return_oh_names, len_onehots and new_onehots do internally is not visible here,
# so the calls are kept in exactly this order.
op115_all_c = Op115OneHots(all_pols_df)
op115_all_c.go2(majority = True)
uniques = op115_all_c.return_oh_names()
catsub_index, catval_index, subval_index, inds = op115_all_c.len_onehots()

# `cats` is what the following cells consume: it is converted to a 2-D array and
# its columns 0-9 are summed / searched for entries equal to 1.
catsub,catval,subval,cats,subs,vals,my_texts = op115_all_c.new_onehots()

In [None]:
# Support per category: the sum of each of the first 10 columns of `cats`.
catarr = np.array(cats)
labelsupport = [sum(catarr[:, col]) for col in range(10)]

print(labelsupport)

In [None]:
# For each of the first 10 category columns: print its support and remember the
# row positions whose one-hot entry equals 1.
cat_arry = np.array(cats)
indexes = []
for i in range(10):
    category_column = cat_arry[:, i]
    print(sum(category_column))
    # The raw np.where() tuple is stored because a later cell reads the positions
    # as indexes[i][0]. A dedicated name is used for it so the `inds` value that
    # len_onehots() returned in an earlier cell is no longer overwritten here.
    positive_rows = np.where(category_column == 1)
    indexes.append(positive_rows)
    

In [None]:
# Lookup returned by the project helper; a later cell reads element [0] of an
# entry as a policy id (presumably one entry per text segment — TODO confirm).
polsegs = op115_all_c.pol_seg()
# Container for the per-category lists of policy ids.
policies = list()

In [None]:
# One list per category column (0-9) holding polsegs[row][0] for every row in
# which that category is set.
# `policies` is rebuilt from scratch here: the original loop only appended to a
# list created in another cell, so re-running this cell duplicated all ten lists.
policies = [
    [polsegs[ind][0] for ind in indexes[i][0].astype(int)]
    for i in range(10)
]


In [None]:
# Category columns whose policies are split with their own ratio in the cells below.
# NOTE(review): presumably the categories with the lowest support — confirm against `labelsupport`.
specials = [0,2,6,8]

# Union of all policies that contain at least one of the special categories.
special_pol_set = set()
for special in specials:
    print(set(policies[special]))
    special_pol_set.update(policies[special])

# Sorted so the list order does not depend on set iteration order; otherwise a
# seeded train_test_split on this list could still differ between kernel runs.
special_pols = sorted(special_pol_set)
print(special_pols)

In [None]:
from sklearn.model_selection import train_test_split

# Preview of the split sizes for the special policies: 35% goes to the test set and
# 20% of the remainder to validation. The fold-generation cell draws its own splits.
# A fixed random_state makes this preview reproducible across kernel restarts.
train_val_inds, test_inds = train_test_split(special_pols, test_size = 0.35, random_state = 0)
train_inds, val_inds = train_test_split(train_val_inds, test_size = 0.2, random_state = 0)
print(len(train_inds),len(val_inds),len(test_inds))

In [None]:
import os
import errno
# Directory that receives the generated fold CSVs.
filename = "./op115_data/"
# exist_ok=True does not raise when the directory already exists, which replaces
# the manual exists() check and the EEXIST guard.
os.makedirs(os.path.dirname(filename), exist_ok=True)


kfold = 5
# Base seed for the splits; fold i uses SPLIT_SEED + i so every fold is a different
# but reproducible draw.
SPLIT_SEED = 42

all_poluids = all_pols_df['policy_uid'].unique()
# Set lookup instead of scanning the `special_pols` list for every policy id.
special_lookup = set(special_pols)
without_special = [poluid for poluid in all_poluids if poluid not in special_lookup]

# NOTE: every iteration draws a fresh random split of all policies, so the test
# sets of different folds may overlap (this is not a disjoint k-fold partition).
for i in range(kfold):
    fold_seed = SPLIT_SEED + i

    # Policies containing a special category: 35% test, then 20% of the rest for validation.
    train_val_inds, test_inds = train_test_split(special_pols, test_size = 0.35, random_state = fold_seed)
    train_inds, val_inds = train_test_split(train_val_inds, test_size = 0.2, random_state = fold_seed)

    # All remaining policies: 24% test, then 16% of the rest for validation.
    train_val_pols, test_pols = train_test_split(without_special, test_size = 0.24, random_state = fold_seed)
    train_pols, val_pols = train_test_split(train_val_pols, test_size = 0.16, random_state = fold_seed)

    # train_test_split returns plain lists for list input, so `+` concatenates.
    train_pols = train_pols + train_inds
    val_pols = val_pols + val_inds
    test_pols = test_pols + test_inds

    # Select the rows per split by policy id, so a policy never spans two splits.
    train_df = all_pols_df[all_pols_df['policy_uid'].isin(train_pols)]
    val_df = all_pols_df[all_pols_df['policy_uid'].isin(val_pols)]
    test_df = all_pols_df[all_pols_df['policy_uid'].isin(test_pols)]

    # The DataFrame index is written as the first CSV column (pandas default).
    train_df.to_csv('./op115_data/op115_train_k{}.csv'.format(i))
    val_df.to_csv('./op115_data/op115_val_k{}.csv'.format(i))
    test_df.to_csv('./op115_data/op115_test_k{}.csv'.format(i))

    print(len(train_pols),len(val_pols),len(test_pols))

# Picking optimal parameters
Below it is shown how the optimal parameters for each class were picked using the results of the grid search.

In [None]:
#load data
import pandas as pd
import numpy as np
# Grid-search results; the following cells read its 'class' and 'final_treshold'
# columns and its last 8 columns.
ADVICE_TRESHOLDS_PATH = 'data/advice_tresholds.csv'
vertical_stack = pd.read_csv(ADVICE_TRESHOLDS_PATH)

In [None]:
# get the category / advice models names
cat_names = list(vertical_stack['class'].unique())
# The last 8 columns are the advice-model columns.
# (fixed: the original referenced the undefined name `vertical_stackf`, which raised a NameError)
advice_models = vertical_stack.columns[-8:]
final_tresholds = list(vertical_stack['final_treshold'].unique()) # all best parameters must have the same final treshold
print(advice_models)


In [None]:
# For every (final_treshold, class) pair, keep each row that holds the maximum of
# at least one advice-model column. Rows are appended one at a time, so max_df
# gets the labels 0, 1, 2, ... in insertion order.
max_df = pd.DataFrame(columns = vertical_stack.columns)
for fn in final_tresholds:
    fn_df = vertical_stack[vertical_stack['final_treshold'] == fn]  # rows for this final treshold only
    for cn in cat_names:
        subdf = fn_df[fn_df['class'] == cn]  # rows for this class only
        # label of the best row for each advice-model column
        best_row_labels = subdf[advice_models].idxmax(axis = 0).tolist()
        # dict.fromkeys removes repeated labels while keeping first-seen order
        for ind in dict.fromkeys(best_row_labels):
            max_df.loc[len(max_df)] = list(subdf.loc[ind].values)
print(len(max_df))

In [None]:
def return_inds(adf):
    """
    Find the row positions at which the value in the 'class' column changes.

    The returned list always starts with 0 and ends with len(adf); every value in
    between is the position of the first row of a new run of equal class values.
    """
    labels = adf['class'].to_list()
    boundaries = [0]
    # compare each label with its successor; a mismatch means a new run starts at pos + 1
    for pos, (current, following) in enumerate(zip(labels, labels[1:])):
        if current != following:
            boundaries.append(pos + 1)
    boundaries.append(len(adf))
    return boundaries

In [None]:
# Initial output grids: 6 rows (one per final threshold) of 8 entries (one per
# advice layer combination). Every cell starts at 0 and every row is its own list.
scores_per_val = [[0] * 8 for _ in range(6)]

index_per_val = [[0] * 8 for _ in range(6)]

In [None]:
from data_processing import Op115OneHots

# Besides the score per class, the support of each class matters when judging how
# well a setting did. The first validation set is used as the example for that support.

VAL_PATH = 'data/valk0.csv'
val_df = pd.read_csv(VAL_PATH)

# Same project-helper call sequence as for the full dataset.
op115_v_c = Op115OneHots(val_df)
op115_v_c.go2(majority = True)
v_catsub,v_catval,v_subval,v_cats,v_subs,v_vals,v_my_texts = op115_v_c.new_onehots()

# Support per category: the sum of each of the first 10 columns.
npcats = np.array(v_cats)
n_classes = [sum(npcats[:, col]) for col in range(10)]
print(n_classes)

In [None]:
# Put the maximum scores into the respective output lists.
# For every final threshold k and advice column m, this collects one best
# (support-weighted) score per class into scores_per_val[k][m], and the max_df row
# label that achieved it into index_per_val[k][m].
# NOTE(review): the output lists are created in another cell and only appended to
# here, so running this cell twice without re-creating them doubles their content.
past_lenghts = 0  # number of max_df rows that belong to earlier final thresholds
max_indlist = []
klist = []
for k,fn in enumerate(final_tresholds):
    fn_subdf = max_df[max_df['final_treshold'] == fn] # rows of max_df for this final threshold
    fn_subdf = fn_subdf.reset_index()  # positions now run from 0 within this subset
    inds_classes = return_inds(fn_subdf)  # boundaries of the consecutive per-class row runs
    max_df_scores = fn_subdf[advice_models].values
    for i in range(len(inds_classes)-1): # one iteration per class run [s, n)
        s = inds_classes[i]
        n = inds_classes[i+1]
        # assumes the i-th class run matches the i-th entry of n_classes — TODO confirm the ordering
        for j in range(s,n): 
            max_vals = max_df_scores[j,:] * n_classes[i] # multiply the percentage scores by the support of the class
            max_df_scores[j,:] = max_vals # write the weighted scores back in place
 
        for m in range(8): # iterate over each advice set
            cat_vals = max_df_scores[s:n]
            max_val = max(cat_vals[:,m])

            # position inside the class run -> position inside fn_subdf -> position over all thresholds;
            # this equals the max_df row label only if max_df rows are grouped by final threshold
            # in the order of final_tresholds
            max_index = list(cat_vals[:,m]).index(max_val) + s + past_lenghts
            max_indlist.append(max_index)
            klist.append(k)
            # store the score per k; the cell still holds the placeholder 0 (falsy) until the first value arrives
            if scores_per_val[k][m]:
                scores_per_val[k][m].append(max_val)
                index_per_val[k][m].append(max_index)
            else:
                scores_per_val[k][m] = [max_val]
                index_per_val[k][m] = [max_index]
    past_lenghts += len(max_df_scores)


In [None]:
# Collapse the per-class scores of every (final threshold, advice set) cell into
# their mean. Rows without a matching final threshold keep their initial zeros.
max_per_val = [[0] * 8 for _ in range(6)]

for k in range(len(final_tresholds)):
    for i, adv in enumerate(scores_per_val[k]):
        max_per_val[k][i] = np.mean(adv)

In [None]:
#using the combined scores we can now pick the best one
maxmaxval = []  # best mean score per final threshold
maxmaxind = []  # advice-set index that achieved it
for scorelist in max_per_val:
    maxval = max(scorelist)
    maxind = scorelist.index(maxval)
    # Score of the first advice set, printed next to the best one for comparison.
    # It has its own name so the `catval` one-hots returned by new_onehots() in an
    # earlier cell are no longer overwritten by this loop.
    first_score = scorelist[0]
    maxmaxval.append(maxval)
    maxmaxind.append(maxind)
    print(maxind,maxval,first_score)

# best final threshold (row) and, within it, the best advice set (column)
bestval = max(maxmaxval)
bestind = maxmaxval.index(bestval)
besterind = maxmaxind[bestind]
besttup = (bestind,besterind)
# max_df row labels (one per class) recorded for the winning combination
bestvalues = index_per_val[bestind][besterind]

# a subdf that contains the best parameters
best_df = max_df.loc[bestvalues]

In [None]:
# Write the best parameters per class to a JSON file that can be used later on.
import json

best_param_dict = {}
for aclass in best_df['class'].values:
    classrow = best_df[best_df['class'] == aclass]
    # columns 2 up to (not including) 8 are taken as the parameter columns
    parnames = classrow.columns[2:8]
    tresholds = classrow[parnames].values[0]  # first matching row for this class
    # every parameter value is wrapped in a single-element list
    best_param_dict[aclass] = {
        parname: [treshold] for parname, treshold in zip(parnames, tresholds)
    }
    print(len(tresholds),tresholds)

with open('advice_parameters.json', 'w') as outfile:
    json.dump(best_param_dict, outfile)
