# Soft Voting

In [None]:
import numpy as np
import pandas as pd
import random
import re
import os
from scipy.special import softmax

# Seed Python's and NumPy's global random generators with one shared value.
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
# only influences processes launched after this point.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Class labels and the two-way mapping between a label and its position
# in LABELS.
LABELS = [1,2,4,5]
id2label = dict(enumerate(LABELS))
label2id = {label: idx for idx, label in id2label.items()}

## Load History

In [None]:
submission_dir = './data/samples/'

# Keep only the directories whose path looks like ".../<a>_epoch10_<b>",
# remembering the files each of them contains.
dir_pt = re.compile('^.*/(.*?)_epoch10_(.*?)$')
tree = {}
for parent, dirs, files in os.walk(submission_dir):
    if dir_pt.match(parent):
        tree[parent] = files

# Flatten to one "<directory>/<file>" path per file.
history = [
    '/'.join([parent, file])
    for parent, files in tree.items()
    for file in files
]

file_pt = re.compile('^.*/(.*?)_epoch10_(.*?)/epoch(.*?)$')


def alias(name):
    """Return the upper-cased first character of each group captured by file_pt."""
    groups = file_pt.findall(name)[0]
    return ''.join(str(part[0]).upper() for part in groups)


history = {alias(path): path for path in history}

# Files sitting directly in the roberta/ folder are keyed by 'R' plus the
# upper-cased first character of the file name.
roberta_dir = submission_dir+'roberta/'
roberta_history = list(os.walk(roberta_dir))
for file in roberta_history[0][2]:
    history['R'+file[0].upper()] = roberta_dir+file

# One row per alias, sorted by alias, with the file path as the only column.
history_df = (
    pd.DataFrame(history.items(), columns=['index','path'])
    .set_index('index')
    .sort_index()
)
history_df.index

Index(['KC1', 'KC2', 'KC3', 'KC4', 'KC5', 'KC6', 'KC7', 'KC8', 'KC9', 'KV1',
       'KV2', 'KV3', 'KV4', 'KV5', 'KV6', 'KV7', 'KV8', 'KV9', 'RC', 'RC1',
       'RC2', 'RC3', 'RC4', 'RC5', 'RC6', 'RC7', 'RC8', 'RC9', 'RV', 'RV1',
       'RV2', 'RV3', 'RV4', 'RV5', 'RV6', 'RV7', 'RV8', 'RV9'],
      dtype='object', name='index')

In [None]:
# Aliases (rows of history_df) whose tables take part in the vote.
voters = ['KV2','KC6','RV2','RC4']
election = '+'.join(voters)
proba_list = history_df.loc[voters]['path'].tolist()

# Add the voters' tables element-wise, then apply softmax to every row.
ensembled = sum(pd.read_csv(proba) for proba in proba_list)
ensembled = ensembled.apply(softmax, axis=1)
ensembled.head()

Unnamed: 0,1,2,4,5
0,0.024343,0.931997,0.022058,0.021602
1,0.877985,0.058975,0.031477,0.031563
2,0.021469,0.021614,0.026267,0.93065
3,0.916195,0.034313,0.024729,0.024763
4,0.915622,0.034671,0.024804,0.024903


## Submission

In [None]:
def _row_to_label(row):
    """Map the position of the largest value in a row to its class label."""
    return id2label[np.argmax(row)]


# Predicted label for every row of the ensembled table.
preds = ensembled.apply(_row_to_label, axis=1)

submission = pd.read_csv('./data/sample_submission.csv')
submission['target'] = preds
submission.head()

Unnamed: 0,id,target
0,0,2
1,1,1
2,2,5
3,3,1
4,4,1


In [None]:
# Write the soft-voting submission unless a CSV for the same voter
# combination is already present under the used/ sub-directory.
# NOTE(review): the guard looks in used/ while the file is written directly
# into submission_dir, so an existing file at the destination is silently
# overwritten - confirm this is intended.
filename = f'soft_voting_{election}.csv'
used_path = os.path.join(submission_dir, 'used', filename)
if os.path.exists(used_path):
    # FileExistsError is still an Exception, but names the failure precisely.
    raise FileExistsError(f'File already exists: {used_path}')
submission.to_csv(os.path.join(submission_dir, filename), index=False)

## Compare Hard and Soft

In [None]:
# Directory scanned for the hard-voting CSVs; the walk is materialised so
# its (dirpath, dirnames, filenames) entries can be indexed by position.
hard_dir = './data/samples/old/'
tree = [*os.walk(hard_dir)]

def alias(name: str) -> str:
    """Abbreviate a file name to the upper-cased initials of its words.

    The substrings '_with', '_test' and '.csv' are removed first; what is
    left is split on '_' and the first character of every word is kept.
    Empty words (doubled, leading or trailing underscores, or an empty name)
    are skipped rather than raising IndexError.
    """
    name = name.replace('_with','').replace('_test','').replace('.csv','')
    return ''.join(word[0].upper() for word in name.split('_') if word)

# CSVs sitting directly in hard_dir, keyed by their short alias; the macOS
# metadata file is ignored.
hard_history = {alias(file):hard_dir+file for file in tree[0][2] if file not in {'.DS_Store'}}

# The second entry of the walk is used as the directory of per-epoch files.
# NOTE(review): os.walk order follows the directory listing order - confirm
# hard_dir has exactly one sub-directory, otherwise tree[1] may vary.
koelectra_dir, checkpoints = tree[1][0]+'/', tree[1][2]
epoch_pt = re.compile(r'epoch(.*?)\.csv')
for file in checkpoints:
    found = epoch_pt.search(file)
    if found is None:
        # Stray non-checkpoint files (e.g. '.DS_Store') are skipped, matching
        # the filter applied to the top-level listing above.
        continue
    hard_history['KV'+found.group(1)] = koelectra_dir+file

# Replace every path by the 'target' column of its CSV. A new dict is built
# instead of rewriting the one being iterated over.
hard_history = pd.DataFrame({
    name: pd.read_csv(path)['target'].tolist()
    for name, path in hard_history.items()
})
hard_history = hard_history[sorted(hard_history.columns)]
hard_history.head()

Unnamed: 0,KAC,KAV,KCC,KCV,KCVC,KCVV,KV1,KV2,KV3,KV4,KV5,KV6,KV7,KV8,KV9,KVC,KVCE,KVV,KVVE,RL
0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1,1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1
2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
3,1,1,1,1,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1


In [None]:
# For every row of history_df, read its CSV and turn each table row into the
# label at the position of its largest value.
soft_history = {}
for idx, path in zip(history_df.index, history_df['path']):
    table = pd.read_csv(path)
    soft_history[idx] = table.apply(lambda row: id2label[np.argmax(row)], axis=1)

# Put the per-file predictions side by side, one column per alias.
cols = list(soft_history)
soft_history = pd.concat(list(soft_history.values()), axis=1)
soft_history.columns = cols
soft_history.head()

Unnamed: 0,KC1,KC2,KC3,KC4,KC5,KC6,KC7,KC8,KC9,KV1,...,RV,RV1,RV2,RV3,RV4,RV5,RV6,RV7,RV8,RV9
0,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
1,1,1,1,1,1,1,1,2,2,1,...,1,2,1,1,1,1,1,1,2,2
2,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
3,1,1,1,1,1,1,1,2,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,2,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
# Number of rows where the 'KV3' soft prediction equals the 'KVV' hard one.
(soft_history['KV3'] == hard_history['KVV']).sum()

21630