# Purpose

Collate the fine-tuning results for the 3 scenarios (10%, 20%, and random) into fewer files. This makes processing in the report code easier, because fewer files need to be downloaded via `gdown`.

In [9]:
%cd /root
! git clone https://github.com/myles-i/DLH_TransferLearning.git
%cd DLH_TransferLearning

/root
Cloning into 'DLH_TransferLearning'...
remote: Enumerating objects: 565, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 565 (delta 17), reused 24 (delta 9), pack-reused 527[K
Receiving objects: 100% (565/565), 4.32 MiB | 23.64 MiB/s, done.
Resolving deltas: 100% (326/326), done.
/root/DLH_TransferLearning


In [10]:
%%capture
! pip install -r requirements.txt

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Google Drive, and the folder under it that holds
# one sub-directory per fine-tuning job.
PROJECT_DIR = '/content/drive/MyDrive/DLHProject'
JOB_DIR = f'{PROJECT_DIR}/jobs'

In [6]:
import os

# Directory that receives the collated outputs of this notebook.
RESULT_DIR = JOB_DIR + '/results1d'
# Create it (and any missing parents) from Python rather than via `! mkdir -p`:
# a failing shell command does not raise, so the notebook would keep running
# until the first write fails. os.makedirs raises right here instead, and it
# is not subject to shell word-splitting if the path ever contains spaces.
os.makedirs(RESULT_DIR, exist_ok=True)

In [7]:
# Seeds of the fine-tuning runs: the multiples of 10 from 10 through 100.
SEEDS = [10 * k for k in range(1, 11)]

In [8]:
import pandas as pd

In [28]:
def get_histories(job_dir, weight_type, seeds):
    """Load and stack the per-seed training histories of one weight type.

    For every seed, reads
    ``{job_dir}/finetune__{weight_type}_seed{seed}/history.csv`` and tags its
    rows with two extra columns, ``weight_type`` and ``seed``.

    Args:
        job_dir: Directory containing the ``finetune__*`` job folders.
        weight_type: Label used in the job folder name (e.g. 'random', '10').
        seeds: Iterable of seeds; one job folder is read per seed.

    Returns:
        One DataFrame holding the rows of all histories, in the order the
        seeds were given, with a fresh 0..N-1 index.
    """
    # Stored as str so that numeric weight types (10, 20) and 'random' end up
    # with the same column type.
    weight_label = str(weight_type)
    per_seed = [
        pd.read_csv(
            f'{job_dir}/finetune__{weight_type}_seed{seed}/history.csv'
        ).assign(weight_type=weight_label, seed=seed)
        for seed in seeds
    ]
    # ignore_index=True discards the per-file indices in favour of a single
    # running index over the combined rows.
    return pd.concat(per_seed, axis=0, ignore_index=True)

Collect the fine-tune results:

In [22]:
random_history = get_histories(JOB_DIR, 'random', SEEDS)

In [24]:
random_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.593589,0.33449,0.974036,0.637002,0.869664,random,10
1,1,0.66208,0.359683,0.79273,0.672131,0.816346,random,10
2,2,0.700235,0.408642,0.708797,0.65808,0.78584,random,10
3,3,0.733855,0.561089,0.645951,0.735363,0.647018,random,10
4,4,0.74136,0.454773,0.637633,0.498829,0.931044,random,10


In [29]:
pct_10_history = get_histories(JOB_DIR, '10', SEEDS)

In [33]:
pct_10_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.674746,0.564819,0.748175,0.765808,0.576427,10,10
1,1,0.808131,0.736593,0.492132,0.805621,0.535082,10,10
2,2,0.835966,0.714168,0.428795,0.777518,0.595895,10,10
3,3,0.8405,0.7562,0.42024,0.822014,0.469741,10,10
4,4,0.858796,0.749825,0.386815,0.82904,0.454056,10,10


In [34]:
pct_20_history = get_histories(JOB_DIR, '20', SEEDS)

In [35]:
pct_20_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.686317,0.555963,0.753518,0.761124,0.568837,20,10
1,1,0.804848,0.777497,0.492315,0.82904,0.474271,20,10
2,2,0.837373,0.747841,0.436066,0.82904,0.449575,20,10
3,3,0.846755,0.773182,0.407595,0.843091,0.428911,20,10
4,4,0.859109,0.809443,0.383724,0.859485,0.416499,20,10


Now combine the 3 dataframes into one and save to `RESULT_DIR`.

In [36]:
history_all = pd.concat([random_history, pct_10_history, pct_20_history], axis=0, ignore_index=True)

In [37]:
# index=False makes it so that the index (bold left most numbers in the head()
# output above) is not saved to the csv file.
history_all.to_csv(RESULT_DIR + '/history_all.csv', index=False)

Double check that the saved csv file looks ok.

In [39]:
! head -n 5 $RESULT_DIR/history_all.csv

epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0.5935887694358826,0.3344904711264158,0.9740358591079712,0.6370023488998413,0.8696644306182861,random,10
1,0.6620797514915466,0.3596833304035212,0.7927300930023193,0.6721311211585999,0.8163464069366455,random,10
2,0.7002345323562622,0.408641975308642,0.7087966799736023,0.6580796241760254,0.7858404517173767,random,10
3,0.7338545918464661,0.5610890015378394,0.6459510922431946,0.7353630065917969,0.6470179557800293,random,10


In [40]:
! tail -n 5 $RESULT_DIR/history_all.csv

52,1.0,0.7187809190785078,0.0012898701243102,0.824355959892273,0.9872190952301024,20,100
53,1.0,0.7197346750765693,0.0012464015744626,0.8220140337944031,1.0171642303466797,20,100
54,1.0,0.7242433538684648,0.0012513172114267,0.8266978859901428,1.0243656635284424,20,100
55,1.0,0.7200047912784286,0.0010929591953754,0.8266978859901428,1.0100680589675903,20,100
56,1.0,0.7221077842281604,0.0009315672796219,0.824355959892273,1.042158126831055,20,100


Next we collate the test prediction csv files. The process is as follows:

1. Read in each of the `test_predictions.csv` files for all weight types and all experiments (seeds).
2. Process each csv file using the `read_predictions()` function written by the paper authors. The result is a dictionary.
3. Associate each dictionary with its weight type and seed.
4. Combine all the augmented dictionaries into one dictionary.
5. Use the `save_pkl` function to save the combined dict to `RESULT_DIR`.

In [54]:
from transplant.utils import read_predictions, load_pkl, save_pkl

In [42]:
def get_test_predictions(job_dir, weight_type, seeds):
    """Load the test predictions of every seed for one weight type.

    Each ``{job_dir}/finetune__{weight_type}_seed{seed}/test_predictions.csv``
    is parsed with ``read_predictions``.

    Args:
        job_dir: Directory containing the ``finetune__*`` job folders.
        weight_type: Label used in the job folder name (e.g. 'random', '10').
        seeds: Iterable of seeds; one job folder is read per seed.

    Returns:
        A dict mapping ``'{weight_type}_{seed}'`` to whatever
        ``read_predictions`` returned for that run.
    """
    # Step 3: the key itself records which weight type and seed the
    # predictions belong to.
    return {
        f'{weight_type}_{seed}': read_predictions(
            f'{job_dir}/finetune__{weight_type}_seed{seed}/test_predictions.csv'
        )
        for seed in seeds
    }

In [43]:
random_predictions = get_test_predictions(JOB_DIR, 'random', SEEDS)

In [44]:
random_predictions.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100'])

In [45]:
random_predictions['random_10']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[4.26490800e-04, 9.31569600e-01, 6.79090100e-02, 9.49260200e-05],
        [1.48110080e-04, 9.26177800e-01, 6.11333600e-02, 1.25408440e-02],
        [6.82196240e-06, 8.88694000e-08, 9.99976500e-01, 1.66221600e-05],
        ...,
        [2.32006870e-03, 9.48752940e-01, 4.76910100e-02, 1.23595920e-03],
        [1.38626190e-02, 8.19023250e-01, 5.98533530e-02, 1.07260786e-01],
        [7.88477400e-05, 9.17878700e-01, 8.19820800e-02, 6.03894900e-05]]),
 'classes': ['A', 'N', 'O', '~']}

In [46]:
pct_10_predictions = get_test_predictions(JOB_DIR, '10', SEEDS)

In [47]:
pct_10_predictions.keys()

dict_keys(['10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100'])

In [48]:
pct_20_predictions = get_test_predictions(JOB_DIR, '20', SEEDS)

In [49]:
pct_20_predictions.keys()

dict_keys(['20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [51]:
# Step 4
# Merge the three per-scenario dicts with the dict union operator `|`
# (PEP 584, available from Python 3.9).
predictions_all = random_predictions | pct_10_predictions | pct_20_predictions
# `|` silently keeps the right-most value when a key repeats, so make sure no
# entry was overwritten during the merge.
assert len(predictions_all) == (
    len(random_predictions) + len(pct_10_predictions) + len(pct_20_predictions)
), 'duplicate keys were overwritten while merging the prediction dicts'

In [52]:
predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [55]:
# Step 5
save_pkl(RESULT_DIR + '/predictions_all.pkl', **predictions_all)

In [56]:
! ls -lh $RESULT_DIR

total 1.8M
-rw------- 1 root root 189K Apr 26 07:04 history_all.csv
-rw------- 1 root root 1.6M Apr 26 07:20 predictions_all.pkl


In [57]:
reread_predictions_all = load_pkl(RESULT_DIR + '/predictions_all.pkl')

In [58]:
reread_predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [59]:
reread_predictions_all['20_70']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[1.1279869e-02, 8.2210296e-01, 1.6595362e-01, 6.6356070e-04],
        [4.0929340e-03, 8.7436250e-01, 1.1346357e-01, 8.0809700e-03],
        [9.8937750e-04, 3.8871703e-10, 9.9897960e-01, 3.0928353e-05],
        ...,
        [5.2196320e-03, 9.2748640e-01, 5.4160893e-02, 1.3133076e-02],
        [1.2717006e-01, 5.1796836e-01, 1.4334172e-01, 2.1151988e-01],
        [6.1627590e-03, 8.4792274e-01, 1.4538486e-01, 5.2960410e-04]]),
 'classes': ['A', 'N', 'O', '~']}