# Purpose

Collate the fine-tuning results for the 3 scenarios (10%, 20%, and random) into a small number of files. This makes processing in the report code easier, because fewer files need to be downloaded via `gdown`.

In [1]:
%cd /root
! git clone https://github.com/myles-i/DLH_TransferLearning.git
%cd DLH_TransferLearning

/root
Cloning into 'DLH_TransferLearning'...
remote: Enumerating objects: 688, done.[K
remote: Counting objects: 100% (161/161), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 688 (delta 101), reused 95 (delta 45), pack-reused 527[K
Receiving objects: 100% (688/688), 5.03 MiB | 10.94 MiB/s, done.
Resolving deltas: 100% (410/410), done.
/root/DLH_TransferLearning


In [2]:
%%capture
! pip install -r requirements.txt

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Project folder on the mounted Google Drive, and the jobs folder inside it.
PROJECT_DIR = '/content/drive/MyDrive/DLHProject'
JOB_DIR = f'{PROJECT_DIR}/jobs'

In [19]:
RESULT_DIR = PROJECT_DIR + '/results2d/experimental_nodb'
! mkdir -p $RESULT_DIR

In [6]:
# Seeds of the fine-tuning runs to collate: the ten multiples of 10 from 10 to 100.
SEEDS = [10 * k for k in range(1, 11)]

In [7]:
import pandas as pd

In [9]:
def get_histories(job_dir, weight_type, seeds):
    """Load the per-seed fine-tuning histories of one weight type into one frame.

    For every seed the file
    ``{job_dir}/finetune__{weight_type}_seed{seed}/history.csv`` is read and
    its rows are tagged with two extra columns, ``weight_type`` and ``seed``,
    so that the origin of each row is still known after stacking.

    Args:
        job_dir: Directory that contains the ``finetune__*`` job folders.
        weight_type: Label used in the job folder names (e.g. ``'random'``,
            ``'10'`` or ``20``).
        seeds: Iterable of seeds; one job folder is read per seed.

    Returns:
        A DataFrame holding the rows of all histories in seed order, with a
        fresh index running from 0 to N-1 (N = total number of rows).
    """
    # Store the label as str so that numeric weight types (10, 20) produce
    # the same column type as the 'random' label.
    label = str(weight_type)
    frames = [
        pd.read_csv(f'{job_dir}/finetune__{weight_type}_seed{seed}/history.csv')
        .assign(weight_type=label, seed=seed)
        for seed in seeds
    ]
    # ignore_index=True discards the per-file indices in favour of 0..N-1.
    return pd.concat(frames, axis=0, ignore_index=True)

Collect the fine-tune results:

In [10]:
random_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', 'random', SEEDS)

In [11]:
random_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.644879,0.186029,0.895165,0.592506,20.432926,random,10
1,1,0.74136,0.249469,0.626114,0.629977,4.835564,random,10
2,2,0.775919,0.464849,0.549603,0.662763,0.990988,random,10
3,3,0.795778,0.323216,0.511288,0.313817,3.944339,random,10
4,4,0.801564,0.326531,0.504377,0.325527,3.477914,random,10


In [12]:
pct_10_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '10', SEEDS)

In [13]:
pct_10_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.603284,0.132574,1.099666,0.227166,607.648743,10,10
1,1,0.670211,0.241949,0.788602,0.351288,2.866085,10,10
2,2,0.707115,0.546582,0.709575,0.723653,0.734261,10,10
3,3,0.748084,0.375594,0.631063,0.667447,1.029231,10,10
4,4,0.756529,0.376595,0.591005,0.716628,0.687509,10,10


In [14]:
pct_20_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '20', SEEDS)

In [15]:
pct_20_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.579984,0.015873,1.05428,0.032787,2658.274902,20,10
1,1,0.682877,0.129531,0.788388,0.215457,4.633831,20,10
2,2,0.731353,0.34365,0.646866,0.59719,1.163379,20,10
3,3,0.761063,0.475713,0.588947,0.644028,0.966104,20,10
4,4,0.767005,0.535799,0.579496,0.723653,0.65267,20,10


Now combine the 3 dataframes into one and save to `RESULT_DIR`.

In [16]:
history_all = pd.concat([random_history, pct_10_history, pct_20_history], axis=0, ignore_index=True)

In [20]:
# index=False makes it so that the index (bold left most numbers in the head()
# output above) is not saved to the csv file.
history_all.to_csv(RESULT_DIR + '/history_all.csv', index=False)

Double check that the saved csv file looks ok.

In [21]:
! head -n 5 {RESULT_DIR + '/history_all.csv'}

epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0.6448788046836853,0.1860294117647059,0.8951646685600281,0.5925058722496033,20.43292617797852,random,10
1,0.7413604259490967,0.2494694397283531,0.6261142492294312,0.6299765706062317,4.835563659667969,random,10
2,0.7759186625480652,0.464848714998082,0.5496029257774353,0.6627634763717651,0.9909878969192504,random,10
3,0.7957779765129089,0.3232162728984762,0.5112877488136292,0.3138173222541809,3.94433856010437,random,10


In [22]:
! tail -n 5 $RESULT_DIR/history_all.csv

60,0.984831929206848,0.6920405310821314,0.0453895330429077,0.8149883151054382,1.7034633159637451,20,100
61,0.994526982307434,0.7076283821585125,0.0188831631094217,0.8266978859901428,1.218589425086975,20,100
62,0.9971852898597716,0.6977826477634899,0.0091564198955893,0.8173301815986633,1.2450664043426514,20,100
63,0.9989054203033448,0.7252598614072495,0.0038668473716825,0.824355959892273,1.3784340620040894,20,100
64,0.9996872544288636,0.707882002088577,0.0024341840762645,0.8196721076965332,1.3522300720214844,20,100


Next we collate the test prediction csv file. The process is as follows:

1. Read in each of the `test_predictions.csv` files for all weight types and all experiments (seeds).
2. Process each csv file using the `read_predictions()` function written by the paper authors. The result is a dictionary.
3. Associate each dictionary with its weight type and seed by storing it under the key `{weight_type}_{seed}`.
4. Combine all the keyed dictionaries into one dictionary.
5. Use the `save_pkl` function to save the combined dict to `RESULT_DIR`.

In [23]:
from transplant.utils import read_predictions, load_pkl, save_pkl

In [24]:
def get_test_predictions(job_dir, weight_type, seeds):
    """Collect the test predictions of every seed for one weight type.

    For every seed the file
    ``{job_dir}/finetune__{weight_type}_seed{seed}/test_predictions.csv`` is
    passed to ``read_predictions`` and the result is stored under the key
    ``'{weight_type}_{seed}'``, which records both the weight type and the
    seed (step 3 of the process described above).

    Args:
        job_dir: Directory that contains the ``finetune__*`` job folders.
        weight_type: Label used in the job folder names and in the keys.
        seeds: Iterable of seeds; one predictions file is read per seed.

    Returns:
        A dict mapping ``'{weight_type}_{seed}'`` to whatever
        ``read_predictions`` returned for that seed, in seed order.
    """
    return {
        f'{weight_type}_{seed}': read_predictions(
            f'{job_dir}/finetune__{weight_type}_seed{seed}/test_predictions.csv'
        )
        for seed in seeds
    }

In [25]:
random_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', 'random', SEEDS)

In [26]:
random_predictions.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100'])

In [27]:
random_predictions['random_10']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[3.3007294e-04, 8.3876100e-01, 1.6084108e-01, 6.7899084e-05],
        [9.3049890e-03, 8.3999900e-01, 1.4415039e-01, 6.5456224e-03],
        [1.1324053e-01, 2.2932235e-03, 8.8445556e-01, 1.0733121e-05],
        ...,
        [5.8705690e-04, 9.5388900e-01, 4.4400442e-02, 1.1234076e-03],
        [8.8114420e-02, 3.9896038e-01, 3.4956038e-01, 1.6336483e-01],
        [6.6215270e-04, 8.6971635e-01, 1.2960379e-01, 1.7713932e-05]]),
 'classes': ['A', 'N', 'O', '~']}

In [29]:
pct_10_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '10', SEEDS)

In [30]:
pct_10_predictions.keys()

dict_keys(['10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100'])

In [31]:
pct_20_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '20', SEEDS)

In [32]:
pct_20_predictions.keys()

dict_keys(['20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [33]:
# Step 4
# Utilizing the dict union operator `|`, available in Python 3.9+ (PEP 584).
predictions_all = random_predictions | pct_10_predictions | pct_20_predictions

In [34]:
predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [36]:
# Step 5
save_pkl(RESULT_DIR + '/predictions_all.pkl', **predictions_all)

In [37]:
! ls -lh $RESULT_DIR

total 1.7M
-rw------- 1 root root 193K May  2 05:40 history_all.csv
-rw------- 1 root root 1.5M May  2 05:42 predictions_all.pkl


In [38]:
reread_predictions_all = load_pkl(RESULT_DIR + '/predictions_all.pkl')

In [39]:
reread_predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [40]:
reread_predictions_all['20_70']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[4.0037630e-06, 9.9930620e-01, 6.8971846e-04, 1.2857017e-08],
        [1.2590403e-04, 9.9524610e-01, 4.6229820e-03, 5.0137000e-06],
        [2.1163608e-01, 3.1606443e-04, 7.8804654e-01, 1.2704440e-06],
        ...,
        [8.7108240e-04, 8.7545097e-01, 3.4736690e-03, 1.2020423e-01],
        [9.2403874e-02, 6.4856520e-01, 1.1199919e-02, 2.4783105e-01],
        [6.5124120e-06, 9.9503460e-01, 4.9588800e-03, 7.7330600e-10]]),
 'classes': ['A', 'N', 'O', '~']}