# Purpose

Collate the fine-tuning results for the 5 scenarios (1%, 10%, 20%, 88%, and random) into fewer files. This simplifies processing in the report code, since fewer files need to be downloaded via `gdown`.

In [1]:
%cd /root
! git clone https://github.com/myles-i/DLH_TransferLearning.git
%cd DLH_TransferLearning

/root
Cloning into 'DLH_TransferLearning'...
remote: Enumerating objects: 950, done.[K
remote: Counting objects: 100% (164/164), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 950 (delta 102), reused 98 (delta 47), pack-reused 786[K
Receiving objects: 100% (950/950), 7.38 MiB | 18.65 MiB/s, done.
Resolving deltas: 100% (598/598), done.
/root/DLH_TransferLearning


In [2]:
%%capture
! pip install -r requirements.txt

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Project root on the mounted Google Drive, and the sub-folder under it from
# which the per-job outputs are read.
PROJECT_DIR = '/content/drive/MyDrive/DLHProject'
JOB_DIR = f'{PROJECT_DIR}/jobs'

In [5]:
import os

# Folder (under the project root) that receives the collated outputs written
# by this notebook.
RESULT_DIR = PROJECT_DIR + '/results2d_all'
# Create the folder from Python instead of `! mkdir -p {RESULT_DIR}`: the shell
# form interpolates the path unquoted, so it breaks as soon as the path
# contains a space or a shell metacharacter. exist_ok=True keeps the cell
# safe to re-run, like `mkdir -p`.
os.makedirs(RESULT_DIR, exist_ok=True)

In [6]:
# Seeds of the fine-tuning runs to collate: 10, 20, ..., 100
SEEDS = [10 * step for step in range(1, 11)]

In [7]:
import pandas as pd

In [8]:
def get_histories(job_dir, weight_type, seeds):
    """Load and stack the per-seed training histories of one weight type.

    For every seed the file
    ``{job_dir}/finetune__{weight_type}_seed{seed}/history.csv`` is read and
    its rows are tagged with the run they came from.

    Args:
        job_dir: Directory that contains the ``finetune__*`` job folders.
        weight_type: Weight-type label used in the folder name (e.g.
            ``'random'``, ``'10'`` or ``10``).
        seeds: Iterable of seeds, one job folder per seed.

    Returns:
        A single ``pandas.DataFrame`` holding the rows of all histories plus
        two extra columns, ``weight_type`` (always ``str``) and ``seed``. The
        index runs from 0 to N-1, where N is the total number of rows.
    """
    # Store the label as str so that numeric labels (1, 10, 20) are
    # type-compatible with the 'random' label.
    label = str(weight_type)
    tagged_runs = [
        pd.read_csv(
            f'{job_dir}/finetune__{weight_type}_seed{seed}/history.csv'
        ).assign(weight_type=label, seed=seed)
        for seed in seeds
    ]
    # ignore_index=True discards the per-file indices in favour of one
    # continuous 0..N-1 index.
    return pd.concat(tagged_runs, axis=0, ignore_index=True)

Collect the fine-tune results:

In [9]:
random_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', 'random', SEEDS)

In [10]:
random_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.638468,0.015873,0.931254,0.032787,64.22818,random,10
1,1,0.741673,0.22831,0.624243,0.484778,4.273595,random,10
2,2,0.777013,0.178887,0.550991,0.327869,4.557386,random,10
3,3,0.801251,0.290557,0.507306,0.461358,2.471529,random,10
4,4,0.825801,0.454459,0.456386,0.704918,0.977387,random,10


In [11]:
pct_10_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '10', SEEDS)

In [12]:
pct_10_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.711181,0.12518,0.714168,0.290398,9.279119,10,10
1,1,0.831587,0.111818,0.447856,0.288056,8.730294,10,10
2,2,0.857545,0.111818,0.398115,0.288056,7.310343,10,10
3,3,0.870367,0.111818,0.365114,0.288056,5.366314,10,10
4,4,0.875997,0.154856,0.335455,0.311475,2.023373,10,10


In [13]:
pct_20_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '20', SEEDS)

In [14]:
pct_20_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.696951,0.11399,0.766771,0.290398,2.20856,20,10
1,1,0.829242,0.111818,0.465779,0.288056,4.926505,20,10
2,2,0.851134,0.111818,0.402379,0.288056,3.732189,20,10
3,3,0.865833,0.286332,0.356012,0.5363,1.777663,20,10
4,4,0.880219,0.384243,0.328823,0.75644,0.801954,20,10


In [15]:
pct_1_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '1', SEEDS)

In [16]:
pct_1_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.726349,0.094179,0.689278,0.126464,5.313791,1,10
1,1,0.813135,0.1329,0.4857,0.311475,2.609473,1,10
2,2,0.832682,0.684645,0.438713,0.747073,0.674383,1,10
3,3,0.843159,0.372751,0.41196,0.733021,0.873403,1,10
4,4,0.849726,0.655611,0.397554,0.747073,0.98332,1,10


In [17]:
pct_88_history = get_histories(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '88', SEEDS)

Now combine the 5 dataframes (random, 1, 10, 20, and 88) into one and save it to `RESULT_DIR`.

In [18]:
pct_88_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.666458,0.224386,0.903484,0.283372,1.635203,88,10
1,1,0.816732,0.295894,0.501984,0.337237,3.381644,88,10
2,2,0.840813,0.306409,0.423221,0.344262,2.623443,88,10
3,3,0.863174,0.314567,0.376545,0.358314,1.695445,88,10
4,4,0.878499,0.520976,0.33737,0.64637,0.776398,88,10


In [19]:
# Stack the per-weight-type histories row-wise into one frame;
# ignore_index=True gives the combined frame a fresh 0..N-1 index.
history_all = pd.concat([random_history, pct_1_history, pct_10_history, pct_20_history, pct_88_history], axis=0, ignore_index=True)

In [20]:
RESULT_DIR

'/content/drive/MyDrive/DLHProject/results2d_all'

In [22]:
# index=False makes it so that the index (bold left most numbers in the head()
# output above) is not saved to the csv file.
history_all.to_csv(RESULT_DIR + '/history_all.csv', index=False)

Double check that the saved csv file looks ok.

In [23]:
! head -n 5 {RESULT_DIR + '/history_all.csv'}

epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0.63846755027771,0.0158730158730158,0.9312543272972108,0.0327868834137916,64.22817993164062,random,10
1,0.7416731715202332,0.2283103592314118,0.6242430806159973,0.4847775101661682,4.273594856262207,random,10
2,0.7770133018493652,0.1788872891814068,0.550990879535675,0.3278688490390777,4.55738639831543,random,10
3,0.8012509942054749,0.2905569504192282,0.5073061585426331,0.4613583087921142,2.471529245376587,random,10


In [24]:
! tail -n 5 {RESULT_DIR + '/history_all.csv'}

51,1.0,0.7863467758527471,0.0001995086349779,0.8501170873641968,1.2458524703979492,88,100
52,1.0,0.7966977826367281,0.0002650656679179,0.8524590134620667,1.2378417253494265,88,100
53,1.0,0.781375775079052,0.0001604934368515,0.8477751612663269,1.2773743867874146,88,100
54,1.0,0.7926809122869416,0.0001825633662519,0.8548009395599365,1.2644087076187134,88,100
55,1.0,0.7926809122869416,0.000159089002409,0.8548009395599365,1.2752809524536133,88,100


Next we collate the test prediction csv files. The process is as follows:

1. Read in each of the `test_predictions.csv` files for all weight types and all experiments (seeds).
2. Process each csv file using the `read_predictions()` function provided by the paper authors. The result is a dictionary.
3. Associate with each dictionary the weight type and seed.
4. Combine all the augmented dictionaries into one dictionary.
5. Use the `save_pkl` function to save the combined dict to `RESULT_DIR`.

In [25]:
from transplant.utils import read_predictions, load_pkl, save_pkl

In [26]:
def get_test_predictions(job_dir, weight_type, seeds):
    """Collect the parsed test predictions of every seed for one weight type.

    Args:
        job_dir: Directory that contains the ``finetune__*`` job folders.
        weight_type: Weight-type label used in the folder name.
        seeds: Iterable of seeds, one job folder per seed.

    Returns:
        A dict mapping ``'{weight_type}_{seed}'`` to the value returned by
        ``read_predictions`` for that run's ``test_predictions.csv``.
    """
    def _load(seed):
        # Parse the predictions file of a single run.
        csv_path = f'{job_dir}/finetune__{weight_type}_seed{seed}/test_predictions.csv'
        return read_predictions(csv_path)

    # Step 3: the key records which weight type and seed a result belongs to.
    return {f'{weight_type}_{seed}': _load(seed) for seed in seeds}

In [27]:
random_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', 'random', SEEDS)

In [28]:
random_predictions.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100'])

In [29]:
random_predictions['random_10']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[4.0233585e-06, 9.9611060e-01, 3.8801680e-03, 5.2357520e-06],
        [1.0252142e-03, 9.8240760e-01, 1.5601903e-02, 9.6522056e-04],
        [2.0659697e-01, 5.2052917e-07, 7.9340130e-01, 1.2273739e-06],
        ...,
        [8.4059720e-06, 9.9840490e-01, 1.4306230e-03, 1.5604241e-04],
        [1.5855663e-02, 5.3430410e-02, 3.8218427e-02, 8.9249550e-01],
        [6.1154740e-04, 5.8477910e-01, 4.1460747e-01, 1.8813652e-06]]),
 'classes': ['A', 'N', 'O', '~']}

In [30]:
pct_10_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '10', SEEDS)

In [31]:
pct_10_predictions.keys()

dict_keys(['10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100'])

In [32]:
pct_20_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '20', SEEDS)

In [33]:
pct_20_predictions.keys()

dict_keys(['20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [34]:
pct_1_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '1', SEEDS)

In [35]:
pct_1_predictions.keys()

dict_keys(['1_10', '1_20', '1_30', '1_40', '1_50', '1_60', '1_70', '1_80', '1_90', '1_100'])

In [39]:
pct_88_predictions = get_test_predictions(JOB_DIR + '/spectrogram/finetuning/experimental_nodb', '88', SEEDS)

In [40]:
pct_88_predictions.keys()

dict_keys(['88_10', '88_20', '88_30', '88_40', '88_50', '88_60', '88_70', '88_80', '88_90', '88_100'])

In [41]:
# Step 4
# Merge the per-weight-type dicts with the dict union operator `|`
# (PEP 584, available in Python 3.9+). The keys have the form
# '<weight_type>_<seed>', so no key occurs in more than one operand and
# nothing is overwritten by the merge.
predictions_all = random_predictions | pct_1_predictions | pct_10_predictions | pct_20_predictions | pct_88_predictions

In [42]:
predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '1_10', '1_20', '1_30', '1_40', '1_50', '1_60', '1_70', '1_80', '1_90', '1_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100', '88_10', '88_20', '88_30', '88_40', '88_50', '88_60', '88_70', '88_80', '88_90', '88_100'])

In [43]:
RESULT_DIR

'/content/drive/MyDrive/DLHProject/results2d_all'

In [44]:
# Step 5
# The combined dict is unpacked with **, so every '<weight_type>_<seed>' entry
# is handed to save_pkl as a keyword argument.
save_pkl(RESULT_DIR + '/predictions_all.pkl', **predictions_all)

In [45]:
! ls -lh {RESULT_DIR}

total 2.7M
-rw------- 1 root root 293K May  6 10:31 history_all.csv
-rw------- 1 root root 2.5M May  6 10:33 predictions_all.pkl


In [46]:
reread_predictions_all = load_pkl(RESULT_DIR + '/predictions_all.pkl')

In [47]:
reread_predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '1_10', '1_20', '1_30', '1_40', '1_50', '1_60', '1_70', '1_80', '1_90', '1_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100', '88_10', '88_20', '88_30', '88_40', '88_50', '88_60', '88_70', '88_80', '88_90', '88_100'])

In [None]:
reread_predictions_all['1_70']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[4.3508837e-08, 9.9999270e-01, 7.3297590e-06, 6.7238040e-10],
        [3.2719791e-09, 9.9999390e-01, 6.0244850e-06, 1.6408523e-08],
        [4.1461378e-04, 1.0456796e-08, 9.9958533e-01, 4.2921297e-10],
        ...,
        [1.4071182e-10, 1.0000000e+00, 3.3588133e-08, 2.1974580e-09],
        [1.6410399e-03, 8.9535210e-01, 1.0164365e-01, 1.3631686e-03],
        [2.8411785e-07, 9.9992620e-01, 7.3564240e-05, 1.2701198e-08]]),
 'classes': ['A', 'N', 'O', '~']}