# Purpose

Collate the fine-tuning results for the 5 scenarios (1%, 10%, 20%, 100%, and random) into fewer files. This makes processing in the report code easier, because fewer files need to be downloaded via `gdown`.

In [None]:
# Clone into /root so the repository ends up at /root/DLH_TransferLearning.
%cd /root
! git clone https://github.com/myles-i/DLH_TransferLearning.git
# Work from the repository root; the next cell installs its requirements.txt.
%cd DLH_TransferLearning

/root
Cloning into 'DLH_TransferLearning'...
remote: Enumerating objects: 830, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 830 (delta 24), reused 30 (delta 14), pack-reused 786[K
Receiving objects: 100% (830/830), 6.39 MiB | 8.23 MiB/s, done.
Resolving deltas: 100% (520/520), done.
/root/DLH_TransferLearning


In [None]:
%%capture
! pip install -r requirements.txt

In [None]:
from google.colab import drive
# Mount Google Drive; the project directories used below live under
# /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive, and the folder under it that holds one
# sub-directory per fine-tuning job.
PROJECT_DIR = '/content/drive/MyDrive/DLHProject'
JOB_DIR = f'{PROJECT_DIR}/jobs'

In [None]:
# Output folder for the collated files. Per the original note, this result set
# has the additional 1% and 100% pre-train weights.
RESULT_DIR = PROJECT_DIR + '/results1d_all'
# -p: create the folder if needed and do not fail when it already exists.
! mkdir -p {RESULT_DIR}

In [None]:
# One fine-tuning experiment per seed: 10, 20, ..., 100.
SEEDS = [10 * k for k in range(1, 11)]

In [None]:
import pandas as pd

In [None]:
def get_histories(job_dir, weight_type, seeds):
    """Load and stack the fine-tuning history of every seed for one weight type.

    Reads `{job_dir}/finetune__{weight_type}_seed{seed}/history.csv` for each
    seed, tags its rows with the weight type and seed they came from, and
    returns all rows in a single dataframe.

    Args:
        job_dir: Directory containing the per-experiment job folders.
        weight_type: Pre-training weight label (e.g. 'random' or '10'). It is
            stored as a string in the `weight_type` column.
        seeds: Iterable of seeds, one per experiment.

    Returns:
        A dataframe holding the csv columns plus `weight_type` and `seed`,
        indexed 0..N-1 where N is the total number of rows.
    """
    # Always store the label as str so that numeric labels (1, 10, 20, 100)
    # and 'random' end up with a compatible type in the same column.
    label = str(weight_type)
    frames = [
        pd.read_csv(f'{job_dir}/finetune__{weight_type}_seed{seed}/history.csv')
          .assign(weight_type=label, seed=seed)
        for seed in seeds
    ]
    # ignore_index=True replaces the per-file indices with one running index.
    return pd.concat(frames, axis=0, ignore_index=True)

Collect the fine-tune results:

In [None]:
random_history = get_histories(JOB_DIR, 'random', SEEDS)

In [None]:
random_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.593589,0.33449,0.974036,0.637002,0.869664,random,10
1,1,0.66208,0.359683,0.79273,0.672131,0.816346,random,10
2,2,0.700235,0.408642,0.708797,0.65808,0.78584,random,10
3,3,0.733855,0.561089,0.645951,0.735363,0.647018,random,10
4,4,0.74136,0.454773,0.637633,0.498829,0.931044,random,10


In [None]:
pct_10_history = get_histories(JOB_DIR, '10', SEEDS)

In [None]:
pct_10_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.674746,0.564819,0.748175,0.765808,0.576427,10,10
1,1,0.808131,0.736593,0.492132,0.805621,0.535082,10,10
2,2,0.835966,0.714168,0.428795,0.777518,0.595895,10,10
3,3,0.8405,0.7562,0.42024,0.822014,0.469741,10,10
4,4,0.858796,0.749825,0.386815,0.82904,0.454056,10,10


In [None]:
pct_20_history = get_histories(JOB_DIR, '20', SEEDS)

In [None]:
pct_20_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.686317,0.555963,0.753518,0.761124,0.568837,20,10
1,1,0.804848,0.777497,0.492315,0.82904,0.474271,20,10
2,2,0.837373,0.747841,0.436066,0.82904,0.449575,20,10
3,3,0.846755,0.773182,0.407595,0.843091,0.428911,20,10
4,4,0.859109,0.809443,0.383724,0.859485,0.416499,20,10


In [None]:
pct_1_history = get_histories(JOB_DIR, '1', SEEDS)

In [None]:
pct_1_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.713839,0.65454,0.703285,0.75644,0.622668,1,10
1,1,0.783581,0.579529,0.537493,0.789227,0.562751,1,10
2,2,0.80688,0.748903,0.493542,0.810304,0.52196,1,10
3,3,0.822518,0.66841,0.477193,0.76815,0.611207,1,10
4,4,0.835966,0.715701,0.432852,0.782201,0.605715,1,10


In [None]:
pct_100_history = get_histories(JOB_DIR, '100', SEEDS)

In [None]:
pct_100_history.head()

Unnamed: 0,epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0,0.64222,0.46764,0.92093,0.730679,0.644483,100,10
1,1,0.782955,0.588217,0.547784,0.782201,0.522857,100,10
2,2,0.82283,0.703805,0.45489,0.814988,0.481579,100,10
3,3,0.8405,0.707483,0.417069,0.819672,0.439723,100,10
4,4,0.854574,0.72285,0.38773,0.831382,0.415285,100,10


Now combine the 5 dataframes into one and save to `RESULT_DIR`.

In [None]:
# Stack the per-scenario histories (random first, then 1%, 10%, 20%, 100%)
# into a single frame with one running 0..N-1 index.
history_all = pd.concat(
    (random_history, pct_1_history, pct_10_history, pct_20_history, pct_100_history),
    ignore_index=True,
)

In [None]:
RESULT_DIR

'/content/drive/MyDrive/DLHProject/results1d_all'

In [None]:
# Leave the index out of the csv file: it is only the running row number
# (the bold left-most column in the head() output above).
history_all.to_csv(f'{RESULT_DIR}/history_all.csv', index=False)

Double check that the saved csv file looks ok.

In [None]:
! head -n 5 {RESULT_DIR + '/history_all.csv'}

epoch,acc,f1,loss,val_acc,val_loss,weight_type,seed
0,0.5935887694358826,0.3344904711264158,0.9740358591079712,0.6370023488998413,0.8696644306182861,random,10
1,0.6620797514915466,0.3596833304035212,0.7927300930023193,0.6721311211585999,0.8163464069366455,random,10
2,0.7002345323562622,0.408641975308642,0.7087966799736023,0.6580796241760254,0.7858404517173767,random,10
3,0.7338545918464661,0.5610890015378394,0.6459510922431946,0.7353630065917969,0.6470179557800293,random,10


In [None]:
! tail -n 5 {RESULT_DIR + '/history_all.csv'}

54,0.9884284734725952,0.7399275158339197,0.0380599647760391,0.8220140337944031,0.7693212628364563,100,100
55,0.99155592918396,0.7230307218626393,0.0324767269194126,0.8337236642837524,0.9284173846244812,100,100
56,0.9973416924476624,0.7335782918745186,0.0135923894122242,0.8290398120880127,0.9002867341041565,100,100
57,0.9978107810020448,0.7549776649827187,0.0117767183110117,0.8524590134620667,0.9176496267318726,100,100
58,0.9967162013053894,0.7213050821888668,0.0154281640425324,0.8266978859901428,1.2365210056304932,100,100


Next we collate the test prediction csv files. The process is as follows:

1. Read in each of the `test_predictions.csv` files for all weight types and all experiments (seeds).
2. Process each csv file using the `read_predictions()` function written by the paper authors. The result is a dictionary.
3. Associate with each dictionary the weight type and seed.
4. Combine all the augmented dictionaries into one dictionary.
5. Use the `save_pkl` function to save the combined dict to `RESULT_DIR`.

In [None]:
from transplant.utils import read_predictions, load_pkl, save_pkl

In [None]:
def get_test_predictions(job_dir, weight_type, seeds):
    """Collect the test predictions of every seed for one weight type.

    Args:
        job_dir: Directory containing the per-experiment job folders.
        weight_type: Pre-training weight label (e.g. 'random' or '10').
        seeds: Iterable of seeds, one per experiment.

    Returns:
        Dict mapping '{weight_type}_{seed}' to the value `read_predictions`
        returns for
        `{job_dir}/finetune__{weight_type}_seed{seed}/test_predictions.csv`.
    """
    # Step 3: the key records which weight type and seed produced each entry.
    return {
        f'{weight_type}_{seed}': read_predictions(
            f'{job_dir}/finetune__{weight_type}_seed{seed}/test_predictions.csv'
        )
        for seed in seeds
    }

In [None]:
random_predictions = get_test_predictions(JOB_DIR, 'random', SEEDS)

In [None]:
random_predictions.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100'])

In [None]:
random_predictions['random_10']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[4.26490800e-04, 9.31569600e-01, 6.79090100e-02, 9.49260200e-05],
        [1.48110080e-04, 9.26177800e-01, 6.11333600e-02, 1.25408440e-02],
        [6.82196240e-06, 8.88694000e-08, 9.99976500e-01, 1.66221600e-05],
        ...,
        [2.32006870e-03, 9.48752940e-01, 4.76910100e-02, 1.23595920e-03],
        [1.38626190e-02, 8.19023250e-01, 5.98533530e-02, 1.07260786e-01],
        [7.88477400e-05, 9.17878700e-01, 8.19820800e-02, 6.03894900e-05]]),
 'classes': ['A', 'N', 'O', '~']}

In [None]:
pct_10_predictions = get_test_predictions(JOB_DIR, '10', SEEDS)

In [None]:
pct_10_predictions.keys()

dict_keys(['10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100'])

In [None]:
pct_20_predictions = get_test_predictions(JOB_DIR, '20', SEEDS)

In [None]:
pct_20_predictions.keys()

dict_keys(['20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100'])

In [None]:
pct_1_predictions = get_test_predictions(JOB_DIR, '1', SEEDS)

In [None]:
pct_1_predictions.keys()

dict_keys(['1_10', '1_20', '1_30', '1_40', '1_50', '1_60', '1_70', '1_80', '1_90', '1_100'])

In [None]:
pct_100_predictions = get_test_predictions(JOB_DIR, '100', SEEDS)

In [None]:
pct_100_predictions.keys()

dict_keys(['100_10', '100_20', '100_30', '100_40', '100_50', '100_60', '100_70', '100_80', '100_90', '100_100'])

In [None]:
# Step 4
# Merge with the dict union operator `|`, available in Python 3.9+ (PEP 584).
# If a key appeared in more than one operand, the right-most value would win;
# the keys here are prefixed with the weight type, so they do not collide.
predictions_all = (
    random_predictions
    | pct_1_predictions
    | pct_10_predictions
    | pct_20_predictions
    | pct_100_predictions
)

In [None]:
predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '1_10', '1_20', '1_30', '1_40', '1_50', '1_60', '1_70', '1_80', '1_90', '1_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100', '100_10', '100_20', '100_30', '100_40', '100_50', '100_60', '100_70', '100_80', '100_90', '100_100'])

In [None]:
RESULT_DIR

'/content/drive/MyDrive/DLHProject/results1d_all'

In [None]:
# Step 5
# `**` passes every entry of predictions_all to save_pkl as a keyword argument
# named after its key. The reload further down returns a dict with these keys.
save_pkl(RESULT_DIR + '/predictions_all.pkl', **predictions_all)

In [None]:
! ls -lh {RESULT_DIR}

total 2.9M
-rw------- 1 root root 311K May  5 00:39 history_all.csv
-rw------- 1 root root 2.6M May  5 00:42 predictions_all.pkl


In [None]:
reread_predictions_all = load_pkl(RESULT_DIR + '/predictions_all.pkl')

In [None]:
reread_predictions_all.keys()

dict_keys(['random_10', 'random_20', 'random_30', 'random_40', 'random_50', 'random_60', 'random_70', 'random_80', 'random_90', 'random_100', '1_10', '1_20', '1_30', '1_40', '1_50', '1_60', '1_70', '1_80', '1_90', '1_100', '10_10', '10_20', '10_30', '10_40', '10_50', '10_60', '10_70', '10_80', '10_90', '10_100', '20_10', '20_20', '20_30', '20_40', '20_50', '20_60', '20_70', '20_80', '20_90', '20_100', '100_10', '100_20', '100_30', '100_40', '100_50', '100_60', '100_70', '100_80', '100_90', '100_100'])

In [None]:
reread_predictions_all['100_70']

{'y_true': array([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0]]),
 'y_prob': array([[4.1091787e-03, 9.0807570e-01, 8.6139366e-02, 1.6757126e-03],
        [2.1433453e-03, 9.3173295e-01, 6.1164778e-02, 4.9589570e-03],
        [4.8491080e-04, 1.3701040e-09, 9.9950740e-01, 7.5844955e-06],
        ...,
        [4.5451904e-03, 9.3916726e-01, 4.8811170e-02, 7.4765054e-03],
        [9.0035930e-02, 5.3427035e-01, 9.6816406e-02, 2.7887732e-01],
        [3.0593292e-03, 9.2525310e-01, 7.0938960e-02, 7.4862165e-04]]),
 'classes': ['A', 'N', 'O', '~']}