In [1]:
import easydict
import gc
import pandas as pd

from preprocessor import PairwisePreprocessor, _20CodeCellPreprocessor
from dataset import _20SampleDataset, PairwiseDataset, _20sample_data_setup, pairwise_data_setup
from train import pairwise_train_setup, train_setup
from util import pairwise_debug_setup, _20sample_debug_setup
from metrics import kendall_tau
from train import train

args = {
    'model_name_or_path': 'microsoft/graphcodebert-base',

    'input_path': '../input/',

    'train_path': './data/train.csv',
    'train_mark_path': './data/train_mark.csv',
    'train_features_path': './data/train_fts.json',

    'val_path': "./data/val.csv",
    'val_mark_path': './data/val_mark.csv',
    'val_features_path': './data/val_fts.json',

    'output_path': './output-graphcodebert-20sample-debug',

    'md_max_len': 64,
    'total_max_len': 512,
    'batch_size': 32,
    'accumulation_steps': 1,
    'epoch': 0,
    'epochs': 5,
    'n_workers': 8,
    'debug': True,
    'load_train': False,
    'max_lr': 3e-5,
    'min_lr': .3e-6,
    'kfold': True
}

args = easydict.EasyDict(args)

In [2]:
preprocessor = _20CodeCellPreprocessor(**vars(args))
train_df, val_df, train_df_mark, val_df_mark, train_fts, val_fts = preprocessor.run()

print('before debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

kfolds = []
if args.debug:
    for i in range(5):
        fold = _20sample_debug_setup(train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts)
        kfolds.append(fold)
    
train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[0]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[1]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[2]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[3]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[4]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))


train_df, val_df are already exits
train_fts, val_fts are already exists
before debug (5740832, 8) (629814, 8) (1950118, 8) (629814, 8) 125292 13964
after debug (566977, 8) (60759, 8) (192487, 8) (60759, 8) 12529 1396
after debug (570218, 8) (63777, 8) (192522, 8) (63777, 8) 12529 1396
after debug (575337, 8) (62287, 8) (193953, 8) (62287, 8) 12529 1396
after debug (566148, 8) (62488, 8) (193033, 8) (62488, 8) 12529 1396
after debug (563686, 8) (63828, 8) (189789, 8) (63828, 8) 12529 1396


In [3]:
df_orders = pd.read_csv(args.input_path + 'train_orders.csv',
                        index_col='id',
                        squeeze=True).str.split()

for i in range(5):
    train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[i]

    train_loader, val_loader = _20sample_data_setup(train_df_mark, val_df_mark, train_fts, val_fts, args)

    del train_df, train_df_mark, train_fts
    gc.collect()

    args.num_train_steps = args.epochs * len(train_loader) / args.accumulation_steps

    model, optimizer, scheduler, scaler = train_setup(args)
    model.cuda()

    train(model, train_loader, val_loader, optimizer, scheduler, scaler, val_df, df_orders, args)

    del model, optimizer, scheduler, scaler, val_fts, train_loader, val_loader
    gc.collect()




  df_orders = pd.read_csv(args.input_path + 'train_orders.csv',
Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weig

Epoch 1 Loss: 0.4063 lr: [1.9950124688279302e-08, 1.9950124688279302e-08]:   0%|          | 1/6015 [00:19<31:48:10, 19.04s/it]



Epoch 1 Loss: 0.1822 lr: [2.526315789473684e-05, 2.526315789473684e-05]: 100%|██████████| 6015/6015 [37:32<00:00,  2.67it/s]  
100%|██████████| 653/653 [01:32<00:00,  7.09it/s]
Preds score 0.779829136959287
Epoch 2 Loss: 0.1305 lr: [1.894736842105263e-05, 1.894736842105263e-05]: 100%|██████████| 6015/6015 [37:15<00:00,  2.69it/s]  
100%|██████████| 653/653 [01:28<00:00,  7.34it/s]
Preds score 0.8066387884338179
Epoch 3 Loss: 0.1131 lr: [1.263157894736842e-05, 1.263157894736842e-05]: 100%|██████████| 6015/6015 [37:02<00:00,  2.71it/s]  
100%|██████████| 653/653 [01:27<00:00,  7.43it/s]
Preds score 0.8065701257641509
Epoch 4 Loss: 0.1008 lr: [6.31578947368421e-06, 6.31578947368421e-06]: 100%|██████████| 6015/6015 [37:00<00:00,  2.71it/s]    
100%|██████████| 653/653 [01:28<00:00,  7.42it/s]
Preds score 0.8115441565615875
Epoch 5 Loss: 0.092 lr: [0.0, 0.0]: 100%|██████████| 6015/6015 [36:59<00:00,  2.71it/s]                                       
100%|██████████| 653/653 [01:27<00:00,  7.

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.4688 lr: [1.9946808510638297e-08, 1.9946808510638297e-08]:   0%|          | 1/6016 [00:15<25:09:55, 15.06s/it]



Epoch 1 Loss: 0.2457 lr: [2.526315789473684e-05, 2.526315789473684e-05]: 100%|██████████| 6016/6016 [37:09<00:00,  2.70it/s]  
100%|██████████| 702/702 [01:34<00:00,  7.43it/s]
Preds score 0.612228226146861
Epoch 2 Loss: 0.2574 lr: [1.894736842105263e-05, 1.894736842105263e-05]: 100%|██████████| 6016/6016 [38:29<00:00,  2.61it/s]  
100%|██████████| 702/702 [01:34<00:00,  7.46it/s]
Preds score 0.6128817611899084
Epoch 3 Loss: 0.2568 lr: [1.263157894736842e-05, 1.263157894736842e-05]: 100%|██████████| 6016/6016 [37:19<00:00,  2.69it/s]  
100%|██████████| 702/702 [01:34<00:00,  7.39it/s]
Preds score 0.6153550669339356
Epoch 4 Loss: 0.2564 lr: [6.31578947368421e-06, 6.31578947368421e-06]: 100%|██████████| 6016/6016 [37:20<00:00,  2.69it/s]    
100%|██████████| 702/702 [01:34<00:00,  7.43it/s]
Preds score 0.6155473296645173
Epoch 5 Loss: 0.2561 lr: [0.0, 0.0]: 100%|██████████| 6016/6016 [37:18<00:00,  2.69it/s]                                      
100%|██████████| 702/702 [01:34<00:00,  7.

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.3389 lr: [1.979871308364956e-08, 1.979871308364956e-08]:   0%|          | 1/6061 [00:15<25:32:09, 15.17s/it]



Epoch 1 Loss: 0.1788 lr: [2.526315789473684e-05, 2.526315789473684e-05]: 100%|██████████| 6061/6061 [37:31<00:00,  2.69it/s]  
100%|██████████| 678/678 [01:30<00:00,  7.46it/s]
Preds score 0.7987488574016076
Epoch 2 Loss: 0.1294 lr: [1.894736842105263e-05, 1.894736842105263e-05]: 100%|██████████| 6061/6061 [37:40<00:00,  2.68it/s]  
100%|██████████| 678/678 [01:31<00:00,  7.40it/s]
Preds score 0.8097194923588006
Epoch 3 Loss: 0.1124 lr: [1.263157894736842e-05, 1.263157894736842e-05]: 100%|██████████| 6061/6061 [37:32<00:00,  2.69it/s]  
100%|██████████| 678/678 [01:31<00:00,  7.42it/s]
Preds score 0.8160483147439671
Epoch 4 Loss: 0.1004 lr: [6.31578947368421e-06, 6.31578947368421e-06]: 100%|██████████| 6061/6061 [37:32<00:00,  2.69it/s]    
100%|██████████| 678/678 [01:31<00:00,  7.44it/s]
Preds score 0.8149994273801862
Epoch 5 Loss: 0.0917 lr: [0.0, 0.0]: 100%|██████████| 6061/6061 [37:19<00:00,  2.71it/s]                                      
100%|██████████| 678/678 [01:31<00:00,  7

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.7305 lr: [1.9893899204244032e-08, 1.9893899204244032e-08]:   0%|          | 1/6032 [00:14<24:58:56, 14.91s/it]



Epoch 1 Loss: 0.1874 lr: [2.526315789473684e-05, 2.526315789473684e-05]: 100%|██████████| 6032/6032 [37:08<00:00,  2.71it/s]  
100%|██████████| 674/674 [01:30<00:00,  7.46it/s]
Preds score 0.7787443863345459
Epoch 2 Loss: 0.132 lr: [1.894736842105263e-05, 1.894736842105263e-05]: 100%|██████████| 6032/6032 [37:20<00:00,  2.69it/s]   
100%|██████████| 674/674 [01:31<00:00,  7.37it/s]
Preds score 0.7990120828513242
Epoch 3 Loss: 0.1147 lr: [1.263157894736842e-05, 1.263157894736842e-05]: 100%|██████████| 6032/6032 [37:29<00:00,  2.68it/s]  
100%|██████████| 674/674 [01:30<00:00,  7.42it/s]
Preds score 0.8031986346861846
Epoch 4 Loss: 0.1022 lr: [6.31578947368421e-06, 6.31578947368421e-06]: 100%|██████████| 6032/6032 [37:10<00:00,  2.70it/s]    
100%|██████████| 674/674 [01:34<00:00,  7.14it/s]
Preds score 0.8048780058676824
Epoch 5 Loss: 0.0935 lr: [0.0, 0.0]: 100%|██████████| 6032/6032 [37:33<00:00,  2.68it/s]                                      
100%|██████████| 674/674 [01:31<00:00,  7

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.3822 lr: [2.0236087689713324e-08, 2.0236087689713324e-08]:   0%|          | 1/5930 [00:15<24:57:04, 15.15s/it]



Epoch 1 Loss: 0.1775 lr: [2.526315789473684e-05, 2.526315789473684e-05]: 100%|██████████| 5930/5930 [36:46<00:00,  2.69it/s]  
100%|██████████| 672/672 [01:30<00:00,  7.40it/s]
Preds score 0.7898881606158608
Epoch 2 Loss: 0.1289 lr: [1.894736842105263e-05, 1.894736842105263e-05]: 100%|██████████| 5930/5930 [36:44<00:00,  2.69it/s]  
100%|██████████| 672/672 [01:29<00:00,  7.47it/s]
Preds score 0.8076668483923037
Epoch 3 Loss: 0.1116 lr: [1.263157894736842e-05, 1.263157894736842e-05]: 100%|██████████| 5930/5930 [36:30<00:00,  2.71it/s]  
100%|██████████| 672/672 [01:30<00:00,  7.47it/s]
Preds score 0.8196607404374079
Epoch 4 Loss: 0.0992 lr: [6.31578947368421e-06, 6.31578947368421e-06]: 100%|██████████| 5930/5930 [36:48<00:00,  2.69it/s]    
100%|██████████| 672/672 [01:30<00:00,  7.46it/s]
Preds score 0.8160648752301378
Epoch 5 Loss: 0.0899 lr: [0.0, 0.0]: 100%|██████████| 5930/5930 [36:34<00:00,  2.70it/s]                                      
100%|██████████| 672/672 [01:29<00:00,  7