In [None]:
# Show which GPU (if any) is attached to this runtime before training.
!nvidia-smi

Tue May 31 09:25:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    40W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive so the dataset and checkpoint paths under
# /content/drive/MyDrive used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q transformers
!pip install -q datasets
!pip install -q sentence-transformers

In [None]:
import os, sys
import random
from pathlib import Path
from tqdm.notebook import tqdm
from typing import Dict


import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from datasets import load_metric

from transformers import RobertaForMaskedLM, RobertaTokenizerFast, RobertaTokenizer
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput

from sentence_transformers import InputExample
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses
from sentence_transformers.util import cos_sim
from sentence_transformers.evaluation import LabelAccuracyEvaluator

# Prepare Dataset

## Load the DataFrame datasets

In [None]:
# Load the cleaned training cells from Drive and display them.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that were produced by a trusted pipeline.
train_df = pd.read_pickle('/content/drive/MyDrive/NLP/AI4Code/dataset/exp_5/clean_train_dataset.pkl')
train_df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,00001756c60be8,1862f0a6,code,# this python 3 environment comes with many he...,0,945aea18,,0.0
1,00001756c60be8,2a9e43d6,code,import numpy as np import pandas as pd import ...,2,945aea18,,0.034483
2,00001756c60be8,038b763d,code,import warnings warnings.filterwarnings('ignore'),4,945aea18,,0.068966
3,00001756c60be8,2eefe0ef,code,matplotlib.rcparams.update({'font.size': 14}),6,945aea18,,0.103448
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pr...",8,945aea18,,0.137931
...,...,...,...,...,...,...,...,...
6055708,fffe1d764579d5,0d770d6b,markdown,removing the outliers,43,3c40bfa6,,0.597222
6055709,fffe1d764579d5,d45ddc62,markdown,dimensionality curse,33,3c40bfa6,,0.458333
6055710,fffe1d764579d5,1a63248d,markdown,bangalore house price prediction,0,3c40bfa6,,0.0
6055711,fffe1d764579d5,a8ffc8b4,markdown,we have achieved accuracy in predicting the pr...,69,3c40bfa6,,0.958333


In [None]:
# Load the cleaned validation cells from Drive and display them.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that were produced by a trusted pipeline.
val_df = pd.read_pickle('/content/drive/MyDrive/NLP/AI4Code/dataset/exp_5/clean_val_dataset.pkl')
val_df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,0002115f48f982,18281c6c,code,import numpy as np # linear algebra import pan...,1,272b483a,,0.111111
1,0002115f48f982,e3b6b115,code,df = pd.read_csv('../input/metadata_train.csv'...,2,272b483a,,0.222222
2,0002115f48f982,4a044c54,code,df.head(),3,272b483a,,0.333333
3,0002115f48f982,365fe576,code,#let's check if targets are consistent within ...,4,272b483a,,0.444444
4,0002115f48f982,a3188e54,code,"sns.countplot(x='target',data=targets) # it sh...",5,272b483a,,0.555556
...,...,...,...,...,...,...,...,...
314928,fff06cc23780af,f8135651,markdown,lowering the case,14,7b4c5375,,0.17284
314929,fff06cc23780af,b61ba8ec,markdown,testing the model on test set,60,7b4c5375,,0.740741
314930,fff06cc23780af,e98b7e0d,markdown,the aim of this notebook is to predict if twee...,2,7b4c5375,,0.024691
314931,fff06cc23780af,f31fa490,markdown,having created this notebook from scratch star...,80,7b4c5375,,0.987654


## Create cell_id → source dictionaries

In [None]:
# Lookup tables from cell_id to the cell's source text, one per split.
# When a cell_id occurs more than once, the last occurrence wins.
# NOTE(review): the key is cell_id alone; if cell ids are only unique within
# a notebook, colliding ids from different notebooks overwrite each other
# here — confirm uniqueness across the whole split.
train_dict_cellid_source = {
    cell_id: source
    for cell_id, source in zip(train_df['cell_id'].values, train_df['source'].values)
}
val_dict_cellid_source = {
    cell_id: source
    for cell_id, source in zip(val_df['cell_id'].values, val_df['source'].values)
}

In [None]:
def generate_triplet(df):
    """Build labelled (markdown cell, code cell) pairs for every notebook in ``df``.

    For each markdown cell with rank ``r`` the function looks for the code
    cell of the same notebook whose rank is ``r + 1``:

    * if there is one, ``[markdown_cell_id, code_cell_id, 1]`` is emitted;
    * in that case a hard negative is emitted as well (when available): the
      lowest-ranked *other* code cell whose rank is greater than ``r``,
      as ``[markdown_cell_id, code_cell_id, 0]``.

    Markdown cells without a code cell at rank ``r + 1`` produce nothing.

    Args:
        df: DataFrame with the columns ``id`` (notebook id), ``cell_id``,
            ``cell_type`` (``'markdown'`` / ``'code'``) and ``rank``.

    Returns:
        A list of ``[markdown_cell_id, code_cell_id, label]`` lists, grouped
        by notebook in ``groupby('id')`` order, positive before negative.
    """
    triplets = []

    for _, notebook_df in tqdm(df.groupby('id')):
        markdown_df = notebook_df[notebook_df['cell_type'] == 'markdown']
        code_df = notebook_df[notebook_df['cell_type'] == 'code']

        code_ranks = code_df['rank'].values
        code_cellids = code_df['cell_id'].values

        for cell_id, rank in zip(markdown_df['cell_id'].values, markdown_df['rank'].values):
            # Positive: the code cell sitting directly after this markdown cell.
            pos_positions = np.flatnonzero(code_ranks == rank + 1)
            if len(pos_positions) == 0:
                continue
            pos_position = pos_positions[0]
            triplets.append([cell_id, code_cellids[pos_position], 1])

            # Hard negative: the closest following code cell other than the
            # positive. Selecting by position (not by rank value) guarantees
            # the positive cell itself is never returned as its own negative.
            is_candidate = code_ranks > rank
            is_candidate[pos_position] = False
            candidate_positions = np.flatnonzero(is_candidate)
            if len(candidate_positions):
                nearest = np.argmin(code_ranks[candidate_positions])
                hard_neg_position = candidate_positions[nearest]
                triplets.append([cell_id, code_cellids[hard_neg_position], 0])

    return triplets

## Take a fraction of the dataset for a test run

In [None]:
# Keep only the rows up to index label 600000 for a quick test run
# (`.loc` slices by label and includes the end label).
# NOTE(review): a row-based cut can leave the last notebook incomplete —
# confirm that is acceptable for this experiment.
train_df = train_df.loc[:600000]
train_df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,00001756c60be8,1862f0a6,code,# this python 3 environment comes with many he...,0,945aea18,,0.0
1,00001756c60be8,2a9e43d6,code,import numpy as np import pandas as pd import ...,2,945aea18,,0.034483
2,00001756c60be8,038b763d,code,import warnings warnings.filterwarnings('ignore'),4,945aea18,,0.068966
3,00001756c60be8,2eefe0ef,code,matplotlib.rcparams.update({'font.size': 14}),6,945aea18,,0.103448
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pr...",8,945aea18,,0.137931
...,...,...,...,...,...,...,...,...
599996,1931dbf2a81b02,ce9f7f15,markdown,summary of punt player position,100,9d3504c4,,0.925926
599997,1931dbf2a81b02,bac7a460,markdown,dataset play information dataset,24,9d3504c4,,0.222222
599998,1931dbf2a81b02,718a4996,markdown,dataset play information field name season type,43,9d3504c4,,0.398148
599999,1931dbf2a81b02,e331a94f,markdown,total game injuries start time,89,9d3504c4,,0.824074


In [None]:
# Keep only the rows up to index label 30000 for a quick test run
# (`.loc` slices by label and includes the end label).
# NOTE(review): a row-based cut can leave the last notebook incomplete —
# confirm that is acceptable for this experiment.
val_df = val_df.loc[:30000]
val_df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,0002115f48f982,18281c6c,code,import numpy as np # linear algebra import pan...,1,272b483a,,0.111111
1,0002115f48f982,e3b6b115,code,df = pd.read_csv('../input/metadata_train.csv'...,2,272b483a,,0.222222
2,0002115f48f982,4a044c54,code,df.head(),3,272b483a,,0.333333
3,0002115f48f982,365fe576,code,#let's check if targets are consistent within ...,4,272b483a,,0.444444
4,0002115f48f982,a3188e54,code,"sns.countplot(x='target',data=targets) # it sh...",5,272b483a,,0.555556
...,...,...,...,...,...,...,...,...
29996,171795faf8c7fe,70625677,code,"masked_img = cv2.bitwise_and(img6,img6,mask=ma...",71,bc4af4a7,,0.972603
29997,171795faf8c7fe,3b17d23a,code,"masked_img_his = cv2.calchist([img6],channels=...",72,bc4af4a7,,0.986301
29998,171795faf8c7fe,32093f52,markdown,add gaussian noise,32,bc4af4a7,,0.438356
29999,171795faf8c7fe,71fcbe0c,markdown,shape and text,6,bc4af4a7,,0.082192


In [None]:
# Build the [markdown cell_id, code cell_id, label] pairs for both splits.
train_triplets = generate_triplet(train_df)
val_triplets = generate_triplet(val_df)

  0%|          | 0/13092 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

In [None]:
# Peek at the first 100 pairs: [markdown cell_id, code cell_id, label].
train_triplets[:100]

[['21616367', '86497fe1', 1],
 ['21616367', 'e2c8e725', 0],
 ['fcb6792d', '5bf9ca51', 1],
 ['fcb6792d', 'f5504853', 0],
 ['63c26fa2', '62638fba', 1],
 ['63c26fa2', 'bb69e88c', 0],
 ['4bb2e30a', 'bd94f005', 1],
 ['4bb2e30a', '62638fba', 0],
 ['a6357f7e', 'ff7c44ed', 1],
 ['a6357f7e', '0e7c906e', 0],
 ['45082c89', '781bbf3c', 1],
 ['45082c89', 'bd94f005', 0],
 ['77e56113', '2eefe0ef', 1],
 ['77e56113', '0beab1cd', 0],
 ['448eb224', '2a9e43d6', 1],
 ['448eb224', '038b763d', 0],
 ['032e2820', 'a98c5d9f', 1],
 ['032e2820', '06365725', 0],
 ['8554b284', '59959af5', 1],
 ['8554b284', '80151ab7', 0],
 ['ac301a84', '0e7c906e', 1],
 ['ac301a84', 'dd0c804a', 0],
 ['23705731', 'ebe125d5', 1],
 ['23705731', 'd9dced8b', 0],
 ['1496beaf', '8ca8392c', 1],
 ['1496beaf', '17ec3fc4', 0],
 ['2e1a5949', '80151ab7', 1],
 ['2e1a5949', '5bf9ca51', 0],
 ['7e2f170a', '038b763d', 1],
 ['7e2f170a', '2eefe0ef', 0],
 ['bfbde93e', '8522781a', 1],
 ['bfbde93e', '8ca8392c', 0],
 ['915643b3', 'f5504853', 1],
 ['915643b

## Build sentence-transformers `InputExample` samples

In [None]:
# Turn every [markdown cell_id, code cell_id, label] pair into an
# InputExample holding the two cell texts and the pair label.
train_samples = [
    InputExample(
        texts=[
            train_dict_cellid_source[markdown_id],
            train_dict_cellid_source[code_id],
        ],
        label=pair_label,
    )
    for markdown_id, code_id, pair_label in train_triplets
]

In [None]:
# Turn every [markdown cell_id, code cell_id, label] pair into an
# InputExample holding the two cell texts and the pair label.
val_samples = [
    InputExample(
        texts=[
            val_dict_cellid_source[markdown_id],
            val_dict_cellid_source[code_id],
        ],
        label=pair_label,
    )
    for markdown_id, code_id, pair_label in val_triplets
]

## Training and validation dataloaders

In [None]:
BATCH_SIZE = 16

# Shuffle the training samples: train_samples is built notebook by notebook
# with alternating positive / hard-negative pairs, so unshuffled batches
# would consist of highly correlated pairs from the same notebook.
# drop_last=True discards the final incomplete training batch.
train_dataloader = DataLoader(train_samples, batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=0, drop_last=True)

# Validation keeps its original order and every sample.
val_dataloader = DataLoader(val_samples, batch_size=BATCH_SIZE, shuffle=False,
                            num_workers=0, drop_last=False)

# Sentence Transformer Model

In [None]:
# Encoder: the transformer checkpoint saved by the MLM pre-training run,
# configured with max_seq_length=256.
bert = models.Transformer("/content/drive/MyDrive/NLP/AI4Code/pretrained/exp_5/pretrained_mlm/checkpoint-160000", max_seq_length=256)
# Pooling layer sized to the encoder's word-embedding dimension, with mean
# pooling over tokens enabled.
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)

# Chain encoder and pooling into a single SentenceTransformer and display it.
model = SentenceTransformer(modules=[bert, pooler])
# Alternative (disabled): load a previously saved SentenceTransformer instead.
# model = SentenceTransformer('/content/test/500')
model

Some weights of the model checkpoint at /content/drive/MyDrive/NLP/AI4Code/pretrained/exp_5/pretrained_mlm/checkpoint-160000 were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/NLP/AI4Code/pretrained/exp_5/pretrained_mlm/checkpoint-160000 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

## Freeze some layers for faster training (optional, currently disabled)

In [None]:
# for name, module in model.named_modules():
#     print(name)

In [None]:
# model[0].auto_model.encoder

In [None]:
# for param in model[0].auto_model.encoder.parameters():
#     param.requires_grad = False

# Loss function

In [None]:
# Classification loss over sentence pairs with two labels, matching the
# labels produced by generate_triplet: 1 = the code cell directly follows the
# markdown cell, 0 = hard negative.
loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2,
)

# Train Model

In [None]:
# Label-accuracy evaluator over the validation pairs; the SoftmaxLoss module
# is passed in as the softmax model used for evaluation.
evaluator = LabelAccuracyEvaluator(val_dataloader, softmax_model=loss)

In [None]:
epochs = 1

# Warm up during the first 10% of all training steps.
total_steps = len(train_dataloader) * epochs
warmup_steps = int(total_steps * 0.1)

# output_path and checkpoint_path both point at this Drive directory.
output_dir = '/content/drive/MyDrive/NLP/AI4Code/pretrained/exp_5/pretrained_task/demo'

model.fit(
    # what to train on and for how long
    train_objectives=[(train_dataloader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    # periodic evaluation on the validation pairs
    evaluator=evaluator,
    evaluation_steps=5000,
    # where and how often to save
    output_path=output_dir,
    checkpoint_path=output_dir,
    checkpoint_save_steps=10000,
    checkpoint_save_total_limit=4,
    # runtime options
    use_amp=True,
    show_progress_bar=True,
)