In [1]:
!nvidia-smi

Fri Jun  3 08:08:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    51W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# !unzip -qq "/content/drive/MyDrive/NLP/AI4Code/dataset/AI4Code.zip" -d "/content/dataset"

In [4]:
!pip install -q transformers
!pip install -q datasets
!pip install -q sentence-transformers

In [5]:
import os, sys
import random
from pathlib import Path
from tqdm.notebook import tqdm
from typing import Dict


import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from datasets import load_metric

from transformers import RobertaForMaskedLM, RobertaTokenizerFast, RobertaTokenizer
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput

from sentence_transformers import InputExample
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses, util
from sentence_transformers.util import cos_sim
from sentence_transformers.evaluation import LabelAccuracyEvaluator

# Dataloader for validation

## Load val dataset

In [6]:
val_df = pd.read_pickle('/content/drive/MyDrive/NLP/AI4Code/dataset/exp_5/clean_val_dataset.pkl')
val_df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,0002115f48f982,18281c6c,code,import numpy as np # linear algebra import pan...,1,272b483a,,0.111111
1,0002115f48f982,e3b6b115,code,df = pd.read_csv('../input/metadata_train.csv'...,2,272b483a,,0.222222
2,0002115f48f982,4a044c54,code,df.head(),3,272b483a,,0.333333
3,0002115f48f982,365fe576,code,#let's check if targets are consistent within ...,4,272b483a,,0.444444
4,0002115f48f982,a3188e54,code,"sns.countplot(x='target',data=targets) # it sh...",5,272b483a,,0.555556
...,...,...,...,...,...,...,...,...
314928,fff06cc23780af,f8135651,markdown,lowering the case,14,7b4c5375,,0.17284
314929,fff06cc23780af,b61ba8ec,markdown,testing the model on test set,60,7b4c5375,,0.740741
314930,fff06cc23780af,e98b7e0d,markdown,the aim of this notebook is to predict if twee...,2,7b4c5375,,0.024691
314931,fff06cc23780af,f31fa490,markdown,having created this notebook from scratch star...,80,7b4c5375,,0.987654


In [7]:
# val_df = val_df.loc[:20070]
# val_df.tail()
# val_df = val_df.loc[:1001]
# val_df.tail(20)

In [8]:
paths_val = val_df['id'].unique().tolist()

## Load raw dataset

In [9]:
def read_notebook(path):
    """Load one notebook JSON file as a DataFrame of its cells.

    Args:
        path: ``pathlib.Path`` of the notebook JSON file; ``path.stem`` is
            stored in the ``id`` column of every row.

    Returns:
        DataFrame whose index is named ``cell_id``, with the columns read from
        the file (``cell_type`` requested as category, ``source`` as str) plus
        the added ``id`` column.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_dtypes)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

In [10]:
# Reading val dataframe
data_dir = Path("/content/dataset")


# paths_val holds notebook ids (strings); the matching JSON files are read
# from the train/ folder.
paths_train = [data_dir / 'train' / (nb_id + '.json') for nb_id in paths_val]
notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc='Val NBs')]
# Stack all notebooks, key the rows by (id, cell_id), order by notebook id
# without additionally sorting by cell_id, then flatten back to columns.
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
    .reset_index()
)

# # Reading test dataframe
# data_dir = Path("../input/AI4Code")


# paths_train = list((data_dir / 'test').glob('*.json'))
# notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc='Test NBs')]
# df = pd.concat(notebooks_train).set_index('id', append=True).swaplevel().sort_index(level='id', sort_remaining=False)

Val NBs:   0%|          | 0/6895 [00:00<?, ?it/s]

## Preprocess

In [11]:
import os
import re
import nltk
import string
from pathlib import Path
from tqdm.notebook import tqdm 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity


def preprocess_markdown(text):
    """Normalize the source of a markdown cell into one lowercase line.

    The steps run in this order: lowercase; drop URLs; drop a leading ``b``
    followed by whitespace; collapse isolated single ASCII letters; drop
    tokens containing digits; replace HTML-like tags with a space; strip
    ASCII punctuation; turn line breaks into ``". "``; collapse whitespace;
    trim non-word characters from both ends.

    Args:
        text: Raw markdown source as a ``str``.

    Returns:
        The cleaned text as a ``str`` (may be empty).
    """
    # All patterns are raw strings: sequences such as "\S" or "\w" inside a
    # normal string literal are invalid escapes and trigger warnings on
    # recent Python versions.

    # Convert to lowercase
    text = text.lower()

    # Remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove a prefixed 'b' (only when followed by whitespace)
    text = re.sub(r'^b\s+', '', text)

    # Remove single letters surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove words containing numbers
    text = re.sub(r'\w*\d+\w*', '', text)

    # Replace HTML tags with a space
    text = re.sub(r'<.*?>+', ' ', text)

    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Turn line breaks into sentence separators; this runs after the
    # punctuation step so the inserted periods are kept.
    text = re.sub(r'\r', '', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\n', '. ', text)

    # Substitute multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove non-word characters at the beginning and the end
    text = re.sub(r'(^\W+|\W+$)', '', text)

    return text


def preprocess_code(text):
    """Normalize the source of a code cell into one lowercase line.

    The steps run in this order: lowercase; drop URLs; turn tab runs into a
    space; drop letter-digit-letter tokens (e.g. ``abc123def``); collapse
    runs of ``#`` into one; replace line breaks with spaces; collapse
    whitespace. Punctuation and leading/trailing characters are kept.

    Args:
        text: Raw code source as a ``str``.

    Returns:
        The cleaned text as a ``str``.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove links (raw string: "\S" is an invalid escape in a normal literal)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Substitute multiple tabs with a single space
    text = re.sub(r'\t+', ' ', text)

    # Remove tokens made of letters, then digits, then letters
    text = re.sub(r'[a-zA-Z]+\d+[a-zA-Z]+', '', text)

    # Collapse multiple '#'
    text = re.sub(r'#+', '#', text)

    # Replace line breaks with spaces. The previous version also called
    # str.replace('\n+', '\n'), which is a literal (non-regex) replace and so
    # deleted a '+' that starts a line (e.g. "a\n+ b" lost its operator).
    # Repeated newlines are collapsed by the whitespace step below anyway.
    text = text.replace('\r', '').replace('\n', ' ')

    # Substitute multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)

    return text

def preprocess_df(df):
    """
    Clean the ``source`` column of a notebook-cell dataframe.

    Rows whose ``cell_type`` is 'markdown' go through preprocess_markdown and
    rows whose ``cell_type`` is 'code' go through preprocess_code. The frame
    is modified in place and also returned.
    """
    cleaners = (
        ('markdown', preprocess_markdown),
        ('code', preprocess_code),
    )
    for cell_type, cleaner in cleaners:
        rows = df['cell_type'] == cell_type
        df.loc[rows, 'source'] = df.loc[rows, 'source'].apply(cleaner)
    return df

In [12]:
df = preprocess_df(df)

## Prepare DataLoader

In [13]:
# 0-based position of each cell among the cells of the same type in its
# notebook, following the current row order of df.
group_keys = ["id", "cell_type"]
df["rank"] = df.groupby(group_keys).cumcount()
# Baseline prediction: rank of that position within the same group.
df["pred"] = df.groupby(group_keys)["rank"].rank(pct=False)

In [14]:
df

Unnamed: 0,id,cell_id,cell_type,source,rank,pred
0,0002115f48f982,18281c6c,code,import numpy as np # linear algebra import pan...,0,1.0
1,0002115f48f982,e3b6b115,code,df = pd.read_csv('../input/metadata_train.csv'...,1,2.0
2,0002115f48f982,4a044c54,code,df.head(),2,3.0
3,0002115f48f982,365fe576,code,#let's check if targets are consistent within ...,3,4.0
4,0002115f48f982,a3188e54,code,"sns.countplot(x='target',data=targets) # it sh...",4,5.0
...,...,...,...,...,...,...
314928,fff06cc23780af,f8135651,markdown,lowering the case,15,16.0
314929,fff06cc23780af,b61ba8ec,markdown,testing the model on test set,16,17.0
314930,fff06cc23780af,e98b7e0d,markdown,the aim of this notebook is to predict if twee...,17,18.0
314931,fff06cc23780af,f31fa490,markdown,having created this notebook from scratch star...,18,19.0


# Load model

In [15]:
embedder = SentenceTransformer('/content/drive/MyDrive/NLP/AI4Code/pretrained/exp_5/pretrained_task/trial_6/150000')
embedder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

# Define Kendall tau metric

In [16]:
from bisect import bisect

def count_inversions(a):
    """Count pairs (i, j) with i < j and a[i] > a[j].

    Keeps the values seen so far in a sorted list; each new value adds the
    number of already-seen values that are strictly greater than it.
    """
    seen = []
    total = 0
    for value in a:
        insert_at = bisect(seen, value)
        # Everything to the right of the insertion point is greater than value.
        total += len(seen) - insert_at
        seen.insert(insert_at, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Compute a Kendall-tau style score pooled over several orderings.

    Args:
        ground_truth: Iterable of sequences, each the true order of items.
        predictions: Iterable of sequences, paired with ``ground_truth`` by
            position; every item of a prediction must occur in its
            ground-truth sequence.

    Returns:
        ``1 - 4 * total_inversions / sum(n * (n - 1))`` where ``n`` is the
        length of each ground-truth sequence.

    Raises:
        KeyError: if a predicted item is missing from its ground truth
            (the earlier ``list.index`` version raised ValueError).
        ZeroDivisionError: if every ground-truth sequence has fewer than two
            items, since the denominator is then zero.
    """
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        # Build the item -> position lookup once per instance instead of
        # scanning gt with list.index for every predicted item.
        # setdefault keeps the first occurrence, matching list.index.
        position = {}
        for i, item in enumerate(gt):
            position.setdefault(item, i)
        ranks = [position[x] for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

# Load ground-truth cell orders

In [17]:
data_dir = Path("/content/dataset")
# read_csv(..., squeeze=True) was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column explicitly gives the same Series.
df_orders = (
    pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
    .squeeze("columns")
    .str.split()  # space-separated cell ids -> list of cell ids
)

df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b7...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c4172...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279...
0002115f48f982    [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe...
                                        ...                        
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba95...
fffc3b44869198    [978a5137, faa48f03, 28dfb12a, eea2e812, 64fef...
fffc63ff750064    [5015c300, 411b85d9, 8238198c, f4781d1d, b5532...
fffcd063cda949    [7e6266ad, d8281fc5, d4ffcaef, 3e0e4a47, 21387...
fffe1d764579d5    [1a63248d, 9c3b96a5, 1398a873, 4e2d4c2d, f71c5...
Name: cell_order, Length: 139256, dtype: object

# Analysis

In [18]:
# def generate_gt(val_df):
#     gts = []
#     count = 0

#     for id, df_tmp in tqdm(df.groupby('id')):
#         df_tmp_markdown = df_tmp[df_tmp['cell_type'] == 'markdown']
#         df_tmp_code = df_tmp[df_tmp['cell_type'] == 'code']

#         df_tmp_code_rank = df_tmp_code['rank'].values
#         df_tmp_code_cellid = df_tmp_code['cell_id'].values

#         for cell_id, rank in df_tmp_markdown[['cell_id', 'rank']].values:
#             pos_positions = sorted(i for i in df_tmp_code_rank if df_tmp_code_rank[i] > rank)

#             if len(pos_positions):
#                 # print(pos_positions[0])
#                 pos_position = pos_positions[0]
#                 pos_cellid = df_tmp_code_cellid[pos_position]
#                 gts.append((cell_id, pos_cellid))
#             # break

#     df_gt = pd.DataFrame(gts, columns=['cell_id', 'pos_cellid'])
#     return df_gt


In [19]:
# df_gt = generate_gt(val_df)
# df_gt

In [20]:
# def postprocess(embedder, df, df_gt, mode=1):
#     # mark_cellid_rank_dict = {}
#     preds = []
#     for id, df_tmp in tqdm(df.groupby('id')):
#         label_list = df_orders[id]

#         df_tmp_mark = df_tmp[df_tmp['cell_type'] == 'markdown']
#         cellid_mark_values = df_tmp_mark['cell_id'].values
#         queries = df_tmp_mark['source'].values

#         df_tmp_code = df_tmp[df_tmp['cell_type'] == 'code']
#         cellid_code_values = df_tmp_code['cell_id'].values
#         corpus = df_tmp_code['source'].values
#         rank_code_values = df_tmp_code['rank'].rank().values

#         corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

#         for cellid_mark, query in zip(cellid_mark_values, queries):
#             query_embedding = embedder.encode(query, convert_to_tensor=True)

#             cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
#             best_match = torch.topk(cos_scores, k=min(10, len(corpus)))
            
#             scores, ids = best_match[0].cpu().tolist(), best_match[1].cpu().tolist()
#             cellid_code = cellid_code_values[ids]
#             if cellid_mark in df_gt['cell_id'].values:
#                 if df_gt.loc[df_gt['cell_id'] == cellid_mark, 'pos_cellid'].values[0] in cellid_code:
#                     df_gt.loc[df_gt['cell_id'] == cellid_mark, 'predicted'] = True
#                 else:
#                     df_gt.loc[df_gt['cell_id'] == cellid_mark, 'predicted'] = False
#                 df_gt.loc[df_gt['cell_id'] == cellid_mark, 'score'] = sum(scores) / len(scores)

#             # score, idx = best_match[0].cpu().item(), best_match[1].cpu().item()
#             # cellid_code = cellid_code_values[idx]
#             # if cellid_mark in df_gt['cell_id'].values:
#             #     if df_gt.loc[df_gt['cell_id'] == cellid_mark, 'pos_cellid'].values[0] == cellid_code:
#             #         df_gt.loc[df_gt['cell_id'] == cellid_mark, 'predicted'] = True
#             #     else:
#             #         df_gt.loc[df_gt['cell_id'] == cellid_mark, 'predicted'] = False
#             #     df_gt.loc[df_gt['cell_id'] == cellid_mark, 'score'] = score
# postprocess(embedder, df, df_gt)

In [21]:
# import seaborn as sns

# sns.displot(df_gt, x="score", hue="predicted")

# Validation

In [22]:
def validate(embedder, df, mode=1):
    """Score soft-weighted markdown placement with Kendall tau.

    For each notebook (rows sharing an ``id``) every markdown cell is compared
    with every code cell by cosine similarity of their embeddings. The
    markdown cell's predicted position is the softmax-weighted average of the
    code cells' ranks. Code cells keep their within-notebook rank as
    prediction. The resulting order is scored against the module-level
    ``df_orders`` with ``kendall_tau``.

    Args:
        embedder: Object exposing ``encode(sentences, convert_to_tensor=True)``
            (a SentenceTransformer in this notebook).
        df: DataFrame with columns ``id``, ``cell_id``, ``cell_type``,
            ``source`` and ``rank``. Its ``pred`` column is (re)written in
            place. Predictions are written back by index label, so the index
            is assumed to be unique (it is a fresh RangeIndex in this notebook).
        mode: Unused; kept so existing calls keep working.

    Returns:
        The Kendall tau score, which is also printed.

    NOTE(review): assumes every notebook has at least one code cell -- the
    behaviour of ``encode`` / ``cos_sim`` on an empty corpus is not handled.
    """
    temperature = 20  # sharpening factor applied to the centred cosine scores
    pred_index = []  # index labels of the markdown rows, in prediction order
    preds = []
    for nb_id, df_tmp in tqdm(df.groupby('id')):
        df_tmp_mark = df_tmp[df_tmp['cell_type'] == 'markdown']
        if df_tmp_mark.empty:
            continue  # nothing to place in this notebook
        df_tmp_code = df_tmp[df_tmp['cell_type'] == 'code']

        queries = df_tmp_mark['source'].tolist()
        corpus = df_tmp_code['source'].tolist()
        rank_code_values = df_tmp_code['rank'].rank().values

        corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
        # All markdown cells of the notebook are encoded in one call instead
        # of one call per cell; results are expected to differ from per-cell
        # encoding only by floating-point noise (TODO confirm on the
        # validation set).
        query_embeddings = embedder.encode(queries, convert_to_tensor=True)

        # One row per markdown cell, one column per code cell.
        cos_scores = util.cos_sim(query_embeddings, corpus_embeddings)
        cos_scores = cos_scores.cpu().numpy().astype(np.float64)

        # Softmax over the code cells. The weighted sum does not depend on the
        # order of the code cells, so no top-k sort is needed.
        centered = cos_scores - cos_scores.mean(axis=1, keepdims=True)
        weights = np.exp(centered * temperature)
        weights /= weights.sum(axis=1, keepdims=True)

        preds.extend((weights @ rank_code_values).tolist())
        pred_index.extend(df_tmp_mark.index)

    df['pred'] = df.groupby(['id', 'cell_type'])['rank'].rank(pct=False)
    # Change rank of markdown cells. Assigning by index label keeps each
    # prediction on its own row even if df is not sorted by notebook id.
    if pred_index:
        df.loc[pred_index, 'pred'] = preds

    # Calculate kendall-tau score
    y_dummy = df.sort_values('pred').groupby('id')['cell_id'].apply(list)
    score = kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

    print("Score: ", score)
    return score

validate(embedder, df)

  0%|          | 0/6895 [00:00<?, ?it/s]

Score:  0.8520746279466271


In [23]:
def validate(embedder, df, mode=1):
    """Score top-1 markdown placement with Kendall tau.

    NOTE: this redefines ``validate`` from the previous cell (the
    softmax-weighted variant); once this cell has run, only this top-1
    variant is reachable under that name.

    For each notebook (rows sharing an ``id``) every markdown cell is matched
    to its most similar code cell by cosine similarity of their embeddings.
    Its predicted position is that code cell's rank minus the similarity
    score. Code cells keep their within-notebook rank as prediction. The
    resulting order is scored against the module-level ``df_orders`` with
    ``kendall_tau``.

    Args:
        embedder: Object exposing ``encode(sentences, convert_to_tensor=True)``
            (a SentenceTransformer in this notebook).
        df: DataFrame with columns ``id``, ``cell_id``, ``cell_type``,
            ``source`` and ``rank``. Its ``pred`` column is (re)written in
            place. Predictions are written back by index label, so the index
            is assumed to be unique (it is a fresh RangeIndex in this notebook).
        mode: Unused; kept so existing calls keep working.

    Returns:
        The Kendall tau score, which is also printed.

    NOTE(review): assumes every notebook has at least one code cell -- the
    behaviour of ``encode`` / ``topk`` on an empty corpus is not handled.
    """
    pred_index = []  # index labels of the markdown rows, in prediction order
    preds = []
    for nb_id, df_tmp in tqdm(df.groupby('id')):
        df_tmp_mark = df_tmp[df_tmp['cell_type'] == 'markdown']
        if df_tmp_mark.empty:
            continue  # nothing to place in this notebook
        df_tmp_code = df_tmp[df_tmp['cell_type'] == 'code']

        queries = df_tmp_mark['source'].tolist()
        corpus = df_tmp_code['source'].tolist()
        rank_code_values = df_tmp_code['rank'].rank().values

        corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
        # All markdown cells of the notebook are encoded in one call instead
        # of one call per cell; results are expected to differ from per-cell
        # encoding only by floating-point noise (TODO confirm on the
        # validation set).
        query_embeddings = embedder.encode(queries, convert_to_tensor=True)

        # One row per markdown cell, one column per code cell.
        cos_scores = util.cos_sim(query_embeddings, corpus_embeddings)
        best_scores, best_ids = torch.topk(cos_scores, k=1, dim=1)
        best_scores = best_scores[:, 0].cpu().numpy().astype(np.float64)
        best_ids = best_ids[:, 0].cpu().numpy()

        # Subtracting the similarity shifts the markdown cell's prediction
        # below the rank of its best-matching code cell.
        preds.extend((rank_code_values[best_ids] - best_scores).tolist())
        pred_index.extend(df_tmp_mark.index)

    df['pred'] = df.groupby(['id', 'cell_type'])['rank'].rank(pct=False)
    # Change rank of markdown cells. Assigning by index label keeps each
    # prediction on its own row even if df is not sorted by notebook id.
    if pred_index:
        df.loc[pred_index, 'pred'] = preds

    # Calculate kendall-tau score
    y_dummy = df.sort_values('pred').groupby('id')['cell_id'].apply(list)
    tau = kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

    print("Score: ", tau)
    return tau

validate(embedder, df)

  0%|          | 0/6895 [00:00<?, ?it/s]

Score:  0.8550871468979113


In [24]:
# for i in range(1, 4):
#     validate(model, df, dataloader, mode=i)

# For submission

In [25]:
# def export_submit(embedder, df, mode=1):
#     # mark_cellid_rank_dict = {}
#     preds = []
#     for id, df_tmp in tqdm(df.groupby('id')):

#         df_tmp_mark = df_tmp[df_tmp['cell_type'] == 'markdown']
#         cellid_mark_values = df_tmp_mark['cell_id'].values
#         queries = df_tmp_mark['source'].values

#         df_tmp_code = df_tmp[df_tmp['cell_type'] == 'code']
#         cellid_code_values = df_tmp_code['cell_id'].values
#         corpus = df_tmp_code['source'].values
#         rank_code_values = df_tmp_code['rank'].rank().values
    
#         corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True, show_progress_bar=False)

#         for query in queries:
#             query_embedding = embedder.encode(query, convert_to_tensor=True, show_progress_bar=False)

#             cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
#             best_match = torch.topk(cos_scores, k=len(corpus))

#             pred_tmp = best_match[0].cpu().tolist()
#             ids = best_match[1].cpu().tolist()

#             soft = np.exp((pred_tmp-np.mean(pred_tmp)) *20)/np.sum(np.exp((pred_tmp-np.mean(pred_tmp)) *20))
#             rank = np.sum(soft * rank_code_values[ids])
            
#             # score, idx = best_match[0].cpu().item(), best_match[1].cpu().item()
#             # rank = rank_code_values[idx] - score
#             preds.append(rank)

#     df['pred'] = df.groupby(['id', 'cell_type'])['rank'].rank(pct=False)
#     # Change rank of markdown cell
#     df.loc[df['cell_type'] == 'markdown', 'pred'] = preds

#     # Sbumit df
#     sub_df = df.sort_values('pred').groupby('id')['cell_id'].apply(lambda x: " ".join(x)).reset_index()
#     sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
    
#     return sub_df

In [26]:
# submit_df = export_submit(embedder, df)

In [27]:
# submit_df.to_csv("submission.csv", index=False)