In [56]:
TESTING_SUBSET = True  # ***** FOR TESTING ONLY !!!!!!!! ******

In [57]:
import sys
import os

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:

    # Enable line-wrapping
    # from IPython.display import HTML, display

    # def set_css():
    #     display(HTML('''
    #         <style>
    #         pre {
    #             white-space: pre-wrap;
    #             }
    #         </style>
    #     '''))
    # get_ipython().events.register('pre_run_cell', set_css)


    # mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    # # Install python libraries in google drive once and then we can load it 
    # # from our area every time, we don't have do the reinstall every time
    # LIBRARIES_PATH = '/content/drive/MyDrive/ML/lecr/code/libraries/'

    # INSTALL_LIBRARIES = False # Set to True only once during first install
    # if INSTALL_LIBRARIES:
    #     %pip install --target=$LIBRARIES_PATH transformers
    #     %pip install --target=$LIBRARIES_PATH datasets
    #     %pip install --target=$LIBRARIES_PATH pynvml

    # sys.path.append(LIBRARIES_PATH)
    %pip install transformers
    %pip install datasets
    %pip install pynvml

In [58]:
if IN_COLAB:
    # Enable line-wrapping
    from IPython.display import HTML, display

    def set_css():
        display(HTML('''
            <style>
            pre {
                white-space: pre-wrap;
                }
            </style>
        '''))
    get_ipython().events.register('pre_run_cell', set_css)


    # mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

In [59]:
import datetime
import numpy as np
from sklearn.metrics import fbeta_score
from matplotlib import pyplot as plt
import math
import pandas as pd
import json
import time
import torch
from torch import nn
import random
#from tqdm.auto import tqdm
from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import Dataset
import torchsummary
from functools import partialmethod
from itertools import product

from transformers import AutoTokenizer, AutoModel
from dataclasses import dataclass
from transformers import PreTrainedTokenizer
from torch.utils.data import DataLoader

if IN_COLAB:
  sys.path.append('/content/drive/MyDrive/ML/lecr/code/')
from l_utils import *
from model_utils import *

#from model_defs import Topic_NN, Content_NN

%env CUBLAS_WORKSPACE_CONFIG=:16:8

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

env: CUBLAS_WORKSPACE_CONFIG=:16:8
cuda


In [60]:
if IN_COLAB:
    class CFG:
        # MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        # MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        DIRNAME_PROCESSED_DATA = '/content/drive/MyDrive/ML/lecr/data/'
        FILENAME_DATA_CONTENT_F = DIRNAME_PROCESSED_DATA + "content.f"
        FILENAME_DATA_TOPICS_F = DIRNAME_PROCESSED_DATA + "topics.f"
        FILENAME_DATA_CORRELATIONS_CSV = DIRNAME_PROCESSED_DATA + "correlations.csv"
        NUM_FOLDS = 5
        SAVE_MODEL_DIR = "/content/drive/MyDrive/ML/lecr/models/"
else:
    from cfg import *
CFG.MODEL = "sentence-transformers/distiluse-base-multilingual-cased-v2"
topics_tokenized_pq_file = CFG.DIRNAME_PROCESSED_DATA + "topics_tokenized.pq"
contents_tokenized_pq_file = CFG.DIRNAME_PROCESSED_DATA + "contents_tokenized.pq"


In [61]:
!nvidia-smi

Wed Mar 29 17:54:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   34C    P8    18W / 275W |   4287MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [62]:
# Split data into folds

# Feature engineering ... use same as the one used for bce-lightning

# Training for a Top-k metric ... see if this performs better than directly using minilm embeddings

In [63]:
def progress_bar_control(show_progress_bars = True):
    tqdm.__init__ = partialmethod(tqdm.__init__, disable=(not show_progress_bars))

In [64]:
contents_df = pd.read_feather(CFG.FILENAME_DATA_CONTENT_F)
topics_df   = pd.read_feather(CFG.FILENAME_DATA_TOPICS_F)
correlations_df =  pd.read_csv(CFG.FILENAME_DATA_CORRELATIONS_CSV)

In [65]:
contents_df.info()
topics_df.info()
correlations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154047 entries, 0 to 154046
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                154047 non-null  object
 1   title             154038 non-null  object
 2   description       89456 non-null   object
 3   kind              154047 non-null  object
 4   text              74035 non-null   object
 5   language          154047 non-null  object
 6   copyright_holder  71821 non-null   object
 7   license           74035 non-null   object
dtypes: object(8)
memory usage: 9.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76972 entries, 0 to 76971
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           76972 non-null  object
 1   title        76970 non-null  object
 2   description  34953 non-null  object
 3   channel      76972 non-null  object
 4   category     76972 no

In [66]:
topics_tokenized_df   = pd.read_parquet(topics_tokenized_pq_file)
contents_tokenized_df = pd.read_parquet(contents_tokenized_pq_file)

In [67]:
contents_tokenized_df.info()
topics_tokenized_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154047 entries, 0 to 154046
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  154047 non-null  object
 1   c_0_input_ids       154047 non-null  object
 2   c_1_attention_mask  154047 non-null  object
dtypes: object(3)
memory usage: 3.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61517 entries, 0 to 61516
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  61517 non-null  object
 1   t_0_input_ids       61517 non-null  object
 2   t_1_attention_mask  61517 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [68]:
# train_df_orig, val_df_orig = create_train_val_split_pos_corr(topics_df, contents_df, correlations_df, CFG.NUM_FOLDS, min_train_perc = 80.0, use_topic_trees = True, random_seed = 3) # pick a random seed that gives a good split
train_df_orig, val_df_orig = create_train_val_split_pos_corr_from_kaggle(topics_df, correlations_df, num_splits = 4, val_fold = 0)

177290
102629
2023-03-29 17:54:09.149249 : Splitting non-source into 4 folds
fold
0.0    25667
1.0    25677
2.0    25663
3.0    25622
dtype: int64
fold
0.0    6070
1.0    4971
2.0    8493
3.0    5469
dtype: int64


In [69]:
# ***** FOR TESTING ONLY !!!!!!!! ******
if TESTING_SUBSET:
    train_df = train_df_orig[:1000]
    val_df = val_df_orig[:1000]
else:
    train_df = train_df_orig
    val_df = val_df_orig

In [70]:
train_df.info()
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   t_id    1000 non-null   object
 1   c_id    1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   t_id    1000 non-null   object
 1   c_id    1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [71]:
#train_df.loc[random.sample(range(0,len(train_df)),10)]
train_df.loc[train_df['t_id']=='t_f2872cd7562c']

Unnamed: 0,t_id,c_id


In [72]:
PARTITION_BATCH_SIZE = 350

In [73]:
# train_tokenized_df = get_partitioned_data_for_pos_corr(train_df, topics_tokenized_df, contents_tokenized_df, batch_size = PARTITION_BATCH_SIZE, do_partitioning = False, sort = False)
train_tokenized_df, output_classes = get_labeled_data_for_pos_corr(train_df.copy(), topics_tokenized_df, contents_tokenized_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   t_id    219 non-null    object
 1   label   219 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   c_id    1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
2023-03-29 17:54:09.352856 : Merging in topic and content data into the folds data ... 


In [74]:
train_tokenized_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1219 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   label           1219 non-null   int64 
 1   input_ids       1219 non-null   object
 2   attention_mask  1219 non-null   object
dtypes: int64(1), object(2)
memory usage: 38.1+ KB


In [75]:
val_topics_tokenized_df, val_contents_tokenized_df, val_corr_df = get_pos_corr_subsets_for_binary_fold( val_df, contents_tokenized_df, topics_tokenized_df)

In [76]:
val_topics_tokenized_df.info()
val_contents_tokenized_df.info()
val_corr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   index               239 non-null    int64 
 1   id                  239 non-null    object
 2   t_0_input_ids       239 non-null    object
 3   t_1_attention_mask  239 non-null    object
dtypes: int64(1), object(3)
memory usage: 7.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   index               984 non-null    int64 
 1   id                  984 non-null    object
 2   c_0_input_ids       984 non-null    object
 3   c_1_attention_mask  984 non-null    object
dtypes: int64(1), object(3)
memory usage: 30.9+ KB
<class 'pandas.core.series.Series'>
Int64Index: 239 entries, 0 to 238
Series name: c_idx
Non-Null Count  Dtype 
---

In [77]:
class TransModel(torch.nn.Module):
    """
    Wrapper. Gets tokenized features and returns the output logits for whether the token and content are matching.
    """
    def __init__(self, trans_model_name):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(trans_model_name)

    @staticmethod
    def mean_pooling(token_embeddings, attention_mask):
        """
        Average the output embeddings using the attention mask 
        to ignore certain tokens.
        """
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def get_transformer_output(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids = input_ids, attention_mask=attention_mask)
        pooled_embeddings=self.mean_pooling(outputs[0], attention_mask)
        return pooled_embeddings

    def forward(self, features):
        # Grab embeddings from transformer
        embs = self.get_transformer_output(
            features[:,0,:],
            features[:,1,:]
        )

        return embs


In [78]:
class CustomDS(torch.utils.data.Dataset):
    def __init__(self, train_tokenized_df):
        feature_cols = [col for col in train_tokenized_df.columns if 'label' not in col]
        self.features = torch.tensor(train_tokenized_df[feature_cols].values.tolist())
        self.label   = torch.tensor(train_tokenized_df[['label']].values.tolist())
            
    def __len__(self):
        return self.features.size(dim=0)
    
    def __getitem__(self, item):
        
        features = self.features[item]
        label    = self.label[item]


        return features, label

In [79]:

train_ds = CustomDS(train_tokenized_df)


In [80]:
train_ds.features.size()

torch.Size([1219, 2, 64])

In [81]:
t_cols = [col for col in val_topics_tokenized_df.columns if col.startswith('t_')]
c_cols = [col for col in val_contents_tokenized_df.columns if col.startswith('c_')]
val_data = (torch.tensor(val_topics_tokenized_df[t_cols].values.tolist()),
            torch.tensor(val_contents_tokenized_df[c_cols].values.tolist())
            ,val_corr_df
            )

In [82]:
try: trans_model
except NameError: trans_model = None
if trans_model is not None:
  del trans_model
report_gpu()
if TESTING_SUBSET:
    # 20 works good for training for TESTING_SUBSET
    BATCH_SIZE = 20
else:
    # BATCH_SIZE = 1100 # Works during first round of training, but if we do a reload of saved model, this number causes out of memory at 2nd epoch
    BATCH_SIZE = 1050 # 
    # 1875 works for arccos, 1950 works but runs out of memory after few epochs
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers = 2)

GPU:0
process     700802 uses     4255.000 MB GPU memory


In [86]:
# Add option to load pre-existing model
LOAD_CHECKPOINT_MODEL = False
SAVE_BEST_MODEL = True
DRY_RUN = False
TRAIN_PATIENCE = 5
k_vals = [5,10, 50]
METRIC = 'recall'
METRIC_KEY = 10
MARGIN = 0.4
GAMMA = 2.0
MODEL_FILENAME_PREFIX = f'one_trans_distiluse_ArcFace_{MARGIN}_focal_loss_{GAMMA}_fp16_ml64_train_shuffled_parent_title_new_sep_unique_id_batch_size_{BATCH_SIZE}'
SAVE_MODEL_DIR = CFG.SAVE_MODEL_DIR
# TRANS_OUTPUT_EMBS = 768
trans_model = TransModel( trans_model_name = CFG.MODEL)
focal_loss = torch.hub.load(
	'adeelh/pytorch-multi-class-focal-loss',
	model='focal_loss',
	# alpha=[.75, .25],
	gamma=GAMMA,
	reduction='mean',
	device=device,
	dtype=torch.float32,
	force_reload=False,
    trust_repo=True
)
TRANS_OUTPUT_EMBS = trans_model.transformer.pooler.dense.out_features
criterion1 = ArcosLossWithWeights(TRANS_OUTPUT_EMBS, output_classes, loss_function = focal_loss, margin=MARGIN)
optimizer = torch.optim.Adam((list(trans_model.parameters())) + (list(criterion1.parameters())),
                              lr=2e-4,
                             #weight_decay = 1e-4 # L2 regularization, 1e-4 is a good value
                             )
# criterion0 = MultipleNegativesRankingLoss()
model_dict = {'trans_model': trans_model, 'optimizer': optimizer, 'criterion':criterion1}
if LOAD_CHECKPOINT_MODEL:
    # MODEL_FILENAME = SAVE_MODEL_DIR + 'one_trans_MiniLM_train_shuffled_parent_title_epoch_5_recall_50_0.904.pth'
    # t_model, _, _ = load_trans_model(t_model, criterion = None, optimizer = None, filename = MODEL_FILENAME)
    # train_scores = []
    # val_scores = []
    # epoch_count = 0
    # MODEL_FILENAME = None

    MODEL_FILENAME = SAVE_MODEL_DIR + 'one_trans_distiluse_MNRL_pretrained_ArcFace_0.4_focal_loss_2.0_fp16_ml64_train_shuffled_parent_title_new_sep_unique_id_batch_size_1050_epoch_35_recall_10_0.778.pth'
    load_model( model_dict, filename = MODEL_FILENAME)
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()

    # criterion1.margin = 0.6


    # TRAIN_DATA_FILENAME = SAVE_MODEL_DIR + f'{MODEL_FILENAME_PREFIX}.train.txt'
    # TRAIN_DATA_FILENAME = SAVE_MODEL_DIR + 'one_trans_mpnet_MNRL_pretrained_ArcFace_0.4_focal_loss_2.0_fp16_ml64_train_shuffled_parent_title_new_sep_batch_size_1100.train.txt'
    # train_data_dict = load_dict(TRAIN_DATA_FILENAME)
    # train_scores, val_scores = train_data_dict['train_scores'], train_data_dict['val_scores']
    # epoch_count = len(val_scores)

    train_scores = []
    val_scores = []
    epoch_count = 0
    MODEL_FILENAME = None
    
else:
    train_scores = []
    val_scores = []
    epoch_count = 0
    MODEL_FILENAME = None

trans_model = trans_model.to(device)
criterion1 = criterion1.to(device)


Using cache found in /home/mahesh/.cache/torch/hub/adeelh_pytorch-multi-class-focal-loss_master


In [87]:
NUM_EPOCHS = 30

DISABLE_TQDM = False
if (DISABLE_TQDM):
    tqdm.__init__ = partialmethod(tqdm.__init__, disable=DISABLE_TQDM)

scaler = torch.cuda.amp.GradScaler()

pt('Starting Training')
trans_model.train()
# c_model.train()
#for epoch in tqdm(range(EPOCHS), desc="Epochs"):
for epoch in range(NUM_EPOCHS):
    # We set the below in case it is frozen while doing eval
    pt(f"***** Epoch {epoch_count} *****")
    train_loss = 0
    #train_acc = 0
    #train_f2_score = 0
    for (features, label) in tqdm(train_loader, desc="Training"):
        features = features.to(device)
        label    = label.to(device).squeeze(1)
    
        optimizer.zero_grad()

        with torch.autocast(device_type='cuda', dtype=torch.float16):
            t_emb = trans_model(features)
            loss1 = criterion1(t_emb, label)

        # loss.backward()
        # optimizer.step()
        scaler.scale(loss1).backward()
        scaler.step(optimizer)
        scaler.update()
       
        train_loss += loss1.item()
    train_loss = round(train_loss/len(train_loader), 5)
    pt(f'Train : | Loss: {train_loss} ')
    train_scores.append({'loss':train_loss})
    recalls  = compute_recall_at_k_for_two_trans(trans_model, trans_model, device, val_data[0], val_data[1], val_data[2], k_vals = k_vals, batch_size = BATCH_SIZE)
    recall_str = '\nValidation:  ' + ''.join([f'| Recall@{k}: {recalls[k]:.3f}' for k in k_vals])
    pt(recall_str)
    val_scores.append({'recall':recalls})
    if SAVE_BEST_MODEL:
        MODEL_FILENAME = save_best_model( 
                                        model_dict = model_dict,
                                        old_model_filename = MODEL_FILENAME,
                                        model_filename_prefix = MODEL_FILENAME_PREFIX,
                                        model_path = SAVE_MODEL_DIR,
                                        save_model_fn = save_model,
                                        train_scores = train_scores,
                                        val_scores = val_scores, 
                                        metric = METRIC, metric_key = METRIC_KEY,
                                        dry_run = DRY_RUN
                                        )
    save_model(model_dict, SAVE_MODEL_DIR + f"{MODEL_FILENAME_PREFIX}.last_model.pth")
    epoch_count += 1
    save_dict({'train_scores':train_scores, 'val_scores':val_scores}, SAVE_MODEL_DIR + f'{MODEL_FILENAME_PREFIX}.train.txt')
    if (TRAIN_PATIENCE >=0  and (not check_improvement(val_scores, metric = METRIC, metric_key = METRIC_KEY, patience = TRAIN_PATIENCE))):
        pt(f'Exiting training since no improvement in the last {TRAIN_PATIENCE} epochs')
        break


2023-03-29 18:01:26.651772 : Starting Training
2023-03-29 18:01:26.652690 : ***** Epoch 0 *****


Training:   0% 0/61 [00:00<?, ?it/s]

Training:   0% 0/61 [00:00<?, ?it/s]




RuntimeError: mat1 and mat2 shapes cannot be multiplied (20x768 and 384x219)

In [85]:
if IN_COLAB:
    from google.colab import runtime
    runtime.unassign()