In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/classification-of-math-problems-by-kasut-academy/sample_submission.csv
/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv
/kaggle/input/classification-of-math-problems-by-kasut-academy/test.csv


In [4]:
# Load the labelled training split of the competition data into a DataFrame.
train_csv_path = "/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv"
train_df = pd.read_csv(train_csv_path)

0        3
1        5
2        0
3        1
4        5
        ..
10184    0
10185    1
10186    5
10187    1
10188    5
Name: label, Length: 10189, dtype: int64

In [None]:
!pip install -q transformers accelerate bitsandbytes

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
from pathlib import Path
from collections import defaultdict
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from torchmetrics.classification import Recall, F1Score
import os
from typing import cast
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import bitsandbytes as bnb


class MathDataset():
    """Load the competition CSVs, build a stratified train/val split and prepare the tokenizer.

    Args:
        model_name: Hugging Face model id whose tokenizer is loaded.
        val_part: fraction of ``train.csv`` held out for validation.
        random_state: seed passed to ``train_test_split`` so that every
            instantiation produces the same split (and the same class weights).
            Pass ``None`` to get a fresh random split each time.
        data_dir: directory containing ``train.csv`` and ``test.csv``.

    Attributes:
        model_name: the ``model_name`` argument.
        tokenizer: tokenizer with the EOS token reused as pad token and right-side padding.
        data: dict mapping ``'train'`` / ``'val'`` to ``(questions, labels)`` NumPy
            arrays and ``'test'`` to a NumPy array of questions.
        class_weights: float tensor of ``'balanced'`` class weights computed from
            the training labels only.
    """
    def __init__(self, model_name = 'deepseek-ai/deepseek-math-7b-base', val_part: float = 0.1,
                 random_state = 42,
                 data_dir = '/kaggle/input/classification-of-math-problems-by-kasut-academy'):
        self.model_name = model_name

        data_dir = Path(data_dir)

        math_df_trainval = pd.read_csv(data_dir / 'train.csv')
        # Stratify on the label so both splits keep the class proportions;
        # a fixed random_state keeps the split stable across repeated constructions.
        train_math_df, val_math_df = train_test_split(
            math_df_trainval,
            test_size=val_part,
            stratify=math_df_trainval['label'],
            random_state=random_state)

        train_data = (train_math_df['Question'].to_numpy(), train_math_df['label'].to_numpy())
        val_data = (val_math_df['Question'].to_numpy(), val_math_df['label'].to_numpy())

        math_df_test = pd.read_csv(data_dir / 'test.csv')
        test_data = math_df_test['Question'].to_numpy()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # The EOS token is reused as the padding token, and padding is appended on the right.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

        self.data = {'train': train_data, 'val': val_data, 'test': test_data}

        # 'balanced' weights are derived from the training labels only.
        train_labels = train_data[1]
        class_weights = compute_class_weight(class_weight = 'balanced', classes=np.unique(train_labels), y=train_labels)
        self.class_weights = torch.tensor(class_weights, dtype=torch.float)


class MathDataview(Dataset):
    """Map-style torch dataset exposing a single split of a ``MathDataset``-like object.

    Args:
        split: key into ``dataset.data``. For ``'test'`` the entry is an array of
            texts only; for any other split it is a ``(texts, labels)`` pair.
        dataset: object providing a ``tokenizer`` attribute and a ``data`` dict.
        max_length: every sequence is padded / truncated to exactly this many tokens.
    """
    def __init__(self, split, dataset, max_length: int = 300):
        super().__init__()
        self.tokenizer = dataset.tokenizer
        self.max_length = max_length
        if split != 'test':
            self.texts, self.labels = dataset.data[split]
        else:
            self.texts = dataset.data[split]
            self.labels = None

    def __len__(self):
        """Return the number of texts in this split."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize one text and return ``(input_ids, attention_mask, label)``.

        For the unlabeled test split the label is a placeholder ``0``.
        """
        sequence = self.texts[index]
        label = self.labels[index] if self.labels is not None else 0
        label = torch.tensor(label, dtype=torch.long)

        tokenized = self.tokenizer(
            sequence,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length)

        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        tokenized_input = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return tokenized_input, attention_mask, label


class MathDataModule(pl.LightningDataModule):
    """Lightning data module serving the train / val / test views of ``MathDataset``.

    Args:
        batch_szie: batch size used by all three dataloaders. The misspelled
            parameter name is kept so existing keyword callers keep working.

    Attributes:
        class_weights: class weights taken from the underlying ``MathDataset``
            (``None`` until ``setup`` has run).
        trainview, valview, testview: ``MathDataview`` objects for each split
            (``None`` until ``setup`` has run).
    """
    def __init__(self, batch_szie: int = 128):
        super().__init__()
        self.batch_size = batch_szie

        self.num_loader_workers = min(8, torch.get_num_threads())

        # Populated by setup(); None marks "not built yet".
        self.class_weights = None
        self.trainview = None
        self.valview = None
        self.testview = None

    def setup(self, stage: str = None):
        """Build the dataset views once.

        ``setup`` may be invoked several times (e.g. manually and again by the
        trainer for each stage). Rebuilding would reload the tokenizer, re-read
        the CSVs and draw a new train/val split, so later calls are no-ops.
        """
        if self.trainview is not None:
            return

        math_data = MathDataset()

        self.class_weights = math_data.class_weights

        self.trainview = MathDataview('train', math_data)
        self.valview = MathDataview('val', math_data)

        self.testview = MathDataview('test', math_data)

    # Parallel loading (num_workers) is disabled since it messes up the tokenizer parallelism.
    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return DataLoader(self.trainview,
                          batch_size = self.batch_size,
                          shuffle = True)

    def val_dataloader(self):
        """Return the validation dataloader (original order)."""
        return DataLoader(self.valview,
                          batch_size = self.batch_size,
                          shuffle = False)

    def test_dataloader(self):
        """Return the test dataloader (original order, needed for the submission file)."""
        return DataLoader(self.testview,
                          batch_size = self.batch_size,
                          shuffle = False)


class MathFTModule(pl.LightningModule):
    """Frozen 4-bit quantized backbone with a trainable MLP classification head.

    Args:
        class_weights: per-class weights passed to the cross-entropy loss.
        submission_size: number of test rows; sizes the initial ``submission_np`` buffer.
        model: Hugging Face model id of the backbone.
        dropout_rate: dropout probability applied to the pooled embedding.
        batch_size: kept for backward compatibility and stored on the module;
            test predictions no longer depend on it.
        num_classes: number of output classes of the head and of the F1 metric.

    Attributes:
        submission_np: integer array of test predictions; filled at the end of
            the test epoch.
    """
    def __init__(self, class_weights: torch.Tensor, submission_size, model = 'deepseek-ai/deepseek-math-7b-base', dropout_rate: float = 0.1, batch_size = 128, num_classes: int = 8):
        super().__init__()

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        self.backbone = AutoModel.from_pretrained(model,
                        device_map="auto",
                        quantization_config=bnb_config)

        self.backbone.config.use_cache = False
        self.backbone.config.pretraining_tp = 1

        # Freeze the backbone: only the classifier head below is trained.
        for param in self.backbone.parameters():
            param.requires_grad = False

        # Read the embedding width from the backbone config instead of hard-coding it.
        hidden_size = self.backbone.config.hidden_size
        self.classifier = nn.Sequential(
                    nn.Dropout(p=dropout_rate),
                    nn.Linear(hidden_size, 1024),
                    nn.ReLU(),
                    nn.BatchNorm1d(num_features = 1024),
                    nn.Linear(1024, num_classes))

        self.criterion = nn.CrossEntropyLoss(weight = class_weights)
        self.val_f1 = F1Score(task='multiclass', num_classes=num_classes)
        # Integer dtype so labels are written to the CSV as ints (e.g. "3", not "3.0").
        self.submission_np = np.zeros(submission_size, dtype=np.int64)
        self.batch_size = batch_size
        # Per-batch test predictions, collected in loader order.
        self._test_preds = []

    def forward(self, ids, mask):
        """Return class logits for a batch of token ids and their attention mask."""
        # The backbone is frozen, so no autograd graph is needed for it.
        with torch.no_grad():
            outputs = self.backbone(input_ids = ids, attention_mask = mask)
        hidden = outputs.last_hidden_state.to(torch.float32)

        # Masked mean pooling: average only over real tokens, not over padding.
        weights = mask.unsqueeze(-1).to(device=hidden.device, dtype=hidden.dtype)
        summed = (hidden * weights).sum(dim=1)
        counts = weights.sum(dim=1).clamp(min=1.0)  # guard against an all-padding row
        embeddings = summed / counts

        logits = self.classifier(embeddings)

        return logits

    def training_step(self, batch, batch_idx):
        """Compute and log the weighted cross-entropy loss for one training batch."""
        tokenized_input, attention_mask, targets = batch

        outputs = self(tokenized_input, attention_mask)
        loss = self.criterion(outputs, targets)

        self.log('train/loss', loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and accumulate predictions into the F1 metric."""
        tokenized_input, attention_mask, targets = batch

        outputs = self(tokenized_input, attention_mask)
        preds = outputs.argmax(dim = -1)

        loss = self.criterion(outputs, targets)
        self.val_f1.update(preds, targets)

        self.log('val/loss', loss)

        return loss

    def on_validation_epoch_end(self):
        """Report the F1 of the finished validation epoch and clear the metric state."""
        val_f1 = self.val_f1.compute()
        print(f"Cur f1: {val_f1}")
        self.log('val/f1', val_f1)
        # compute() was called manually, so the state must be reset manually;
        # otherwise the score accumulates over all previous epochs.
        self.val_f1.reset()

    def on_test_epoch_start(self):
        """Discard predictions left over from a previous test run."""
        self._test_preds = []

    def test_step(self, batch, batch_idx):
        """Predict labels for one test batch and store them in loader order."""
        tokenized_input, attention_mask, _ = batch

        outputs = self(tokenized_input, attention_mask)
        preds = outputs.argmax(dim = -1)

        # Move to CPU before converting: a CUDA tensor cannot be written into a NumPy array.
        self._test_preds.append(preds.detach().cpu().numpy())

    def on_test_epoch_end(self):
        """Write the collected test predictions to ``submission_test.csv``."""
        if self._test_preds:
            self.submission_np = np.concatenate(self._test_preds).astype(np.int64)

        submission = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/sample_submission.csv')
        submission_test = submission.copy()

        submission_test['label'] = self.submission_np
        submission_test.to_csv('submission_test.csv', index=False)

    def configure_optimizers(self):
        """Return an AdamW optimizer over the trainable (non-frozen) parameters only."""
        trainable_params = [p for p in self.parameters() if p.requires_grad]
        return torch.optim.AdamW(trainable_params, lr=1e-4, weight_decay=1e-6)
        

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
import datetime

# Seed Python, NumPy and torch so the random train/val split and the
# classifier initialisation are reproducible across runs.
SEED = 42
pl.seed_everything(SEED, workers=True)

# One batch size shared by the dataloaders and the model.
BATCH_SIZE = 128

data_module = MathDataModule(BATCH_SIZE)
# Set up explicitly: the class weights and the test-set size are needed to build the model.
data_module.setup("fit")

deepseek_module = MathFTModule(
    data_module.class_weights,
    len(data_module.testview.texts),
    batch_size=BATCH_SIZE)

# Timestamped version name so each run gets its own TensorBoard directory.
art_dir_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logger = TensorBoardLogger(
    save_dir = os.getcwd(),
    version=art_dir_name,
    name="math_logs")

trainer = pl.Trainer(max_epochs=15, logger=logger, devices=1, accelerator="gpu")
trainer.fit(deepseek_module, datamodule=data_module)

In [None]:
# Run the test loop on the trained module using the data module's test dataloader.
trainer.test(model=deepseek_module, datamodule=data_module)