In [None]:
# Environment setup. The triple-quoted string below is a disabled block: the
# wandb / torchmetrics install commands inside it are NOT executed. Only the
# GPUtil install on the last line runs (GPUtil provides the `showUtilization`
# helper imported in the next cell).
'''
!pip install --upgrade wandb
!pip install torchmetrics
'''
!pip install GPUtil

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import gc
import os
from tqdm import tqdm
from datasets import  Dataset as DS, DatasetDict,load_from_disk
from sklearn.model_selection import KFold, GroupKFold
from torch.utils.data import Dataset, DataLoader 
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AdamW, DataCollatorWithPadding
from GPUtil import showUtilization as gpu_usage
from torchmetrics import F1Score
from IPython.display import FileLink

import warnings
warnings.filterwarnings('ignore')

In [None]:
def init_config():
    """Assemble the notebook-wide settings dictionary.

    The dictionary is filled in three groups (data locations, runtime,
    training hyper-parameters) and then extended with two live objects:
    the tokenizer loaded for ``model_name`` and an ``F1Score`` metric built
    for ``num_classes``.

    Returns:
        dict: the settings, including the ``'tokenizer'`` and ``'metric'``
        entries.
    """
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    settings = {}

    # Where the raw competition data lives and where processed data is written.
    settings.update({
        'train_directory': "../input/feedback-prize-effectiveness/train",
        'test_directory': "../input/feedback-prize-effectiveness/test",
        'path_to_processed_datasets': 'dataset_dict',
    })

    # Reproducibility, hardware and backbone selection.
    settings.update({
        'seed': 42,
        'device': torch.device(device_name),
        'model_name': 'microsoft/deberta-v3-base',
    })

    # Training hyper-parameters.
    settings.update({
        'epoches': 1,
        'folds': 5,
        'lr': 1e-5,
        'batch_size': 12,
        'num_classes': 3,
        'max_length': 512,
        'max_iters_for_scheduler': 500,
        'min_lr': 1e-6,
        'optimizer_weight_decay': 1e-6,
    })

    # Objects derived from the plain settings above.
    settings['tokenizer'] = AutoTokenizer.from_pretrained(settings['model_name'])
    settings['metric'] = F1Score(num_classes=settings['num_classes'])
    return settings

# Module-level configuration read by the classes and cells below.
CONFIG = init_config()

In [None]:
def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    switches cuDNN to deterministic mode with auto-tuning disabled, and
    exports ``PYTHONHASHSEED``.

    Args:
        seed (int): Seed value applied to every generator. Defaults to 42.
    """
    # Local import: `random` is not imported at the top of the notebook.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all GPUs, not only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Must be an assignment: the benchmark auto-tuner may pick different
    # kernels between runs, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
# Apply the configured seed before any data processing or model creation.
set_seed(CONFIG['seed'])

In [None]:
class DatasetForPredictionQualityOfArgumentation(Dataset):
    """Builds, saves and reloads the tokenized train/test datasets.

    For each split the class reads ``<directory>.csv``, attaches the essay text
    found in ``<directory>/<essay_id>.txt``, tokenizes
    ``discourse_type + [SEP] + discourse_text`` and stores both splits as a
    ``DatasetDict`` on disk.

    Args:
        path_to_train_data: Directory holding the training essays; the matching
            CSV is read from ``path_to_train_data + ".csv"``.
        path_to_test_data: Same as above for the test split.
        CONFIG: Dict providing ``tokenizer``, ``max_length``, ``folds`` and
            ``path_to_processed_datasets``.
        is_train: Selects which processed split backs ``__getitem__`` and
            ``__len__`` after the datasets have been processed or loaded.
    """

    def __init__(self, path_to_train_data, path_to_test_data, CONFIG, is_train=True):
        # Keep the config that was passed in so methods do not depend on a global.
        self.config = CONFIG
        self.tokenizer = CONFIG['tokenizer']
        self.max_length = CONFIG['max_length']
        self.path_to_train_directory = path_to_train_data
        self.path_to_test_directory = path_to_test_data
        self.is_train = is_train
        # Filled by process_train_test_datasets() / load_datasets_from_file().
        self.df = None

    def tokenize_text(self, row):
        """Tokenize the ``text`` column of a (batched) row, padded to ``max_length``."""
        return self.tokenizer(row["text"], padding='max_length', truncation=True, add_special_tokens=True, max_length=self.max_length, return_token_type_ids=False)

    def get_essay(self, essay_id):
        """Return the full text of ``<self.path_to_data>/<essay_id>.txt``."""
        essay_path = os.path.join(self.path_to_data, f'{essay_id}.txt')
        # Context manager closes the handle; this is called once per CSV row.
        with open(essay_path, 'r', encoding='utf-8') as essay_file:
            return essay_file.read()

    def process_dataset(self, df, is_train_dataset=True):
        """Turn a raw competition DataFrame into a tokenized ``datasets.Dataset``.

        For the training split this also adds integer ``labels`` (LabelEncoder
        over ``discourse_effectiveness``) and an ``n_fold`` column assigned by
        ``GroupKFold`` grouped on ``essay_id``. The input frame is modified in
        place (new columns are added).

        Args:
            df: DataFrame read from the split's CSV.
            is_train_dataset: Whether ``df`` is the labelled training split.

        Returns:
            datasets.Dataset: tokenized dataset with the raw text columns removed.
        """
        if is_train_dataset:
            self.label_encoder = LabelEncoder()
            df['labels'] = self.label_encoder.fit_transform(df['discourse_effectiveness'])

            # GroupKFold yields positional indices, so collect fold ids in a
            # positional array instead of writing through label-based .loc.
            fold_ids = np.full(len(df), -1, dtype=int)
            gkf = GroupKFold(n_splits=self.config['folds'])
            for fold, (_, val_idx) in enumerate(gkf.split(X=df, groups=df['essay_id'])):
                fold_ids[val_idx] = fold
            df['n_fold'] = fold_ids

        df['essay_text'] = df['essay_id'].apply(self.get_essay)
        df['text'] = df['discourse_type'] + self.tokenizer.sep_token + df['discourse_text']#  + self.tokenizer.sep_token + df['essay_text']

        dataset = DS.from_pandas(df)

        to_remove = ['discourse_text', 'discourse_type', 'discourse_id', 'text', 'essay_id', 'essay_text']
        if is_train_dataset:
            to_remove.append('discourse_effectiveness')

        return dataset.map(self.tokenize_text, batched=True, remove_columns=to_remove)

    def process_train_test_datasets(self):
        """Process both splits, save them to disk and return the save path."""
        self.path_to_processed_datasets = self.config['path_to_processed_datasets']

        # get_essay() reads from self.path_to_data, so point it at each split in turn.
        self.path_to_data = self.path_to_train_directory
        self.train = pd.read_csv(self.path_to_data + ".csv")
        self.train = self.process_dataset(self.train)

        self.path_to_data = self.path_to_test_directory
        self.test = pd.read_csv(self.path_to_data + ".csv")
        self.test = self.process_dataset(self.test, is_train_dataset=False)

        self.df = self.train if self.is_train else self.test

        dataset_dict = DatasetDict({'train': self.train, 'test': self.test})
        dataset_dict.save_to_disk(self.path_to_processed_datasets)

        return self.path_to_processed_datasets

    def load_datasets_from_file(self, path=None):
        """Load the saved ``DatasetDict`` and return ``(train, test)``.

        Args:
            path: Location of the saved datasets; defaults to
                ``CONFIG['path_to_processed_datasets']``.
        """
        if path is None:
            self.path_to_processed_datasets = self.config['path_to_processed_datasets']
        else:
            self.path_to_processed_datasets = path
        datasets = load_from_disk(self.path_to_processed_datasets)
        self.df = datasets['train'] if self.is_train else datasets['test']
        return datasets['train'], datasets['test']

    def _loaded_split(self):
        """Return the active split, failing clearly if none has been loaded yet."""
        if self.df is None:
            raise RuntimeError(
                "No dataset loaded: call process_train_test_datasets() or "
                "load_datasets_from_file() first."
            )
        return self.df

    def __getitem__(self, ind):
        return self._loaded_split()[ind]

    def __len__(self):
        return len(self._loaded_split())

In [None]:
# Tokenize the train/test CSVs and save them to disk as a DatasetDict.
# The call returns the save path, which is shown as this cell's output.
ds = DatasetForPredictionQualityOfArgumentation(CONFIG['train_directory'],CONFIG['test_directory'], CONFIG)
ds.process_train_test_datasets()

In [None]:
# Reload the processed splits from CONFIG['path_to_processed_datasets'].
train, test = ds.load_datasets_from_file()

In [None]:
class ModelForPredictionQualityOfArgumentation():
    """Trains and applies a sequence-classification transformer.

    Wraps a HuggingFace ``AutoModelForSequenceClassification`` together with
    its optimizer, LR scheduler, loss and metric, and offers cross-validated
    or full-dataset training plus test-set prediction.

    Args:
        ds: Tokenized training dataset exposing ``to_pandas()``; for
            cross-validation it must contain an ``n_fold`` column.
        CONFIG: Dict providing ``device``, ``lr``, ``tokenizer``, ``max_length``,
            ``metric``, ``model_name``, ``num_classes``, ``batch_size``,
            ``folds``, ``epoches``, ``optimizer_weight_decay``,
            ``max_iters_for_scheduler``, ``min_lr`` and ``test_directory``.
    """

    def __init__(self, ds, CONFIG):
        self.ds = ds
        # Keep the config that was passed in so methods do not depend on a global.
        self.config = CONFIG
        self.device = CONFIG['device']
        self.lr = CONFIG['lr']
        self.tokenizer = CONFIG['tokenizer']
        self.collator = DataCollatorWithPadding(tokenizer=self.tokenizer, max_length=CONFIG['max_length'])
        self.metric = CONFIG['metric'].to(self.device)
        self.criterion = nn.CrossEntropyLoss()
        # Explicit class dimension; the implicit-dim form is deprecated.
        self.softmax = torch.nn.Softmax(dim=-1)
        # Path of the checkpoint that download_weights_for_model() will load.
        self.path_to_weights = None
        self._init_model_and_optimizer()

    def _init_model_and_optimizer(self):
        """(Re)create the pretrained model with a fresh optimizer and scheduler."""
        self.model = AutoModelForSequenceClassification.from_pretrained(self.config['model_name'], num_labels=self.config['num_classes']).to(self.device)
        self.optimizer = AdamW(params=self.model.parameters(), lr=self.lr, weight_decay=self.config['optimizer_weight_decay'])
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.config['max_iters_for_scheduler'], eta_min=self.config['min_lr'])
        self._model_is_dirty = False

    def _make_loader(self, frame, training):
        """Wrap a DataFrame in a DataLoader.

        Training loaders shuffle and drop the last partial batch; evaluation
        loaders keep every sample in a fixed order.
        """
        dataset = DS.from_pandas(frame, preserve_index=False)
        return DataLoader(
            dataset, shuffle=training, batch_size=self.config['batch_size'], drop_last=training, collate_fn=self.collator, pin_memory=True
        )

    def create_dataloader_on_full_dataset(self, dataset):
        """Return a training DataLoader over ``dataset`` with ``n_fold`` removed."""
        frame = dataset.to_pandas()
        if 'n_fold' in frame.columns:
            frame = frame.drop(columns=['n_fold'])
        return self._make_loader(frame, training=True)

    def create_data_loaders_for_cross_val(self, num_fold):
        """Return ``(train_loader, validation_loader)`` for fold ``num_fold``.

        Rows whose ``n_fold`` equals ``num_fold`` form the validation set; all
        other rows form the training set.
        """
        frame = self.ds.to_pandas()
        is_val = frame['n_fold'] == num_fold
        # .drop() returns new frames, so no in-place edits on slices of `frame`.
        train_frame = frame[~is_val].drop(columns=['n_fold'])
        val_frame = frame[is_val].drop(columns=['n_fold'])
        return self._make_loader(train_frame, training=True), self._make_loader(val_frame, training=False)

    def _forward_batch(self, data):
        """Move a collated batch to the device and return ``(logits, labels, loss)``."""
        batch = {k: v.to(self.device) for k, v in data.items()}
        labels = batch.pop('labels')
        logits = self.model(**batch)['logits']
        # CrossEntropyLoss applies log-softmax itself, so it must receive raw logits.
        loss = self.criterion(logits, labels)
        return logits, labels, loss

    def train_one_epoch(self, dataloader):
        """Run one optimisation pass over ``dataloader``.

        Returns:
            float: sample-weighted mean training loss (NaN for an empty loader).
        """
        self.model.train()
        n_seen = 0
        sum_loss = 0.
        sum_metric = 0.
        bar = tqdm(dataloader, total=len(dataloader))
        for data in bar:
            self.optimizer.zero_grad()
            logits, labels, loss = self._forward_batch(data)
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()

            with torch.no_grad():
                metric_score = self.metric(self.softmax(logits), labels).item()
            loss_ = loss.item()
            batch_len = labels.size(0)
            n_seen += batch_len
            sum_loss += loss_ * batch_len
            sum_metric += metric_score * batch_len
            bar.set_postfix(mean_train_loss=sum_loss/n_seen, current_train_loss=loss_,  mean_metric=sum_metric/n_seen, current_metric=metric_score)
        # Release cached blocks once per epoch rather than after every batch.
        torch.cuda.empty_cache()
        return sum_loss / n_seen if n_seen else float('nan')

    @torch.no_grad()
    def valid_one_epoch(self, dataloader):
        """Evaluate the model on ``dataloader`` without updating weights.

        Returns:
            float: sample-weighted mean validation loss (NaN for an empty loader).
        """
        self.model.eval()
        n_seen = 0
        sum_loss = 0.
        sum_metric = 0.
        bar = tqdm(dataloader, total=len(dataloader))
        for data in bar:
            logits, labels, loss = self._forward_batch(data)
            metric_score = self.metric(self.softmax(logits), labels).item()
            loss_ = loss.item()
            # The last validation batch may be smaller than batch_size.
            batch_len = labels.size(0)
            n_seen += batch_len
            sum_loss += loss_ * batch_len
            sum_metric += metric_score * batch_len
            bar.set_postfix(mean_valid_loss=sum_loss/n_seen, current_valid_loss=loss_,  mean_metric=sum_metric/n_seen, current_metric=metric_score)
        torch.cuda.empty_cache()
        return sum_loss / n_seen if n_seen else float('nan')

    def run_learning(self, cross_val=True):
        """Train the model.

        With ``cross_val=True`` each fold starts from freshly loaded pretrained
        weights; the best epoch of every fold (lowest validation loss) is saved
        to ``model-fold: <fold>.bin`` and ``self.path_to_weights`` points at the
        best checkpoint across folds. The model trained on the last fold is
        left in ``self.model``. With ``cross_val=False`` the model is trained
        on the whole dataset and saved to ``weights_for_bert.bin`` after every
        epoch.
        """
        n_epochs = self.config['epoches']
        if cross_val:
            n_folds = self.config['folds']
            best_overall = np.inf
            sum_loss_train = 0.
            sum_loss_val = 0.
            for fold in range(n_folds):
                print('fold: ', fold)
                # Start every fold from pretrained weights; resetting here (not
                # after the fold) keeps the last trained model for prediction.
                if getattr(self, '_model_is_dirty', False):
                    self._init_model_and_optimizer()
                    torch.cuda.empty_cache()
                train_loader, val_loader = self.create_data_loaders_for_cross_val(fold)

                best_in_fold = np.inf
                loss_train = loss_val = float('nan')
                for _ in range(n_epochs):
                    self._model_is_dirty = True
                    loss_train = self.train_one_epoch(train_loader)
                    loss_val = self.valid_one_epoch(val_loader)
                    if loss_val < best_in_fold:
                        best_in_fold = loss_val
                        path = f'model-fold: {fold}.bin'
                        torch.save(self.model.state_dict(), path)
                        if loss_val < best_overall:
                            best_overall = loss_val
                            self.path_to_weights = path
                # Last-epoch losses of the fold feed the cross-fold averages.
                sum_loss_train += loss_train
                sum_loss_val += loss_val

            mean_loss_train = sum_loss_train / max(n_folds, 1)
            mean_loss_test = sum_loss_val / max(n_folds, 1)
            print(f"loss train/test: {mean_loss_train} {mean_loss_test}")
        else:
            train_loader = self.create_dataloader_on_full_dataset(self.ds)
            self.path_to_weights = 'weights_for_bert.bin'
            for _ in range(n_epochs):
                self._model_is_dirty = True
                self.train_one_epoch(train_loader)
                torch.save(self.model.state_dict(), self.path_to_weights)

    def download_weights_for_model(self):
        """Load the checkpoint at ``self.path_to_weights`` into the model.

        Raises:
            RuntimeError: if no checkpoint path has been recorded yet.
        """
        if getattr(self, 'path_to_weights', None) is None:
            raise RuntimeError("No saved weights available: run run_learning() first.")
        self.model.load_state_dict(torch.load(self.path_to_weights, map_location=self.device))

    def create_download_link(self, df, title = "Download CSV file", filename = "DBV3L.csv"):
        """Write ``df`` to ``filename`` as CSV and return a ``FileLink`` to it.

        ``title`` is accepted for backward compatibility and is not used.
        """
        df.to_csv(filename, index=False)
        return FileLink(filename)

    @torch.no_grad()
    def final_prediction_for_kaggle(self, test_dataset, is_download_weights=False):
        """Predict class probabilities for ``test_dataset`` and export them.

        Args:
            test_dataset: Tokenized test dataset, in the same row order as
                ``<CONFIG['test_directory']>.csv``.
            is_download_weights: If True, load ``self.path_to_weights`` first.

        Returns:
            FileLink to the written submission CSV.
        """
        batch_size = 10
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, collate_fn=self.collator, pin_memory=True
        )

        if is_download_weights:
            self.download_weights_for_model()

        self.model.eval()
        # Collect every batch; keeping only the last one would drop most rows.
        batch_probs = []
        for data in tqdm(test_loader, total=len(test_loader)):
            batch = {k: v.to(self.device) for k, v in data.items()}
            logits = self.model(**batch)['logits']
            batch_probs.append(self.softmax(logits).cpu().numpy().astype(np.float64))

        if batch_probs:
            result = np.concatenate(batch_probs, axis=0)
        else:
            result = np.empty((0, self.config['num_classes']))

        # Column order follows LabelEncoder's sorted classes used for the labels.
        res = pd.DataFrame(result, columns=['Adequate', 'Effective', 'Ineffective'])
        ids = pd.read_csv(self.config['test_directory'] + '.csv')['discourse_id']
        # .values raises on a length mismatch instead of silently misaligning ids.
        res['discourse_id'] = ids.values
        res = res[['discourse_id', 'Ineffective', 'Adequate', 'Effective']]
        return self.create_download_link(res)

In [None]:
# Build the model wrapper around the processed training split and run
# cross-validated training over CONFIG['folds'] folds.
model = ModelForPredictionQualityOfArgumentation(train, CONFIG)
model.run_learning(cross_val=True)

In [None]:
# Predict class probabilities for the test split; `res` is the FileLink to the
# CSV written by the model wrapper, displayed as this cell's output.
res = model.final_prediction_for_kaggle(test)
res

In [None]:
def release_cpu_and_gpu_memory(model):
    """Move the wrapped network to the CPU and release cached GPU memory.

    GPU utilisation is printed before and after the cleanup. The wrapper's
    optimizer and scheduler references are cleared as well, so the wrapper
    cannot continue training after this call without new ones.

    Args:
        model: Wrapper object exposing the network as ``model.model`` and,
            optionally, ``optimizer`` / ``scheduler`` attributes.

    Note:
        The caller's own reference to the wrapper is not affected; run
        ``del model`` in the calling scope to drop the object itself.
    """
    print('before')
    gpu_usage()

    model.model.to('cpu')
    # Moving the module does not touch what the optimizer holds; its state is
    # presumably still on the GPU, so drop the references to let it be freed.
    for attr in ('scheduler', 'optimizer'):
        if hasattr(model, attr):
            setattr(model, attr, None)
    # Only unbinds this function's local name, not the caller's variable.
    del model

    gc.collect()
    torch.cuda.empty_cache()
    print('after')
    gpu_usage()
# Release the trained model's memory, then rebuild CONFIG, which re-creates the
# tokenizer and metric objects.
release_cpu_and_gpu_memory(model)
CONFIG = init_config()