In [2]:
import torch 
from transformers import AutoModelForSequenceClassification,Trainer,TrainingArguments
from transformers import AutoTokenizer,DataCollatorWithPadding
from datasets import Dataset
from datasets import load_metric

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from scipy.stats import mode

import os
os.environ['WANDB_DISABLED'] = 'True'

In [3]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

print('Training Set Shape = {}'.format(train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(test.memory_usage().sum() / 1024**2))

train,valid = train_test_split(train, test_size= 0.2,random_state = 2)

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


In [4]:
train.head(10)

Unnamed: 0,id,keyword,location,text,target
4549,6466,injured,USA,Offers : http://t.co/Gl3C1vc88P #8392 Deluxe T...,1
4512,6413,hurricane,,The hurricane mixxtail kinda tastes like the w...,0
4368,6203,hijacker,,Complete Solution to Get Rid of http://t.co/9C...,0
4297,6103,hellfire,,@HellFire_eV @JackPERU1 then I do this to one ...,0
13,19,,,#Flood in Bago Myanmar #We arrived Bago,1
6235,8903,snowstorm,Manchester,@Groupon_UK it won't let me as you don't follo...,0
3160,4537,emergency,Southern Maine,Former heroin addict shares story as city lead...,1
2917,4191,drown,somewhere in Indiana,Going to go drown my sorrows with sad music brb,0
2318,3334,demolished,Chicago,ÛÏ@SplottDave: @TeamPalestina That's about 28...,1
3392,4856,evacuation,,This is an evil generation\nRock and roll evac...,0


In [5]:
test.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [6]:
def tokenization(model_path,train_df,valid_df,test_df):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)
    test_ds = Dataset.from_pandas(test_df)
    
    def process_token(example,tokenizer=tokenizer):
        return tokenizer(example['text'])
    tokenized_train = train_ds.map(process_token)
    tokenized_valid = valid_ds.map(process_token)
    tokenized_test = test_ds.map(process_token)
    
    columns_to_remove = ['id', 'keyword', 'location','__index_level_0__']
    train_dataset = tokenized_train.remove_columns(columns_to_remove)
    valid_dataset = tokenized_valid.remove_columns(columns_to_remove)
    columns_to_remove = ['id', 'keyword', 'location']
    test_dataset = tokenized_test.remove_columns(columns_to_remove)
    
    train_dataset = train_dataset.rename_column("target", "label")
    valid_dataset = valid_dataset.rename_column("target", "label")
    return train_dataset,valid_dataset,test_dataset,tokenizer

In [7]:
def compute_metrics(eval_pred):
    load_acc = load_metric('accuracy')
    load_f1 = load_metric('f1')
    logits,labels = eval_pred
    predictions = np.argmax(logits,axis = -1)
    acc = load_acc.compute(predictions = predictions,references = labels)['accuracy']
    f1 = load_f1.compute(predictions = predictions, references = labels)['f1']
    return {'acc':acc,'f1':f1}

In [8]:
def init_trainer(model_path,tokenizer,lr,ep,train_dataset,valid_dataset):
    model = AutoModelForSequenceClassification.from_pretrained(model_path,num_labels=2)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    training_args = TrainingArguments(
        learning_rate=lr,
        num_train_epochs=ep,
        per_device_train_batch_size=16,
        weight_decay=0.01,
        output_dir=model_path,

    )
    trainer = Trainer(
        model = model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    return trainer

In [9]:
class model_fusion:
    def __init__(self,model_paths,lr,ep):
        self.model_paths = model_paths
        self.learning_rate = lr
        self.epoch_num = ep
        self.preds = []
    
    def train_pred_multiple_models(self):
        for model_path in self.model_paths:
            print(f'Training {model_path}')
            print('total:',torch.cuda.get_device_properties(0).total_memory/1e9)
            print('allocated:',torch.cuda.memory_allocated(0)/1e9)
            print('cached:',torch.cuda.memory_reserved(0)/1e9)
            train_dataset,valid_dataset,test_dataset,tokenizer = tokenization(model_path,train,valid,test)
            trainer = init_trainer(model_path,tokenizer,self.learning_rate,self.epoch_num,train_dataset,valid_dataset)
            trainer.train()
            prediction = trainer.predict(test_dataset=test_dataset)
            predictions = np.argmax(a=prediction.predictions,axis = -1)
            self.preds.append((model_path,predictions))
            
            
    def fusion_pred(self):
        all_preds = [pred[1] for pred in self.preds]
        final_preds = mode(all_preds, axis=0)[0]

        return final_preds.ravel()

In [10]:
roberta_path = 'roberta-base'
bert_path = 'bert-base-uncased'
deberta_path = 'microsoft/deberta-v3-base'
distilbert_path = 'distilbert-base-uncased'
model_paths = [bert_path,distilbert_path,roberta_path,deberta_path]
lr = 2e-5
ep = 3
models = model_fusion(model_paths,lr,ep)

In [11]:
models.train_pred_multiple_models()
pred = models.fusion_pred()
models.preds

Training bert-base-uncased
total: 15.835660288
allocated: 0.0
cached: 0.0


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/6090 [00:00<?, ?ex/s]

  0%|          | 0/1523 [00:00<?, ?ex/s]

  0%|          | 0/3263 [00:00<?, ?ex/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3661




Training distilbert-base-uncased
total: 15.835660288
allocated: 1.34551296
cached: 2.71581184


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/6090 [00:00<?, ?ex/s]

  0%|          | 0/1523 [00:00<?, ?ex/s]

  0%|          | 0/3263 [00:00<?, ?ex/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.377




Training roberta-base
total: 15.835660288
allocated: 0.834720256
cached: 1.704984576


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/6090 [00:00<?, ?ex/s]

  0%|          | 0/1523 [00:00<?, ?ex/s]

  0%|          | 0/3263 [00:00<?, ?ex/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.382




Training microsoft/deberta-v3-base
total: 15.835660288
allocated: 1.53042176
cached: 3.139436544


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



  0%|          | 0/6090 [00:00<?, ?ex/s]

  0%|          | 0/1523 [00:00<?, ?ex/s]

  0%|          | 0/3263 [00:00<?, ?ex/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.3882




[('bert-base-uncased', array([1, 1, 1, ..., 1, 1, 1])),
 ('distilbert-base-uncased', array([1, 1, 1, ..., 1, 1, 1])),
 ('roberta-base', array([1, 1, 1, ..., 1, 1, 1])),
 ('microsoft/deberta-v3-base', array([1, 1, 1, ..., 1, 1, 1]))]

In [12]:
all_preds = [pred[1] for pred in models.preds]
final_preds = mode(all_preds, axis=0)[0]
final_preds = final_preds.ravel()

In [13]:
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
submission['target'] = pred
submission = submission.set_index('id',drop=True)
submission.to_csv('/kaggle/working/prediction.csv')

In [14]:
submission.head(10)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
12,1
21,0
22,0
27,0
29,0
