In [36]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from sklearn.metrics import accuracy_score,f1_score
from transformers import Trainer,TrainingArguments
import numpy as np

In [2]:
# Load the training CSV into a pandas DataFrame.
train_set=pd.read_csv("../../datasets/classification/train.csv")

In [3]:
# Load the test CSV into a pandas DataFrame.
test_set=pd.read_csv("../../datasets/classification/test.csv")

In [4]:
# Keep only the 'text' and 'target' columns.
# NOTE(review): train_set is not referenced again in the visible cells;
# training uses train_dataset (loaded with load_dataset) instead.
train_set=train_set[['text','target']]


In [5]:
# Keep only the 'text' column. This drops 'id', which is why the test CSV
# is read again later before the submission frame is built.
test_set=test_set[['text']]

In [6]:
# loading the CSV files as Hugging Face `datasets` objects (read directly from disk; the pandas frames above are not converted)

In [7]:
# Read both CSVs straight from disk as DatasetDict objects.
# Each file is given as a single `data_files` path, so its rows end up under
# the 'train' split key -- this is also true for the test file.
train_dataset = load_dataset(
    "csv",
    data_files='../../datasets/classification/train.csv',
)
test_dataset = load_dataset(
    "csv",
    data_files='../../datasets/classification/test.csv',
)


Using custom data configuration default-8c63bb9970a7b12f
Reusing dataset csv (/Users/monishostwal/.cache/huggingface/datasets/csv/default-8c63bb9970a7b12f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-0c7b5fd350e329ca
Reusing dataset csv (/Users/monishostwal/.cache/huggingface/datasets/csv/default-0c7b5fd350e329ca/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# tokenizing the text (creating token-id encodings for the model)

In [9]:
# Pretrained checkpoint name; used to load both the tokenizer and the model.
checkpoint='distilbert-base-uncased'

In [10]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

In [11]:
def tokenize_function(example):
    """Tokenize the 'text' field of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        example: Mapping with a 'text' entry (a list of strings when the
            function is mapped in batched mode).

    Returns:
        The tokenizer output for the texts (includes 'input_ids' and
        'attention_mask'), truncated to the tokenizer's maximum length.
    """
    # No padding here: padding inside a batched map pads every row to the
    # longest text in the whole map batch and stores those padded rows.
    # The Trainer in this notebook is given `tokenizer`, so (per the
    # Transformers docs) it uses DataCollatorWithPadding and pads each
    # batch dynamically to that batch's own longest sequence instead.
    return tokenizer(example['text'], truncation=True)

In [12]:
# Tokenize the training data in batches; this adds the 'input_ids' and
# 'attention_mask' columns next to the original ones.
train_dataset=train_dataset.map(tokenize_function,batched=True)

Loading cached processed dataset at /Users/monishostwal/.cache/huggingface/datasets/csv/default-8c63bb9970a7b12f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-187e890a29d39326.arrow


In [13]:
# Inspect the columns after tokenization.
train_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target', 'input_ids', 'attention_mask'],
        num_rows: 7613
    })
})

In [14]:
# Drop the raw CSV columns so only the label and the tokenizer outputs remain.
train_dataset=train_dataset.remove_columns(['id','keyword','location','text'])

In [15]:
# Confirm that only 'target', 'input_ids' and 'attention_mask' are left.
train_dataset

DatasetDict({
    train: Dataset({
        features: ['target', 'input_ids', 'attention_mask'],
        num_rows: 7613
    })
})

In [16]:
# Rename the label column from 'target' to 'labels'.
# NOTE(review): presumably because the model/Trainer expects the label
# column to be called 'labels' -- confirm against the Transformers docs.
train_dataset=train_dataset.rename_column('target','labels')

In [17]:
# Inspect the 'train' split, which is what gets passed to the Trainer.
train_dataset['train']

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 7613
})

In [18]:
# Apply the same tokenization to the test data.
test_dataset=test_dataset.map(tokenize_function,batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

In [19]:
# Drop the raw CSV columns; the test data has no 'target' column, so only
# the tokenizer outputs remain afterwards.
test_dataset=test_dataset.remove_columns(['id','keyword','location','text'])

In [20]:
# Inspect the test data; note that its rows live under the 'train' split key.
test_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3263
    })
})

In [21]:
# model loading

In [22]:
# Load the checkpoint with a sequence-classification head for 2 labels.
# The classifier weights are newly initialized (see the warning printed
# below), so the model has to be fine-tuned before its predictions are useful.
model=AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [23]:
def compute_metrics(pred):
    """Compute weighted F1 and accuracy for a prediction object.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the highest-scoring class along the last
            axis is taken as the predicted label).

    Returns:
        dict with keys 'f1' (weighted-average F1) and 'acc' (accuracy).
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    return {
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'acc': accuracy_score(y_true, y_pred),
    }

In [24]:
# Training configuration: 2 epochs, batch size 32 per device, progress bars
# enabled; Trainer output is written under the results directory.
training_args = TrainingArguments(
    output_dir='../../results/classification/classification_model',
    num_train_epochs=2,
    disable_tqdm=False,
    per_device_train_batch_size=32,
)

In [25]:
# Build the Trainer around the model, the training split and the tokenizer.
# NOTE(review): no eval_dataset is passed, so compute_metrics is presumably
# never invoked during training -- confirm, or add a validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [26]:
# Fine-tune the model on the training split; the returned TrainOutput
# (shown below) reports the step count and the final training loss.
trainer.train()

***** Running training *****
  Num examples = 7613
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 476


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=476, training_loss=0.3755830396123293, metrics={'train_runtime': 3656.165, 'train_samples_per_second': 4.164, 'train_steps_per_second': 0.13, 'total_flos': 330872514796128.0, 'train_loss': 0.3755830396123293, 'epoch': 2.0})

In [32]:
# Run inference on the test data (stored under the 'train' split key of
# test_dataset). The test data has no labels column, so the result's
# label_ids is None and no metrics beyond runtime stats are reported.
predictions=trainer.predict(test_dataset['train'])

***** Running Prediction *****
  Num examples = 3263
  Batch size = 8


In [33]:
# Inspect the raw PredictionOutput: `predictions` holds two scores per example.
predictions

PredictionOutput(predictions=array([[-0.9807891 ,  0.83639765],
       [-2.0810933 ,  1.7777361 ],
       [-1.8801842 ,  1.7806343 ],
       ...,
       [-2.3907797 ,  2.2109566 ],
       [-1.197084  ,  1.0350052 ],
       [-0.9459625 ,  0.87499833]], dtype=float32), label_ids=None, metrics={'test_runtime': 188.0268, 'test_samples_per_second': 17.354, 'test_steps_per_second': 2.17})

In [37]:
# Predicted class per test row = index of the larger of the two scores.
preds = predictions.predictions.argmax(axis=-1)

In [39]:
# Sanity check: there should be one prediction per test row.
len(preds)

3263

In [41]:
# Re-read the test CSV: test_set was reduced to the 'text' column earlier,
# but the 'id' column is needed to build the submission file.
test_set=pd.read_csv("../../datasets/classification/test.csv")

In [42]:
# Inspect the reloaded test data (id, keyword, location, text).
test_set

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [47]:
# Submission frame: one row per test example, pairing each 'id' with its
# predicted class. preds is in the same row order as the test CSV.
ans = pd.DataFrame({'id': test_set['id'], 'target': preds})

In [49]:
# Check how the predictions are distributed across the two classes.
ans['target'].value_counts()

0    2026
1    1237
Name: target, dtype: int64

In [50]:
# Write the id/target pairs to CSV without the DataFrame index column.
ans.to_csv("../../results/classification/finetuning_result.csv",index=False)

In [51]:
# got an 83% F1 score after only 2 epochs