In [1]:
# This was default Kaggle setup/support code 

import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv


In [2]:
# from fast.ai NLP beginners notebook, deal with path
creds = ''
from pathlib import Path

cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [3]:
import os
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [4]:
# bring in NLP dataset
if iskaggle:
    path = Path('../input/covid-19-nlp-text-classification')
    ! pip install -q datasets

In [5]:
# test to make sure we have correct files - we do! 
!ls {path}

Corona_NLP_test.csv  Corona_NLP_train.csv


In [6]:
import pandas as pd
# note - had to swap encoding, was getting "invalid continuation byte" error before
# see https://stackoverflow.com/questions/5552555/unicodedecodeerror-invalid-continuation-byte
df = pd.read_csv(path/'Corona_NLP_train.csv', encoding='latin-1')
# inspect file as dataframe
df 

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [7]:
df.describe(include='object')

Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
count,32567,41157,41157,41157
unique,12220,30,41157,5
top,London,20-03-2020,@TartiiCat Well new/used Rift S are going for ...,Positive
freq,540,3448,1,11422


In [8]:
# input - i'll use just original tweet, could do other stuff - don't want to deal w/ cleaning location data
df['input'] = 'OriginalTweet:' + df.OriginalTweet

In [9]:
# inspect to make sure this works
df.input.head()

0    OriginalTweet:@MeNyrbie @Phil_Gahan @Chrisitv ...
1    OriginalTweet:advice Talk to your neighbours f...
2    OriginalTweet:Coronavirus Australia: Woolworth...
3    OriginalTweet:My food stock is not the only on...
4    OriginalTweet:Me, ready to go at supermarket d...
Name: input, dtype: object

In [10]:
# create Dataset object for Transformers
from datasets import Dataset,DatasetDict
ds = Dataset.from_pandas(df)
# inspect
ds 

Dataset({
    features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment', 'input'],
    num_rows: 41157
})

In [11]:
# just going to use deberta here, don't want to bother yet with changing model
model_nm = 'microsoft/deberta-v3-small'
from transformers import AutoModelForSequenceClassification,AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [12]:
tokz.tokenize("Here's my tokenized sentence from Deberta.")

['▁Here',
 "'",
 's',
 '▁my',
 '▁token',
 'ized',
 '▁sentence',
 '▁from',
 '▁Deb',
 'erta',
 '.']

In [13]:
# bring in test set 
eval_df = pd.read_csv(path/'Corona_NLP_test.csv')
eval_df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
3793,3794,48746,Israel ??,16-03-2020,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,3795,48747,"Farmington, NM",16-03-2020,Did you panic buy a lot of non-perishable item...,Negative
3795,3796,48748,"Haverford, PA",16-03-2020,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,3797,48749,,16-03-2020,Gov need to do somethings instead of biar je r...,Extremely Negative


In [14]:
# need to set the eval_df up the same as above
eval_df['input'] = 'OriginalTweet:' + df.OriginalTweet

In [15]:
# when I attempt to train, getting the issue that I fail to truncate 
# need to get the max length in both the evaluation and train dfs
# first create input_length col in both
df['input_length'] = df['input'].apply(len)
eval_df['input_length'] = eval_df['input'].apply(len)

# next get maximums
max_input_len_train = df['input_length'].max()
max_input_len_eval = df['input_length'].max()

max_length = max_input_len_train
if max_input_len_eval > max_input_len_train:
    max_length = max_input_len_eval

max_length

369

In [16]:
# tokenize all of our inputs
# NOTE: initially got a problem when this had NoneTypes from Location;
# if this was more serious I'd clean the data to get rid of this, but location seems lossy anyways 
def tok_func(x): return tokz(x["input"], padding=True, truncation=True, max_length=max_length)
tok_ds = ds.map(tok_func, batched=True)
# DataSet from DataFrame, tokenized
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

Map:   0%|          | 0/41157 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [17]:
# transformers assumes labelshas column "labels"
tok_ds = tok_ds.rename_columns({'Sentiment':'labels'})
eval_ds = eval_ds.rename_columns({'Sentiment':'labels'})

In [18]:
# however, we don't want to use the eval_df as our validation set, just our eventual test set
# going to make a validation set here:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30867
    })
    test: Dataset({
        features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10290
    })
})

In [20]:
from transformers import TrainingArguments,Trainer


2024-07-25 18:54:34.063094: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 18:54:34.063241: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 18:54:34.180295: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [22]:
# the specifics here are fuzzy to me but as I understand it
# the Pearson coefficient shows how much two things are correlated
# here we are going to find correlation between our predictions and the ground truth of the eval est
def corr(x,y): return np.corrcoef(x,y)[0][1]
def corr_d(eval_pred): return {'pearson': corr(*eval_pred)}

In [23]:
# look at current format - these are correctly tokenized - for debugging 
# dds['train']['input_ids'][0]

In [24]:
# need to get the unique labels into numbers
unique_labels = list(set(tok_ds['labels']))
unique_labels


['Extremely Negative', 'Positive', 'Extremely Positive', 'Negative', 'Neutral']

In [25]:
label_dict = {
    'Extremely Negative': 0,
    'Negative': 0.25,
    'Neutral': 0.5,
    'Positive': 0.75,
    'Extremely Positive': 1
}

In [26]:
def format_labels(examples):
    examples['labels'] = [label_dict[label] for label in examples['labels']]
    return examples

In [27]:
# Apply the format_labels function to the dataset using the map method
formatted_dds = dds.map(format_labels, batched=True)

Map:   0%|          | 0/30867 [00:00<?, ? examples/s]

Map:   0%|          | 0/10290 [00:00<?, ? examples/s]

In [28]:
formatted_dds['train'][0]['labels']

0.75

In [42]:
bs = 32
epochs = 4
lr = 8e-5

In [43]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')



In [44]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model, args, train_dataset=formatted_dds['train'], eval_dataset=formatted_dds['test'],
                  tokenizer=tokz, compute_metrics=corr_d)
trainer.train();

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.027971,0.88268
2,0.059600,0.022763,0.92218
3,0.019200,0.014756,0.936465
4,0.009500,0.013716,0.942082




In [46]:
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)
preds = trainer.predict(eval_ds).predictions.astype(float)


Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

array([[0.50294423],
       [0.73556268],
       [0.77110201],
       ...,
       [0.25231057],
       [0.78408509],
       [0.89477992]])

In [51]:
evaluation_results = trainer.evaluate()
evaluation_results

{'eval_loss': 0.013715675100684166,
 'eval_pearson': 0.9420822092639983,
 'eval_runtime': 58.8447,
 'eval_samples_per_second': 174.867,
 'eval_steps_per_second': 1.377,
 'epoch': 4.0}