### All Installations done in this cell

In [None]:
!pip install sentence-transformers
!pip install datasets
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Importing Libraries

In [None]:
import torch
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, ClassLabel, Value, load_dataset, load_metric
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split
from torchinfo import summary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import tqdm
import random
import nltk

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

### Loading Data

In [None]:
# Read the raw train and test splits from CSV files given by relative paths
# (i.e. located in the current working directory).
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,tweet,label,id
0,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF,0
1,"#ConstitutionDay is revered by Conservatives, ...",NOT,1
2,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT,2
3,#Watching #Boomer getting the news that she is...,NOT,3
4,#NoPasaran: Unity demo to oppose the far-right...,OFF,4


In [None]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,tweet,id
0,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,0
1,"#ConstitutionDay is revered by Conservatives, ...",1
2,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,2
3,#Watching #Boomer getting the news that she is...,3
4,#NoPasaran: Unity demo to oppose the far-right...,4


### Pre-processing

In [None]:
import re
def clean_text(text):
    """Normalise a raw tweet before tokenisation.

    Steps, in order:
      1. replace ``@mentions`` (letters and digits only) with a space,
      2. replace http/https URLs with a space,
      3. replace every character that is not an ASCII letter, a digit or one
         of ``. ! ? '`` with a space (this also covers tabs and newlines),
      4. collapse runs of spaces into a single space.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned tweet. A single leading or trailing space may remain,
        because the result is not stripped.
    """
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    # The range must be ``A-Z``: the former ``A-z`` also matched the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), so they were never removed.
    text = re.sub(r"[^a-zA-Z.!?'0-9]", ' ', text)
    # Tabs were already turned into spaces by the step above, so only runs of
    # spaces are left to collapse.
    text = re.sub(r" +", ' ', text)
    return text

In [None]:
# Run the same cleaning function over the tweet column of both splits.
for frame in (train_df, test_df):
    frame['tweet'] = frame['tweet'].apply(clean_text)

### Splitting Train dataset into Train and Val

In [None]:
# For our validation dataset we hold out the last VAL_SIZE rows of the training data
# (1000 rows, about 7.5% of the data this notebook was run on).
# Taking them with `tail` instead of the hard-coded start row 12240 keeps the split
# aligned with the cell that removes the last 1000 rows from the training frame,
# whatever the number of rows in train.csv.
VAL_SIZE = 1000
validation_df = train_df[['tweet', 'label', 'id']].tail(VAL_SIZE).copy()

In [None]:
# Remove the rows that went into the validation set so train and val do not overlap.
# Dropping by the validation index (and ignoring labels that are already gone) makes
# this cell safe to re-run; the former in-place tail(1000) drop removed another
# 1000 training rows on every execution.
train_df = train_df.drop(index=validation_df.index, errors='ignore')

In [None]:
# Add placeholder labels to the test set; they have no purpose except to give the
# test frame the same columns as the train and validation frames.
# The list length follows the test frame instead of a hard-coded 860.
placeholder_labels = ['OFF'] * len(test_df)

# Guard against re-runs: the former call allowed duplicate columns, so executing the
# cell twice silently inserted a second 'label' column.
if 'label' not in test_df.columns:
    test_df.insert(1, 'label', placeholder_labels)

In [None]:
# Write the pre-processed dataframes to CSV (without the index column) so the
# pre-processing does not have to be repeated; the files are meant to be added to
# the project dataset folder.
preprocessed_files = {
    'preprocessed_train.csv': train_df,
    'preprocessed_val.csv': validation_df,
    'preprocessed_test.csv': test_df,
}
for file_name, frame in preprocessed_files.items():
    frame.to_csv(file_name, index = False)

### Modelling and Training

In [None]:
# Load the three pre-processed CSV files with the built-in 'csv' loader. The file
# names are resolved against the current working directory, i.e. the place the CSVs
# were written to, rather than against the hard-coded '/content/' folder.
data_loader = load_dataset(
    'csv',
    data_files={
        'train': 'preprocessed_train.csv',
        'val': 'preprocessed_val.csv',
        'test': 'preprocessed_test.csv',
    },
)

# Turn the string labels into a ClassLabel feature so they are encoded as class ids.
# NOTE(review): the id of each label follows whatever order `unique` returns for the
# training split - confirm the id <-> label mapping before decoding predictions.
cls = ClassLabel(names = list(data_loader['train'].unique('label')))
data_loader = data_loader.cast_column('label', cls)

data_loader



Downloading and preparing dataset csv/content to /root/.cache/huggingface/datasets/csv/content-907cf124943478b1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

   

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/content-907cf124943478b1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/13 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label', 'id'],
        num_rows: 12240
    })
    val: Dataset({
        features: ['tweet', 'label', 'id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tweet', 'label', 'id'],
        num_rows: 860
    })
})

In [None]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here and the
# classification model is loaded from the same name further down.
model_id = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Collator built on the tokenizer; it is handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of tweets.

    No padding is applied here: the Trainer is given a DataCollatorWithPadding,
    which pads each batch on the fly, so padding every example to the maximum
    length up front only added padding tokens the model had to process.

    Args:
        examples: A batch of rows; its 'tweet' entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to at most 100 tokens.
    """
    return tokenizer(examples['tweet'], truncation = True, max_length = 100)

# batched = True hands the function whole batches of rows instead of single rows.
tokenized_data_loader = data_loader.map(tokenize_function, batched = True)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Fine-tuning

In [None]:
# evaluation function
def compute_metrics(eval_pred):
    """Compute the accuracy of the model on an evaluation set.

    The accuracy is computed directly with NumPy, so the deprecated
    ``load_metric("accuracy")`` helper (and the metric script it loads) is no
    longer needed.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of its logits over the last axis.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        predictions that equal the reference labels, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

# Training setup used below: the full roberta model is fine-tuned for 4 epochs with a
# learning rate of 5e-6 and a weight decay of 0.01. The training batch size is 16 per
# device (evaluation uses 32).
# Data is truncated at 100 tokens.

# Load the pretrained checkpoint named by model_id with a sequence-classification
# head for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels = 2)

# Hyperparameters of the fine-tuning run.
training_args = TrainingArguments(    
    output_dir = "./results",  # checkpoints are written below this folder
    report_to = 'all',  # NOTE(review): presumably every installed logging integration - confirm
    learning_rate = 5e-6,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 32,
    num_train_epochs = 4,
    weight_decay = 0.01,
    no_cuda = False,
    fp16 = True,  # NOTE(review): half-precision training; presumably needs a CUDA device - confirm
    evaluation_strategy = 'epoch',  # evaluate on the validation split once per epoch
    logging_strategy = 'epoch',  # log the training loss once per epoch
)
  
# Assemble the Trainer from the model and its training arguments, the tokenized
# train/validation splits, the metric function and the tokenizer/collator pair.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_data_loader['train'],
    eval_dataset=tokenized_data_loader['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

  metric = load_metric("accuracy")
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['c

In [None]:
# Fine-tune the model; the returned training summary is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12240
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3060
  Number of trainable parameters = 124647170
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5062,0.441607,0.809
2,0.4156,0.432746,0.82
3,0.3811,0.451916,0.809
4,0.3571,0.457699,0.808


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/specia

TrainOutput(global_step=3060, training_loss=0.4149759529462827, metrics={'train_runtime': 478.8512, 'train_samples_per_second': 102.245, 'train_steps_per_second': 6.39, 'total_flos': 2515999466880000.0, 'train_loss': 0.4149759529462827, 'epoch': 4.0})

### Predictions

In [None]:
# Run inference on the test split and keep, for every tweet, the index of the
# class with the highest score.
test_output = trainer.predict(tokenized_data_loader['test'])
test_logits = test_output[0]
test_predictions = np.argmax(test_logits, axis = -1)

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 860
  Batch size = 32


In [None]:
# Build the submission table: one predicted class id per test tweet plus a running id.
# The ids are derived from the number of predictions instead of a hard-coded 860, so
# the cell keeps working when the size of the test set changes.
# NOTE(review): this assumes the test ids are 0..n-1 in file order (as in the preview
# of test.csv); otherwise take them from the 'id' column of the test set.
ids = list(range(len(test_predictions)))
predictions = pd.DataFrame({'label': test_predictions, 'id': ids})

In [None]:
# Write the predictions to predictions.csv without the dataframe index column.
predictions.to_csv('predictions.csv', index = False)

### Save the model

In [None]:
# Save the fine-tuned model (configuration and weights) into /content/.
# NOTE(review): only the model is saved by this call; the tokenizer is not written
# here - confirm that is intended before reloading the model elsewhere.
model.save_pretrained('/content/')

Configuration saved in /content/config.json
Model weights saved in /content/pytorch_model.bin
