<a href="https://colab.research.google.com/github/mstMetaly/CSE102-Structured-Programming-Language-Sessional/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup

# Pick the compute device: the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
if device.type != 'cuda':
    # No GPU visible: report it and request an exit.
    print("CUDA not available. Exiting.")
    exit(0)
else:
    # Report which GPU we got and how much of its memory is in use (in GiB).
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 1024**3, 1), 'GB')
# Release any cached GPU memory before the heavy cells run.
torch.cuda.empty_cache()

cuda
Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [2]:
# Load the data
# DATA_DIR is the only thing to change when the CSVs live somewhere else.
DATA_DIR = '/content/sample_data'
df_train = pd.read_csv(os.path.join(DATA_DIR, 'Corona_NLP_train.csv'), encoding='ISO-8859-1')
df_test = pd.read_csv(os.path.join(DATA_DIR, 'Corona_NLP_test.csv'), encoding='ISO-8859-1')

# Display the first few rows of BOTH dataframes to check they loaded correctly.
# A notebook only renders the last bare expression of a cell, so a bare
# `df_train.head()` followed by `df_test.head()` would show the test frame only.
display(df_train.head(), df_test.head())

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [3]:
# Data preprocessing function
# Patterns are compiled once instead of on every row.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')   # http(s):// links and bare www. links
_MENTION_RE = re.compile(r'@\S+')                 # @user mentions
_SEPARATOR_RE = re.compile(r'[#.,]')              # become spaces so "a.b" / "#tag" keep word boundaries
_PUNCTUATION_RE = re.compile(r'[^\w\s]')          # everything that is neither a word char nor whitespace
_WHITESPACE_RE = re.compile(r'\s+')               # any whitespace run, including \r and \n


def preprocess(row):
    """Normalise one raw tweet into plain space-separated words.

    Steps: drop URLs and @mentions, turn '#', '.' and ',' into spaces, delete
    all remaining punctuation, then collapse whitespace (including newlines)
    to single spaces and strip the ends.

    Args:
        row: The tweet text. Missing values (``None`` / NaN, as pandas uses for
            empty cells) yield ``''``; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    if not isinstance(row, str):
        # `re.sub` raises TypeError on float NaN, which is what pandas hands to
        # `.apply` for an empty cell; map missing values to an empty tweet.
        if row is None or pd.isna(row):
            return ''
        row = str(row)

    text = _URL_RE.sub(' ', row)
    text = _MENTION_RE.sub(' ', text)
    text = _SEPARATOR_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Apply preprocessing to train and test data
df_train['CleanTweet'] = df_train['OriginalTweet'].apply(preprocess)
df_test['CleanTweet'] = df_test['OriginalTweet'].apply(preprocess)

# Select only the relevant columns.
# `.copy()` makes these independent frames, so the in-place edits and column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df_train = df_train[['CleanTweet', 'Sentiment']].copy()
df_test = df_test[['CleanTweet', 'Sentiment']].copy()

# Display the cleaned tweets of BOTH frames (a bare mid-cell `.head()` is
# discarded; only the last expression of a cell is rendered).
display(df_train.head(), df_test.head())


Unnamed: 0,CleanTweet,Sentiment
0,TRENDING New Yorkers encounter empty supermark...,Extremely Negative
1,When I couldnt find hand sanitizer at Fred Mey...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,Panic buying hits NewYork City as anxious shop...,Negative
4,toiletpaper dunnypaper coronavirus coronavirus...,Neutral


In [4]:
# Remove duplicates (tweets that are identical after cleaning).
# Reassigning instead of `inplace=True` avoids mutating a frame that was
# derived from another one.
df_train = df_train.drop_duplicates(subset='CleanTweet')
df_test = df_test.drop_duplicates(subset='CleanTweet')

# Filter out empty tweets. `.copy()` detaches the filtered result so the
# column assignments in the following cells write to a real frame rather than
# to a slice.
df_train = df_train[df_train['CleanTweet'].str.strip() != ''].copy()
df_test = df_test[df_test['CleanTweet'].str.strip() != ''].copy()

# Check the cleaned data
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41005 entries, 0 to 41156
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CleanTweet  41005 non-null  object
 1   Sentiment   41005 non-null  object
dtypes: object(2)
memory usage: 961.1+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 3792 entries, 0 to 3797
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CleanTweet  3792 non-null   object
 1   Sentiment   3792 non-null   object
dtypes: object(2)
memory usage: 88.9+ KB


In [5]:
# Map sentiment values to numerical labels.
# The five string sentiments are collapsed into three classes:
# 0 = negative, 1 = neutral, 2 = positive.
SENTIMENT_TO_LABEL = {
    'Extremely Negative': 0,
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Extremely Positive': 2,
}


def encode_sentiment(sentiment):
    """Convert a Series of sentiment strings into integer class labels.

    Args:
        sentiment: pandas Series holding the keys of SENTIMENT_TO_LABEL, or
            labels that were already encoded by a previous run of this cell.

    Returns:
        An int64 Series of labels with the same index as `sentiment`.

    Raises:
        ValueError: if any value is neither a known sentiment string nor an
            already-encoded label (a plain `.map` would turn it into NaN).
    """
    # Re-running the cell must not map the already-encoded integers to NaN.
    if sentiment.isin(set(SENTIMENT_TO_LABEL.values())).all():
        return sentiment.astype('int64')

    encoded = sentiment.map(SENTIMENT_TO_LABEL)
    unknown = sentiment[encoded.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(f'Unmapped Sentiment values: {list(unknown)}')
    return encoded.astype('int64')


df_train['Sentiment'] = encode_sentiment(df_train['Sentiment'])
df_test['Sentiment'] = encode_sentiment(df_test['Sentiment'])

# Check the distribution of sentiments
print(df_train['Sentiment'].value_counts())
print(df_test['Sentiment'].value_counts())

Sentiment
2    17985
0    15357
1     7663
Name: count, dtype: int64
Sentiment
0    1632
2    1545
1     615
Name: count, dtype: int64


In [6]:
# Carve a stratified 15% validation set out of the training frame.
# The split is done on index labels so rows can be tagged in place below.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.index.values,
    df_train.Sentiment.values,
    test_size=0.15,
    random_state=42,
    stratify=df_train.Sentiment.values,
)

# Tag every row with the subset it belongs to; 'not_set' is the placeholder
# that the two assignments below overwrite.
df_train['data_type'] = 'not_set'
df_train.loc[X_train, 'data_type'] = 'train'
df_train.loc[X_val, 'data_type'] = 'val'

# Row counts per subset
df_train['data_type'].value_counts()

data_type
train    34854
val       6151
Name: count, dtype: int64

In [7]:
# Initiating BertTokenizer from 'bert-base-uncased' model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def encode_texts(texts):
    """Tokenize a collection of cleaned tweets with the shared `tokenizer`.

    All three splits go through this one function so their encoding settings
    cannot drift apart.

    Args:
        texts: iterable of strings (e.g. the `.values` of a text column).

    Returns:
        The tokenizer's batch encoding with PyTorch tensors; the cells below
        read its 'input_ids' and 'attention_mask' entries.
    """
    return tokenizer.batch_encode_plus(
        list(texts),               # hand the tokenizer a plain list of str
        add_special_tokens=True,
        return_attention_mask=True,
        padding='longest',
        truncation=True,
        return_tensors='pt',
    )


# Tokenize the training data
encoded_data_train = encode_texts(df_train[df_train.data_type=='train'].CleanTweet.values)

# Tokenize the validation data
encoded_data_val = encode_texts(df_train[df_train.data_type=='val'].CleanTweet.values)

# Tokenize the test data
encoded_data_test = encode_texts(df_test.CleanTweet.values)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Training split: token ids, attention masks and labels.
# Labels are selected with the same `data_type` filter that was used when
# tokenizing, so they line up row-for-row with the encodings.
input_ids_train, attention_masks_train = (
    encoded_data_train['input_ids'],
    encoded_data_train['attention_mask'],
)
labels_train = torch.tensor(df_train.loc[df_train.data_type == 'train', 'Sentiment'].values)

# Validation split
input_ids_val, attention_masks_val = (
    encoded_data_val['input_ids'],
    encoded_data_val['attention_mask'],
)
labels_val = torch.tensor(df_train.loc[df_train.data_type == 'val', 'Sentiment'].values)

# Test split (the whole test frame)
input_ids_test, attention_masks_test = (
    encoded_data_test['input_ids'],
    encoded_data_test['attention_mask'],
)
labels_test = torch.tensor(df_test['Sentiment'].values)

# Bundle each split as (input_ids, attention_mask, label) triples.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# Sanity check: number of examples per split
print(len(dataset_train), len(dataset_val))
print(len(dataset_test))

34854 6151
3792


In [9]:
# Pretrained BERT encoder with a 3-way classification head on top
# (attention maps and hidden states are not returned).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

# Number of examples per batch, shared by all three loaders
batch_size = 4

# Training batches are drawn in random order ...
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size,
)

# ... while validation and test batches keep the dataset order.
dataloader_validation = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size,
)

dataloader_test = DataLoader(
    dataset_test,
    sampler=SequentialSampler(dataset_test),
    batch_size=batch_size,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Number of passes over the training set
epochs = 2

# Optimizer over all model parameters
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# Linear learning-rate schedule without warmup, spanning every optimizer step
# of the run (batches per epoch x epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train),
)




In [11]:
def f1_score_func(preds, labels):
    """Weighted F1 score of per-class scores against the true labels.

    Args:
        preds: array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
        labels: array of true class ids (flattened before scoring).

    Returns:
        The value of `f1_score(..., average='weighted')`.
    """
    predicted_classes = np.argmax(preds, axis=1).reshape(-1)
    true_classes = labels.reshape(-1)
    return f1_score(true_classes, predicted_classes, average='weighted')

def evaluate(dataloader_val):
    """Run the global `model` over a dataloader without computing gradients.

    Args:
        dataloader_val: iterable of (input_ids, attention_mask, labels)
            tensor batches.

    Returns:
        A tuple `(loss_val_avg, predictions, true_vals)`: the summed batch
        loss divided by the number of batches, the logits of all batches
        concatenated along axis 0, and the matching labels concatenated along
        axis 0 (both as NumPy arrays).
    """
    # Switch the model to evaluation mode.
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:
        # Move every tensor of the batch to the compute device.
        on_device = [tensor.to(device) for tensor in batch]
        input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # outputs[0] is the loss, outputs[1] the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        logits_per_batch.append(batch_logits.detach().cpu().numpy())
        labels_per_batch.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)

    return (loss_val_avg,
            np.concatenate(logits_per_batch, axis=0),
            np.concatenate(labels_per_batch, axis=0))

In [14]:
# Set seed values for reproducibility.
# NOTE: the classification head was randomly initialised when the model was
# loaded in an earlier cell, i.e. before these seeds are set, so the seeds
# below only pin what happens from here on (e.g. batch shuffling).
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Guard against hidden state: the scheduler is created in a separate cell and
# decays the learning rate linearly to zero over exactly `epochs` passes.
# Re-running this cell without re-creating it would keep training on an
# already trained model with a decayed (eventually zero) learning rate and
# report misleading metrics. `last_epoch` is 0 for a freshly built scheduler.
if scheduler.last_epoch > 0:
    raise RuntimeError(
        'The LR scheduler has already been stepped '
        f'({scheduler.last_epoch} steps). Re-run the cells that create the '
        'model, optimizer and scheduler before training again.'
    )

# Move the model to the GPU
model.to(device)

for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # outputs[0] is the loss for this batch.
        loss = outputs[0]
        loss_value = loss.item()
        loss_train_total += loss_value
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Report the batch loss as returned by the model. `len(batch)` is the
        # number of tensors in the tuple (3), not the number of examples, so
        # dividing by it would only distort the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss_value)})

    # Checkpoint the weights after every epoch.
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/8714 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.21751330172154
Validation loss: 0.5325504340148838
F1 Score (Weighted): 0.8919667177864463


Epoch 2:   0%|          | 0/8714 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.32647807646857285
Validation loss: 0.5325504340148838
F1 Score (Weighted): 0.8919667177864463


In [15]:
# Load the trained model: the checkpoint written after the final epoch.
# The file name is derived from `epochs` so it stays correct when the number
# of epochs is changed, and `map_location` puts the weights on the active
# device regardless of where they were saved from.
final_checkpoint = f'finetuned_BERT_epoch_{epochs}.model'
model.load_state_dict(torch.load(final_checkpoint, map_location=device))

# Evaluate the model on the test data
test_loss, test_predictions, test_true_vals = evaluate(dataloader_test)
test_f1 = f1_score_func(test_predictions, test_true_vals)

# Print test results
print(f'Test Loss: {test_loss}')
print(f'Test F1 Score (Weighted): {test_f1}')

Test Loss: 0.5817227166826112
Test F1 Score (Weighted): 0.8859058243553678
