<a href="https://colab.research.google.com/github/lostgethsemane/Medium_Code_Examples/blob/master/Deep_Learning_Method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [94]:
import torch
import pandas as pd
import numpy as np
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch.optim as optim

In [95]:
# Read the train and test splits from their CSV files (paths are relative to
# the current working directory).
df_train, df_test = map(pd.read_csv, ('dreaddit-train.csv', 'dreaddit-test.csv'))

In [96]:
# Keep only the rows whose whitespace-stripped text has at least 50 characters.
# A keep-mask that builds a new frame (rather than drop(..., inplace=True))
# leaves this cell safe to re-run, and rows with missing text compare False
# against the threshold, so they are filtered out here instead of reaching the
# tokenizer later.
df_train = df_train[df_train['text'].str.strip().str.len() >= 50]
df_test = df_test[df_test['text'].str.strip().str.len() >= 50]

In [97]:
# Pull the text and label columns out of each frame as plain Python lists.
X_train, y_train = (df_train[column].tolist() for column in ('text', 'label'))
X_test, y_test = (df_test[column].tolist() for column in ('text', 'label'))

In [98]:
from transformers import BertTokenizer

# Tokenizer loaded from the same checkpoint name that the model cell uses.
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Both splits are encoded with identical options: padding and truncation
# enabled, results returned as PyTorch tensors.
encode_options = dict(padding=True, truncation=True, return_tensors='pt')
X_train_tokens = tokenizer(X_train, **encode_options)
X_test_tokens = tokenizer(X_test, **encode_options)

In [106]:
from torch.utils.data import DataLoader, TensorDataset

# Defining batch size
batch_size = 128

# Converting tokenized data to PyTorch tensors.
# Only the `input_ids` entry of each tokenizer output is kept; the other
# tokenizer outputs are not carried into the datasets built below.
X_train_tensor = X_train_tokens['input_ids'].clone().detach()
y_train_tensor = torch.tensor(y_train)

X_test_tensor = X_test_tokens['input_ids'].clone().detach()
y_test_tensor = torch.tensor(y_test)

# Creating DataLoader for training data (reshuffled on every pass)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Creating DataLoader for testing data (order preserved, so prediction i
# lines up with X_test[i] / y_test[i])
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Each loader is reported under its own name (the second line used to be
# mislabelled as the training loader).
print(f"Number of batches in the Training DataLoader: {len(train_dataloader)}")
print(f"Number of batches in the Testing DataLoader: {len(test_dataloader)}")

Number of batches in the Training DataLoader: 23
Number of batches in the Training DataLoader: 6


In [107]:
# initializing the model
# Size the classification head to the label set actually present in y_train.
# The metric calls in the training cell use scikit-learn's default binary
# averaging, which fails if the model can emit a class outside that label set.
# With ignore_mismatched_sizes=True a head whose shape already matches is
# loaded as usual; otherwise it is freshly initialised and trained below.
# NOTE(review): the published checkpoint is understood to ship a 5-output
# (star-rating) head — confirm against the model card.
num_labels = len(set(y_train))
model = AutoModelForSequenceClassification.from_pretrained(
    'nlptown/bert-base-multilingual-uncased-sentiment',
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

# freezing the parameters so these parameters don't get updated
# (only model.base_model is frozen; whatever sits outside it stays trainable)
for p in model.base_model.parameters():
    p.requires_grad = False
params = list(model.named_parameters())

## checking which layers are frozen
# The snippet below is kept as an inert string literal; being the last
# expression of the cell, its repr is what the cell displays.
'''
params = list(model.named_parameters())
print(f"The BERT model has {len(params)} different named parameters.\n")

for name, param in params:
    print(f"{name} - {'Frozen' if not param.requires_grad else 'Not Frozen'} - {tuple(param.size())}")

'''

'\nparams = list(model.named_parameters())\nprint(f"The BERT model has {len(params)} different named parameters.\n")\n\nfor name, param in params:\n    print(f"{name} - {\'Frozen\' if not param.requires_grad else \'Not Frozen\'} - {tuple(param.size())}")\n\n'

In [None]:
# NOTE(review): AdamW is imported but never used in this cell; recent
# transformers releases are reported to have dropped it — confirm and trim.
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Starting time
start_time = time.time()

# Number of epochs
num_epochs = 5

# Optimizer and Scheduler
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# scheduler.step() is called once per batch in every epoch, so the linear decay
# must span all of those steps. Sizing it to a single epoch drove the learning
# rate to zero after the first epoch.
total_training_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_training_steps)

# Loss function (not used below: the loss is read from outputs.loss, which the
# model returns when `labels` is passed)
criterion = torch.nn.CrossEntropyLoss()

best_val_loss = float('inf')
precision_per_epoch = []
recall_per_epoch = []
f1_per_epoch = []
accuracy_per_epoch = []

# Batches are moved to wherever the model's weights live.
device = next(model.parameters()).device
# The datasets hold only input ids, so the attention mask is rebuilt from the
# tokenizer's padding id; without a mask the padded positions are attended to.
pad_token_id = tokenizer.pad_token_id

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch_number, (inputs, labels) in enumerate(train_dataloader, start=1):
        inputs, labels = inputs.to(device), labels.to(device)
        attention_mask = (inputs != pad_token_id).long()
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        print(f'Batch {batch_number} in epoch {epoch+1} finished training.')

    # Validation
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            attention_mask = (inputs != pad_token_id).long()
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            # argmax over the logits picks the same class as argmax over their softmax
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(test_dataloader)
    precision = precision_score(all_labels, all_preds)
    precision_per_epoch.append(precision)
    accuracy = accuracy_score(all_labels, all_preds)
    # Record the accuracy itself (precision used to be appended here by mistake).
    accuracy_per_epoch.append(accuracy)
    recall = recall_score(all_labels, all_preds, zero_division=1)
    recall_per_epoch.append(recall)
    f1 = f1_score(all_labels, all_preds)
    f1_per_epoch.append(f1)

    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}')
    print(f'Precision: {precision}, Recall: {recall}, F1 Score: {f1}, Accuracy Score: {accuracy}')
    # Check for early stopping: stop at the first epoch whose validation loss
    # does not beat the best one seen so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
    else:
        print("Validation loss is not improving. Stopping early.")
        break

# Ending Training
end_time = time.time()

In [None]:
# Report the elapsed time between the start_time and end_time stamps taken
# around the training cell.
print("Time it took to train and evaluate the model: {}".format(end_time - start_time))

In [None]:
# Saving and downloading the model
# google.colab is importable only inside a Colab runtime. Guarding the import
# lets the weights still be saved when the notebook runs elsewhere; `files` is
# None in that case.
try:
    from google.colab import files
except ImportError:
    files = None

# Write the model's state dict to the working directory.
torch.save(model.state_dict(), 'bert_initial_model_MS2.pth')
# To also download the file from Colab, uncomment the next line (requires
# `files` to be available):
#files.download('bert_initial_model_MS2.pth')

In [None]:
# Preparing the data for the analysis
labels_last_epoch = all_labels
predicitions_last_epoch = all_preds
mismatched_indices = [i for i, (true, pred) in enumerate(zip(labels_last_epoch, predicitions_last_epoch)) if true != pred]

In [None]:
# Checking the mismatch for one example
# (statements dedented to column 0: a stray leading space after a column-0
# comment raises IndentationError)
# NOTE(review): index 0 is simply the first test text; pick an entry of
# mismatched_indices to look at a text the model actually got wrong.
text = X_test[0]
print(text)