In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Exploratory data analysis

In [38]:
import os
# Walk up the directory tree until the working directory's path no longer
# contains 'notebooks' — presumably so the `src.*` imports below resolve
# from the project root (TODO confirm against the repo layout).
# NOTE(review): this is a substring test on the whole path, so any ancestor
# directory whose name contains 'notebooks' also triggers a step upwards.
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [9]:
# `train_test_split` is the project helper imported from src.utils, not
# scikit-learn's function of the same name. `train_dict` is used further down
# as a mapping: its keys are compared against the MatchID column and its
# values are concatenated into one DataFrame.
train_dict, test_dict = train_test_split()

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:04<00:00,  3.55it/s]


In [8]:
# Load the pretrained "bert-base-uncased" tokenizer and a sequence-classification
# model built on it; downloaded files are cached under /Data.
# NOTE(review): the K-fold cell further down rebinds `model` to a freshly
# loaded instance on every fold, so the instance created here is not the one
# that gets trained.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir = '/Data')
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir = '/Data')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Stack every frame held in train_dict into a single DataFrame. ignore_index is
# not set, so each row keeps the index label it had in its source frame.
train_df = pd.concat(train_dict.values())

In [13]:
# Display the combined training frame (the notebook repr elides the middle rows).
train_df

Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,0_0,0,0,0,1403725800000,RT @soccerdotcom: Pick a #WorldCup Side! \nRet...
1,0_0,0,0,0,1403725800000,RT @FOXSoccer: #WorldCup Group E standings:\n\...
2,0_0,0,0,0,1403725801000,#honduras vs #ecuador - current tweets: 852:27...
3,0_0,0,0,0,1403725802000,Ecuador have not picked up any injuries and th...
4,0_0,0,0,0,1403725802000,RT @TheSCRLife: If #HON wins we’ll give away a...
...,...,...,...,...,...,...
155544,19_129,19,129,1,1403560800000,FINAL GROUP A STANDINGS:\n#BRA 7 pts (+5 GD)\n...
155545,19_129,19,129,1,1403560800000,RT @FIFAWorldCup: GROUP A #WORLDCUP RESULTS:\n...
155546,19_129,19,129,1,1403560800000,RT @PurelyFootball: World Cup 2014 so far: \n\...
155547,19_129,19,129,1,1403560800000,RT @FootballFunnys: World Cup 2014 so far: \n\...


In [76]:

def train_model(train_dataloader, model, optimizer, n_epochs : int = 10, device : str = 'cuda', accumulation_steps : int = 1):
    """Train `model` on `train_dataloader` under autocast and print metrics.

    Every 10 batches the accuracy and confusion matrix of the batches seen
    since the previous report are printed. At the end of each epoch the summed
    batch loss, the accuracy and the F1 score (scikit-learn defaults) over
    the WHOLE epoch are printed.

    Args:
        train_dataloader: Iterable of batches indexable by "input_ids",
            "attention_mask" and "label", each holding a tensor.
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output exposes ``.loss`` and ``.logits``. The model is not moved
            here; the caller is expected to have placed it on `device`.
        optimizer: Optimizer that updates the model's parameters.
        n_epochs: Number of passes over `train_dataloader`.
        device: Device the batch tensors are moved to. Its device type also
            selects the autocast backend.
        accumulation_steps: Number of batches whose gradients are accumulated
            before each optimizer step. Must be >= 1.

    Raises:
        ValueError: If `accumulation_steps` is smaller than 1.

    Returns:
        None. Results are only printed.
    """
    if accumulation_steps < 1:
        raise ValueError(f"accumulation_steps must be >= 1, got {accumulation_steps}")

    # autocast wants the bare device type ('cuda' / 'cpu'), not e.g. 'cuda:0'.
    device_type = torch.device(device).type

    model.train()

    for epoch in range(n_epochs):
        # Epoch-wide buffers feed the end-of-epoch metrics; the window buffers
        # feed the periodic progress report and are the only ones reset.
        epoch_preds = []
        epoch_labels = []
        window_preds = []
        window_labels = []
        epoch_loss = 0
        n_batches = 0

        # Start every epoch from clean gradients.
        optimizer.zero_grad()

        for i, batch in enumerate(tqdm(train_dataloader)):

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # NOTE(review): on CUDA, autocast defaults to float16 and no
            # GradScaler is used here, so very small gradients may underflow.
            with torch.autocast(device_type=device_type):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # Scale so that the accumulated gradient is the mean over the
            # accumulated batches rather than their sum.
            (loss / accumulation_steps).backward()
            n_batches = i + 1

            if n_batches % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            preds = torch.argmax(outputs.logits, dim=1)
            batch_preds = preds.cpu().numpy()
            batch_labels = labels.cpu().numpy()
            epoch_preds.extend(batch_preds)
            epoch_labels.extend(batch_labels)
            window_preds.extend(batch_preds)
            window_labels.extend(batch_labels)

            # Logged loss is the unscaled per-batch loss.
            epoch_loss += loss.item()

            if i % 10 == 0:
                clear_output()
                print(f"Batch accuracy {accuracy_score(window_labels, window_preds)}")
                print(f"{confusion_matrix(window_labels, window_preds)}")

                window_labels = []
                window_preds = []

        # Apply gradients left over from an incomplete accumulation group so
        # they are neither dropped nor carried into the next epoch.
        if n_batches % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        acc = accuracy_score(epoch_labels, epoch_preds)
        f1 = f1_score(epoch_labels, epoch_preds)

        clear_output()
        print(f"---------- Epoch {epoch} ------------")
        print(f"Training Loss : {epoch_loss}\n")
        print(f"Training Accuracy : {acc}\n")
        print(f"Training F1 : {f1}\n")

In [84]:
def evaluate_model(val_dataloader, model, device : str = 'cuda'):
    """Score `model` on `val_dataloader` and print accuracy and F1.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must be indexable by "input_ids", "attention_mask" and "label";
    those tensors are moved to `device` before the forward pass. The predicted
    class is the argmax of the logits along dimension 1.

    Args:
        val_dataloader: Iterable of validation batches.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(predictions, labels)`` pair of flat lists, in loader order.
    """
    batch_keys = ("input_ids", "attention_mask", "label")
    predicted = []
    expected = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            # Move the three tensors across in a fixed order.
            on_device = {key: batch[key].to(device) for key in batch_keys}

            outputs = model(
                input_ids=on_device["input_ids"],
                attention_mask=on_device["attention_mask"],
            )
            batch_pred = outputs.logits.argmax(dim=1)

            predicted.extend(batch_pred.cpu().numpy())
            expected.extend(on_device["label"].cpu().numpy())

    # Compute both metrics before printing anything.
    accuracy = accuracy_score(expected, predicted)
    f1_value = f1_score(expected, predicted)

    print(f"Validation Accuracy : {accuracy}\n")
    print(f"Validation F1 : {f1_value}\n")

    return predicted, expected

In [None]:
# K Fold CV
# Each entry of train_dict is held out in turn as the validation fold; rows of
# train_df with a different MatchID form the training pool. A fresh model is
# fine-tuned per fold and its predictions on the held-out fold are collected.

device = 'cuda'
N_TRAIN_SAMPLES = 10_000   # rows subsampled from the training pool per fold
BATCH_SIZE = 32
N_EPOCHS = 2
LEARNING_RATE = 5e-5

final_results = []
for validation_set_id, validation_data in train_dict.items():
    train_pool = train_df.query(f"MatchID != {validation_set_id}")
    # DataFrame.sample raises when asked for more rows than the frame holds
    # (without replacement), so cap the request at the pool size.
    train_data = train_pool.sample(min(N_TRAIN_SAMPLES, len(train_pool)), random_state=1)

    train_dataset = TextDataset(
        train_data["Tweet"].tolist(), train_data["EventType"].tolist(), tokenizer
    )
    val_dataset = TextDataset(
        validation_data["Tweet"].tolist(), validation_data["EventType"].tolist(), tokenizer
    )

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Re-load the pretrained weights so no fold starts from a model that has
    # already seen another fold's validation data.
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2, cache_dir = '/Data').to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # Train the model. `device` is passed explicitly so the batches always
    # land on the same device as the model.
    train_model(train_dataloader, model, optimizer, n_epochs=N_EPOCHS, device=device)

    # Evaluate the model
    preds, labels = evaluate_model(val_dataloader, model, device=device)

    # Combine results for this fold; val_dataloader is not shuffled, so the
    # predictions line up with validation_data's row order.
    validation_results = pd.DataFrame({
        "MatchID": validation_data["MatchID"].values,
        "true_values": labels,
        "predictions": preds,
    })

    final_results.append(validation_results)

 90%|█████████ | 282/313 [00:19<00:02, 14.27it/s]

Batch accuracy 0.925
[[110  11]
 [ 13 186]]


 92%|█████████▏| 287/313 [00:20<00:01, 14.30it/s]


KeyboardInterrupt: 

In [85]:
# NOTE(review): `val_dataloader` and `model` are whatever the K-fold cell above
# left in the kernel. That cell's recorded output ends in KeyboardInterrupt
# during training, so this scores the model of the fold that was running when
# it was interrupted, not a fully trained one. Re-running on a fresh kernel
# will not reproduce this state.
preds, labels = evaluate_model(val_dataloader, model)

  0%|          | 0/1299 [00:00<?, ?it/s]

100%|██████████| 1299/1299 [01:25<00:00, 15.21it/s]

Validation Accuracy : 0.5922145453670045

Validation F1 : 0.6817712149392249




