In [1]:
%load_ext autoreload
%autoreload 2

## Training PET

In [12]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split, get_sample_weights, get_eval_set
from src.preprocessing import preprocess_data
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, DataCollatorForLanguageModeling, AutoModelForMaskedLM
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, roc_auc_score
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from peft import get_peft_model, LoraConfig, TaskType
from collections import defaultdict
import transformers
from peft import get_peft_model, LoraConfig, TaskType
import re
from bert_score import BERTScorer
import langid
from src.utils import aggregate_samples, evaluate_model, compute_class_weights, remove_hashtag_links, get_first_texts, validate_pet_model
from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import BitsAndBytesConfig

tqdm.pandas()

In [3]:
train_data, test_data = train_test_split()


  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:05<00:00,  2.97it/s]


In [4]:
df = pd.concat(train_data)

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096", cache_dir = '/Data')
df['tokens'] = df['Tweet'].progress_apply(tokenizer.tokenize)

target_words = [
    "goal", "penalty", "halftime", "full-time", "yellow", "red",
    "kickoff", "extra time", "stoppage time", "foul", "offside", "handball",
    "save", "tackle", "dribble", "corner", "substitution", "header",
    "free kick", "throw-in", "assist", "hat-trick", "own goal", "victory",
    "defeat", "draw", "win", "loss", "tie", "comeback", "goalkeeper",
    "striker", "midfielder", "defender", "referee", "fans", "var", "gooal"
]
target_words = set(tokenizer.tokenize(" ".join(target_words)))

def is_valid_text(t):
    for w in t:
        if w in target_words:
            return True
        
    return False

df['is_valid']= df['tokens'].progress_apply(is_valid_text)
# df['lan'] = df['Tweet'].progress_apply(lambda x : langid.classify(x)[0])

100%|██████████| 1472980/1472980 [01:33<00:00, 15762.42it/s]
100%|██████████| 1472980/1472980 [00:01<00:00, 986963.01it/s] 


In [5]:
en_df = df.query("is_valid == 1")#.query("lan == 'en' ")

In [6]:
val_indices = [1,5,12,19]
# val_indices = list(np.random.choice(all_train_indices, 3, replace=False))
# train_indices = list(set(all_train_indices).difference(set(val_indices)))
train_indices = [0,2,7,11,13,18]

train_df = aggregate_samples(en_df, train_indices)
val_df = aggregate_samples(en_df, val_indices)

  return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({
  return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({


In [7]:
train_df['prompt'] = train_df['Tweet'] + "\nIs there a tweet that indicates a football event like Goal, Halftime, Red or Yellow Card or Fulltime ? "+ tokenizer.mask_token
val_df['prompt'] = val_df['Tweet'] + "\nIs there a tweet that indicates a football event like Goal, Halftime, Red or Yellow Card or Fulltime ? "+ tokenizer.mask_token

In [8]:
train_df['label'] = train_df['EventType'].map({0:tokenizer.convert_tokens_to_ids("no"), 1: tokenizer.convert_tokens_to_ids("yes")})
val_df['label'] = val_df['EventType'].map({0:tokenizer.convert_tokens_to_ids("no"), 1: tokenizer.convert_tokens_to_ids("yes")})

In [14]:
class YesNoDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, mask_token="<mask>", max_length=4096, device = 'cuda'):
        self.texts = texts
        self.tokenizer = tokenizer
        self.mask_token = mask_token
        self.max_length = max_length
        self.labels = labels
        self.device = device

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize the text and ensure masking
        text = self.texts[idx]
        inputs = self.tokenizer(
            text, 
            return_tensors="pt", 
            truncation=True, 
            padding="longest", 
            max_length=self.max_length
        )
        input_ids = inputs.input_ids.squeeze()
        
        # Find and mask the token to predict
        mask_index = torch.where(input_ids == self.tokenizer.mask_token_id)[0]
        labels = input_ids.clone()
        labels[:] = -100  # Set all to ignore_index (-100)
        if mask_index.numel() > 0:
            labels[mask_index] = self.labels[idx] # Keep the target at the masked position
        return {
            "input_ids": input_ids.to(self.device),
            "attention_mask": inputs.attention_mask.squeeze().to(self.device),
            "labels": labels.to(self.device)
        }

In [15]:
train_dataset = YesNoDataset(
    train_df['prompt'].tolist(),
    train_df['label'].tolist(),
    tokenizer,
    tokenizer.mask_token
)

val_dataset = YesNoDataset(
    val_df['prompt'].tolist(),
    val_df['label'].tolist(),
    tokenizer,
    tokenizer.mask_token
)


In [None]:

# Define the data


tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096", cache_dir = '/Data')

# Prepare DataLoader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    # collate_fn=DataCollatorForLanguageModeling(
    #     tokenizer=tokenizer,
    #     mlm=True,
    #     mlm_probability=0.15
    # )
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=True,
    # collate_fn=DataCollatorForLanguageModeling(
    #     tokenizer=tokenizer,
    #     mlm=True,
    #     mlm_probability=0.15
    # )
)
allowed_tokens = tokenizer.convert_tokens_to_ids(["no", "yes"])
# Load the model
# model = AutoModelForMaskedLM.from_pretrained("allenai/longformer-base-4096", cache_dir = '/Data')
# model = model.to("cuda")

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader):
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")

        with torch.autocast( device_type = 'cuda'):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Shape: (batch_size, seq_len, vocab_size)

        # Create a mask to identify `[MASK]` positions
            mask_positions = (input_ids == tokenizer.mask_token_id)  # Shape: (batch_size, seq_len)

            # Extract logits for `[MASK]` positions only
            masked_logits = logits[mask_positions]  # Shape: (num_masks, vocab_size)

            # Filter logits to include only "yes" and "no"
            allowed_logits = masked_logits[:, allowed_tokens]  # Shape: (num_masks, len(allowed_tokens))

            # Create the corresponding target labels (mapped to indices in allowed_tokens)
            target_labels = labels[mask_positions]  # Shape: (num_masks)
            remapped_labels = torch.zeros_like(target_labels)
            for i, token_id in enumerate(allowed_tokens):
                remapped_labels[target_labels == token_id] = i

            # Compute loss only for `[MASK]` tokens
            loss_fn = torch.nn.CrossEntropyLoss()  # No ignore_index needed as we filter positions
            loss = loss_fn(allowed_logits, remapped_labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    y_pred_val , y_true_val = validate_pet_model(model, val_dataloader, tokenizer, allowed_tokens)
    

100%|██████████| 780/780 [01:25<00:00,  9.15it/s]
Validation: 100%|██████████| 487/487 [00:13<00:00, 37.38it/s]


Accuracy: 0.5585
[[  0 215]
 [  0 272]]


100%|██████████| 780/780 [01:25<00:00,  9.16it/s]
Validation: 100%|██████████| 487/487 [00:12<00:00, 37.64it/s]


Accuracy: 0.5585
[[  0 215]
 [  0 272]]


100%|██████████| 780/780 [01:25<00:00,  9.16it/s]
Validation: 100%|██████████| 487/487 [00:12<00:00, 37.53it/s]

Accuracy: 0.5585
[[  0 215]
 [  0 272]]





In [32]:
remapped_labels

tensor([0], device='cuda:0')

In [None]:
masked_probs.

tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
       device='cuda:0', grad_fn=<SumBackward1>)