In [1]:
%load_ext autoreload
%autoreload 2

## Training Transformer

In [17]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split, get_sample_weights, get_eval_set
from src.preprocessing import preprocess_data
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, roc_auc_score
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from peft import get_peft_model, LoraConfig, TaskType
from collections import defaultdict
import transformers
from peft import get_peft_model, LoraConfig, TaskType
import re
from bert_score import BERTScorer
import langid
from src.utils import aggregate_samples, evaluate_model, compute_class_weights, remove_hashtag_links, get_first_texts
from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import BitsAndBytesConfig, DataCollatorWithPadding

tqdm.pandas()

In [26]:
data = pd.read_pickle("summary.pkl")
data['match_id'] = data['text_idx'].apply(lambda x: x[0])
data['period_id'] = data['text_idx'].apply(lambda x: x[1])

data = data.set_index(['match_id', 'period_id'])

In [27]:
# possible_indices = set(train_data.keys())

# test_indices = list(np.random.choice(list(possible_indices), size=3, replace = False,))
# test_indices = [13,1,18]
# all_train_indices = list(possible_indices.difference(set(test_indices)))
val_indices = [1,5,4,12,19]
# val_indices = list(np.random.choice(all_train_indices, 3, replace=False))
# train_indices = list(set(all_train_indices).difference(set(val_indices)))
train_indices = [0,3,2,7,11,]

test_indices = [14,17,13, 18]

train_df = data.query(f"match_id in {train_indices}")
val_df = data.query(f"match_id in {val_indices}")

In [12]:
# train_df = remove_hashtag_links(train_df)
# test_df = remove_hashtag_links(test_df)
# val_df = remove_hashtag_links(val_df)

In [13]:
test_indices

[14, 17, 13, 18]

## Training the model

In [28]:
train_df['ID'] = 0
val_df['ID'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['ID'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['ID'] = 0


In [29]:
from transformers import DataCollatorWithPadding

In [33]:
# K Fold CV


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", cache_dir = '/Data')
# tokenizer.pad_token = tokenizer.eos_token


device = 'cuda'
final_results = []


train_dataset = TextDataset(
    train_df["generated_text"].tolist(), 
    train_df['ID'].tolist(),
    train_df["label"].tolist(), 
    tokenizer,
    train_df.index.get_level_values("match_id").tolist()

)

val_dataset = TextDataset(
    val_df["generated_text"].tolist(), 
    val_df['ID'].tolist(),
    val_df["label"].tolist(), 
    tokenizer,
    val_df.index.get_level_values("match_id").tolist()
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, )
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", cache_dir = '/Data', num_labels = 2,ignore_mismatched_sizes=True)
# for p in base_model.model.parameters():
#     p.requires_grad = False
model.to(device)

# model = get_peft_model(base_model, lora_config)
# for param in model.bert.parameters():
#     param.requires_grad = False

for name, param in model.bert.named_parameters():
    if "layer.11" in name or "layer.10":  # Unfreeze last two layers
        param.requires_grad = True

    else:
        param.requires_grad = False
    
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=2, verbose=True)


best_model = None
best_acc = -1

n_epochs = 10

labels = train_df["label"].tolist()

class_weights = compute_class_weights(train_df['label']).to(device)
# class_weights = torch.Tensor([0.6,0.4]).to(device)

# Define weighted loss function
loss_fn = torch.nn.CrossEntropyLoss(class_weights)

for epoch in range(n_epochs):
    all_preds = []
    all_labels = []
    epoch_loss = 0

    print(train_indices, val_indices)
    for i, batch in enumerate(tqdm(train_dataloader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with torch.autocast( device_type = 'cuda'):
            # outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_feature = count)
            # loss = loss_fn(outputs, labels.squeeze() )

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels = labels)
            loss = loss_fn(outputs.logits, labels)
        loss.backward()

        
        optimizer.step()
        optimizer.zero_grad()

        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        epoch_loss += loss.item()

    acc = accuracy_score(all_labels, all_preds, )
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)


    
    print(f"---------- Epoch {epoch} ------------")
    print(f"Training Loss : {epoch_loss}\n")
    print(f"Training Accuracy : {acc}\n")
    print(f"Training Precision : {precision}\n")
    print(conf_matrix)


    preds, labels = evaluate_model(val_df, val_dataloader, model)

    y_pred_val = pd.Series(preds, index=val_df.index)
    y_true_val = pd.Series(labels, index= val_df.index)
    combined_acc = pd.concat([y_pred_val, y_true_val], axis = 1)\
        .groupby("match_id")\
        .apply(lambda x: accuracy_score(x[1], x[0]))


    print(combined_acc)
    scheduler.step(combined_acc.min())

    if combined_acc.min() > best_acc:
        best_acc = combined_acc.min()
        best_model = deepcopy(model)


# Combine results for this fold
# validation_results = pd.DataFrame({
#     "MatchID": validation_data["MatchID"].values,
#     "true_values": labels,
#     "predictions": preds,
# })

# final_results.append(validation_results)

Validation Accuracy : 0.5768645357686454

Validation auc : 0.586009110346751

[[232  78]
 [200 147]]
match_id
1     0.576923
4     0.570588
5     0.584615
12    0.639175
19    0.530769
dtype: float64
[0, 3, 2, 7, 11] [1, 5, 4, 12, 19]


  0%|          | 0/82 [00:00<?, ?it/s]

 52%|█████▏    | 43/82 [00:03<00:03, 12.26it/s]


KeyboardInterrupt: 

In [22]:
y_true_val

130     0
131     0
132     0
133     0
134     1
       ..
2132    1
2133    1
2134    1
2135    1
2136    1
Length: 657, dtype: int64

In [92]:
best_acc

0.6538461538461539

In [96]:
test_dataset = TextDataset(
    test_df["Tweet"].tolist(), 
    test_df['ID'].tolist(),
    test_df["EventType"].tolist(), 
    tokenizer,
    test_df.index.get_level_values("MatchID").tolist()
)

test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)


preds, labels = evaluate_model(test_df, test_dataloader, best_model, extra_feature=False)

Validation Accuracy : 0.6307692307692307

Validation auc : 0.6267440295494311

[[137  96]
 [ 96 191]]


In [98]:
y_pred_val = pd.Series(preds, index=test_df.index)
y_true_val = pd.Series(labels, index= test_df.index)
combined_acc = pd.concat([y_pred_val, y_true_val], axis = 1)\
    .groupby("MatchID")\
    .apply(lambda x: accuracy_score(x[1], x[0]))


print(combined_acc)

MatchID
13    0.676923
14    0.515385
17    0.676923
18    0.653846
dtype: float64


### Generating predictions

In [93]:
total_test_df = get_eval_set().set_index(["MatchID", "PeriodID"])
test_df = preprocess_data(total_test_df)




  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:01<00:00,  3.36it/s]


In [73]:
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096", cache_dir = '/Data')
test_df['tokens'] = test_df['Tweet'].progress_apply(tokenizer.tokenize)

target_words = [
    "goal", "penalty", "halftime", "full-time", "yellow", "red",
    "kickoff", "extra time", "stoppage time", "foul", "offside", "handball",
    "save", "tackle", "dribble", "corner", "substitution", "header",
    "free kick", "throw-in", "assist", "hat-trick", "own goal", "victory",
    "defeat", "draw", "win", "loss", "tie", "comeback", "goalkeeper",
    "striker", "midfielder", "defender", "referee", "fans", "var", "gooal"
]
target_words = set(tokenizer.tokenize(" ".join(target_words)))

def is_valid_text(t):
    for w in t:
        if w in target_words:
            return True
        
    return False

test_df['is_valid']= test_df['tokens'].progress_apply(is_valid_text)

100%|██████████| 362397/362397 [00:26<00:00, 13903.77it/s]
100%|██████████| 362397/362397 [00:00<00:00, 719927.55it/s]


In [74]:
# test_df['language'] = test_df['lan'].apply(lambda x: x[0])
test_df['language'] = "en"

# test_df_en = test_df.query("language == 'en' ")
test_df_en = test_df.query("is_valid == 1")

In [75]:
processed_test_df = test_df_en.groupby(["MatchID", "PeriodID"]).agg({
    "Tweet":   lambda x: get_first_texts(x, max_size=15),
    "ID": len
})

processed_test_df = remove_hashtag_links(processed_test_df)

In [76]:
processed_test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Tweet,ID
MatchID,PeriodID,Unnamed: 2_level_1,Unnamed: 3_level_1
6,0,Fascinated for this match. This will tell us ...,82
6,1,"""In a few minutes of x ...Can't wait""....Waa...",85
6,2,I want Germany to win. \n Germany Vs. Ghana ki...,79
6,3,Let's go ! Hoping for Schweinsteiger to at lea...,110
6,4,Come on black for the win! Down those wanke...,151
...,...,...,...
16,125,T-minus 5 minutes until my morning productivit...,137
16,126,Stat comparison between Serbia & Germany s int...,115
16,127,Zum GlÃ¼ck hat wenigstens KEINER beim Tippspie...,116
16,128,ç›´å‰äºˆæƒ³0-0ã¯å¤–ã‚Œã€‚ã‚»ãƒ«ãƒ“ã‚¢å„ªä½ã...,109


In [77]:
test_dataset = TextDataset(
    processed_test_df["Tweet"].tolist(), 
    processed_test_df['ID'].tolist(), 
    None,
    tokenizer,
    [0] * 516
)

test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=data_collator)

In [78]:
preds, labels, probas = evaluate_model(processed_test_df, test_dataloader, best_model, use_labels=False, return_proba=True)

  0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:12<00:00,  1.33it/s]


In [79]:
processed_test_df['EventType'] = preds

In [80]:
processed_test_df['EventType'].value_counts(normalize = True)

EventType
0    0.643411
1    0.356589
Name: proportion, dtype: float64

In [81]:
pd.merge(
    total_test_df,
    processed_test_df[["Tweet", "EventType"]],
    left_index=True,
    right_index=True

)[['EventType','ID']]\
    .drop_duplicates("ID")\
    .set_index("ID")\
    .to_csv("predictions_2.csv")

EventType
0    0.831609
1    0.168391
Name: proportion, dtype: float64