In [1]:
%load_ext autoreload
%autoreload 2

## Training Transformer

In [2]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split, get_sample_weights, get_eval_set
from src.preprocessing import preprocess_data
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, roc_auc_score
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.utils.class_weight import compute_class_weight
from collections import defaultdict
import transformers
from peft import get_peft_model, LoraConfig, TaskType
import re
from bert_score import BERTScorer
import langid

from transformers import BitsAndBytesConfig

tqdm.pandas()

In [3]:
scorer = BERTScorer(model_type='bert-base-uncased',num_layers = 12, device='cuda')



In [3]:
def get_first_texts(x, max_size = 10):
    size = x.apply(lambda x: len(x.split(" ")))\
        .sort_values()
    
    x = x.reindex_like(size)
    mask = size < max_size
    # mask = x.str.lower().str.contains("goal")

    return "\n".join(x[mask])
    # return x[mask].tolist()

In [4]:
train_data, test_data = train_test_split()
def get_samples(indices, frac = 1, df = None):
    all_df = []

    if df is None:
        for id in indices:
            temp_df = train_data[id]

            
            
            all_df.append(temp_df.dropna().sample(frac=frac))

            
        return pd.concat(all_df).groupby(["MatchID", "PeriodID"]).agg({
            "Tweet":    get_first_texts,
            "EventType": np.mean,
            "ID": len
        })
    
    else: 
        return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({
            "Tweet":    get_first_texts,
            "EventType": np.mean,
            "ID": len
        })

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:05<00:00,  2.73it/s]


In [5]:
# sample_text = "Threre was a goal, half time, kick-off, full time, penalty, red card, yellow card, or own goal"
# # Define batch size
# batch_size = 1024

# # Initialize lists to store scores
# precisions = []
# recalls = []
# f1_scores = []

# data = pd.concat(train_data.values())

# # Create a progress bar
# for i in tqdm(range(0, len(data), batch_size), desc="Scoring Batches"):
#     # Slice the batch
#     batch = data.iloc[i:i + batch_size]['Tweet'].tolist()
    
#     # Compute BERTScore for the batch
#     P, R, F1 = scorer.score(batch, [sample_text] * len(batch), )
    
#     # Append scores
#     precisions.extend(P.tolist())
#     recalls.extend(R.tolist())
#     f1_scores.extend(F1.tolist())

In [6]:
# all_df = pd.concat(train_data.values())
# all_df['bertscore'] = f1_scores

In [7]:
# train = all_df.groupby(["MatchID", "PeriodID"], as_index=False).apply(lambda x: x.sort_values("bertscore").iloc[-50:])

In [8]:
class BertWithExtraFeature(torch.nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', hidden_size=768, extra_feature_size=1):
        super(BertWithExtraFeature, self).__init__()
        # Load the pre-trained BERT model
        self.bert = AutoModel.from_pretrained(bert_model_name, cache_dir = '/Data')
        self.hidden_size = hidden_size
        self.extra_feature_size = extra_feature_size
        
        self.dropout = torch.nn.Dropout(p=0.5)
        # Fully connected layer to combine BERT output and extra feature
        self.fc = torch.nn.Linear(self.hidden_size + self.extra_feature_size, 2)
    
    def forward(self, input_ids, attention_mask, extra_feature):
        """
        Args:
            input_ids: Tensor of shape (batch_size, seq_len) with token IDs.
            attention_mask: Tensor of shape (batch_size, seq_len) for masking attention.
            extra_feature: Tensor of shape (batch_size, 1) with the additional feature.

        Returns:
            Logits for binary classification.
        """
        # Get BERT output (pooled output)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_output.pooler_output  # Shape: (batch_size, hidden_size)

        pooled_output = self.dropout(pooled_output)
        
        # Concatenate the pooled output with the extra feature
        combined_input = torch.cat((pooled_output, extra_feature), dim=1)  # Shape: (batch_size, hidden_size + extra_feature_size)
        
        # Pass through the fully connected layer
        logits = self.fc(combined_input)  # Shape: (batch_size, 1)
        
        return logits

In [51]:
from torch.cuda.amp import autocast
def evaluate_model(
    val_df: pd.DataFrame, 
    val_dataloader, 
    model, device : str = 'cuda', 
    use_labels = True, 
    sample_weight = None,
    extra_feature : bool = False,
    return_proba  =False
):
    model.eval()
    all_preds = []
    all_labels = []
    predict_proba = []
    with torch.no_grad():
        with torch.autocast(device_type = 'cuda'):
            for i,batch in tqdm(enumerate(val_dataloader), total = len(val_dataloader)):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                count = batch['count'].to(device).unsqueeze(dim = -1)

                labels = None
                if  use_labels:
                    labels = batch["label"].to(device)
                # count = batch['count'].to(device).unsqueeze(dim = -1)

                if extra_feature:
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_feature = count)
                    preds = torch.argmax(outputs, dim=1)
                else:
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    preds = torch.argmax(outputs.logits, dim=1)
                    probas = torch.softmax(outputs.logits, dim = 1)[:,1]
                    
                predict_proba.extend(probas.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())
                if use_labels:
                    all_labels.extend(labels.cpu().numpy())

                # if i % 100 == 0: 
                #     acc = accuracy_score(all_labels, all_preds)
                #     f1 = f1_score(all_labels, all_preds)

                #     clear_output()
                #     print(f"Validation Accuracy : {acc}\n")
                #     print(f"Validation F1 : {f1}\n")
                #     conf_matrix = confusion_matrix(all_labels, all_preds)
                #     print(conf_matrix)

    if use_labels:
        acc = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds)
        conf_matrix = confusion_matrix(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)

        clear_output()
        print(f"Validation Accuracy : {acc}\n")
        print(f"Validation auc : {auc}\n")
        print(conf_matrix)

    if return_proba:
        return all_preds, all_labels, predict_proba
    return all_preds, all_labels

In [10]:
def compute_class_weights(labels):
    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
    return torch.tensor(class_weights, dtype=torch.float)

In [11]:
def remove_hashtag_links(df):

    df['Tweet'] = df['Tweet'].str.replace(r"#\w+", "", regex=True)

    # Remove links
    df['Tweet'] = df['Tweet'].str.replace(r"http\S+|www\S+", "", regex=True)

    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & map symbols
        u"\U0001F700-\U0001F77F"  # Alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric shapes extended
        u"\U0001F800-\U0001F8FF"  # Supplemental arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        "]+",
        flags=re.UNICODE
    )
    df['Tweet'] = df['Tweet'].str.replace(emoji_pattern, "", regex=True)
    df['Tweet'] = df['Tweet'].str.strip()

    # df['Tweet'] = "Is there any event like goal, halftime, fulltime, start of match or cards in any of the following tweets?\n\n" + df['Tweet']

    return df


In [12]:
df = pd.concat(train_data)
df = remove_hashtag_links(df)
# df['lan'] = df['Tweet'].progress_apply(lambda x : langid.classify(x)[0])

In [13]:
en_df = df#.query("lan == 'en' ")

In [10]:
possible_indices = set(train_data.keys())

In [None]:
test_indices = list(np.random.choice(list(possible_indices), size=3, replace = False,))
test_indices = [13,1,18]
all_train_indices = list(possible_indices.difference(set(test_indices)))
val_indices = [1,5,12,19]
# val_indices = list(np.random.choice(all_train_indices, 3, replace=False))
# train_indices = list(set(all_train_indices).difference(set(val_indices)))
train_indices = [0,2,7,11,13,18]


train_df = get_samples(train_indices,)
# train_df = get_samples(train_indices, df = train)
test_df = get_samples(test_indices)
val_df = get_samples(val_indices)

  return pd.concat(all_df).groupby(["MatchID", "PeriodID"]).agg({


In [266]:
train_df = remove_hashtag_links(train_df)
test_df = remove_hashtag_links(test_df)
val_df = remove_hashtag_links(val_df)

In [259]:
test_indices

[2, 14, 1]

In [267]:
# K Fold CV


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir = '/Data')

device = 'cuda'
final_results = []


train_dataset = TextDataset(
    train_df["Tweet"].tolist(), 
    train_df['ID'].tolist(),
    train_df["EventType"].tolist(), 
    tokenizer
)

val_dataset = TextDataset(
    val_df["Tweet"].tolist(), 
    val_df['ID'].tolist(),
    val_df["EventType"].tolist(), 
    tokenizer
)

test_dataset = TextDataset(
    test_df["Tweet"].tolist(), 
    test_df['ID'].tolist(),
    test_df["EventType"].tolist(), 
    tokenizer
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle = True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", cache_dir = '/Data', num_labels = 2, dropout = 0.4)
# model.resize_position_embeddings(2048)

# model = BertWithExtraFeature(bert_model_name="bert-base-uncased")

# for p in model.distlbert.parameters():
#     p.requires_grad = False
model.to(device)

# model = get_peft_model(base_model, lora_config)
# for param in model.bert.parameters():
#     param.requires_grad = False
    
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4,)

# optimizer = torch.optim.AdamW([
#     {'params': model.bert.parameters(), 'weight_decay': 1e-3},  # Regularize BERT weights
#     {'params': model.fc.parameters(), 'weight_decay': 1e-2}     # Stronger regularization on the classifier
# ], lr=1e-5)

for name, param in model.distilbert.named_parameters():
    if "layer.5" in name or "layer.4" in name:  # Unfreeze last two layers
        param.requires_grad = True

    else:
        param.requires_grad = False

best_model = None
second_best_model = None
best_acc = -1
second_best_acc = -1

n_epochs = 10

labels = train_df["EventType"].tolist()
class_weight = torch.Tensor([0.4, 0.6]).to(device)
# class_weights = compute_class_weights(labels).to(device)

# Define weighted loss function
loss_fn = torch.nn.CrossEntropyLoss(class_weight)

for epoch in range(n_epochs):
    all_preds = []
    all_labels = []
    epoch_loss = 0

    print(train_indices, val_indices)
    for i, batch in enumerate(tqdm(train_dataloader)):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        count = batch['count'].to(device).unsqueeze(dim = -1)

        with torch.autocast( device_type = 'cuda'):
            # outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_feature = count)
            # loss = loss_fn(outputs, labels.squeeze() )

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels = labels)
            loss = loss_fn(outputs.logits, labels)
            # loss = loss_fn(outputs, labels)
        loss.backward()

        
        optimizer.step()
        optimizer.zero_grad()

        preds = torch.argmax(outputs.logits, dim=1)
        # preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        epoch_loss += loss.item()

    acc_train = accuracy_score(all_labels, all_preds, )
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)


    
    print(f"---------- Epoch {epoch} ------------")
    print(f"Training Loss : {epoch_loss}\n")
    print(f"Training Accuracy : {acc_train}\n")
    print(f"Training Precision : {precision}\n")
    print(conf_matrix)


    preds, labels = evaluate_model(val_df, val_dataloader, model, extra_feature=False)

    acc = accuracy_score(labels, preds)

    # if acc_train > 0.8 and acc > 0.68:
    #     best_model = deepcopy(model)

    if acc > best_acc:
        second_best_acc = best_acc
        second_best_model = deepcopy(best_model)  # Promote previous best to second best
        best_acc = acc
        best_model = deepcopy(model)
    # elif acc > second_best_acc:
    #     second_best_acc = acc
    #     second_best_model = deepcopy(model)


# Combine results for this fold
# validation_results = pd.DataFrame({
#     "MatchID": validation_data["MatchID"].values,
#     "true_values": labels,
#     "predictions": preds,
# })

# final_results.append(validation_results)

Validation Accuracy : 0.6931818181818182

Validation auc : 0.6933850129198966

[[154  71]
 [ 64 151]]


In [235]:
# from torch.utils.data import DataLoader, Dataset

# class TweetDataset(Dataset):
#     def __init__(self, tweets, tokenizer, max_length=512):
#         self.tweets = tweets
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.tweets)

#     def __getitem__(self, idx):
#         return self.tokenizer(
#             self.tweets[idx],
#             return_tensors='pt',
#             truncation=True,
#             padding='max_length',
#             max_length=self.max_length
#         )

# # Define a dataset

# def get_X(texts):
#     tweet_dataset = TweetDataset(
#         tweets=texts,
#         tokenizer=tokenizer,
#         max_length=512
#     )

#     # Create a DataLoader for batching
#     batch_size = 32  # Adjust batch size based on available GPU memory
#     data_loader = DataLoader(tweet_dataset, batch_size=batch_size)

#     X_train_list = []  # To store processed embeddings

#     # Process in batches
#     with torch.no_grad():
#         for batch in tqdm(data_loader):
#             # Move tokenized input to GPU
#             b = {k: v.squeeze().to("cuda") for k, v in batch.items()}  # Squeeze to match model input shape
            
#             # Model forward pass
#             outputs = best_model.distilbert(**b)
            
#             # Collect the embeddings (modify based on your use case, e.g., logits, hidden states, etc.)
#             X_train_list.append(outputs.last_hidden_state[:,-1,:].to("cpu"))

#     # Concatenate all batches if needed
#     X = torch.cat(X_train_list, dim=0)
    
#     return X

In [269]:
X_train = get_X(train_df['Tweet'].tolist())
X_val = get_X(val_df['Tweet'].tolist())
X_test = get_X(test_df['Tweet'].tolist())

  0%|          | 0/41 [00:00<?, ?it/s]

100%|██████████| 41/41 [00:12<00:00,  3.22it/s]
100%|██████████| 14/14 [00:04<00:00,  3.03it/s]
100%|██████████| 13/13 [00:04<00:00,  2.94it/s]


In [273]:
clf = RandomForestClassifier(max_depth=8,max_features='log2', class_weight={0:0.4, 1:0.6})

clf.fit(X_train, train_df['EventType'])

y_pred_val = clf.predict(X_val)
y_pred_test = clf.predict(X_test)

accuracy_score(val_df['EventType'],y_pred_val), accuracy_score(test_df['EventType'],y_pred_test)

(0.6295454545454545, 0.7128205128205128)

In [164]:
confusion_matrix(val_df['EventType'],y_pred_val)

array([[ 85,  76],
       [ 40, 189]])

In [268]:
preds, labels = evaluate_model(test_df, test_dataloader, best_model, extra_feature=False)

Validation Accuracy : 0.6846153846153846

Validation auc : 0.6778722030981066

[[105  61]
 [ 62 162]]


## Training the model

In [14]:
# val_indices = list(np.random.choice(list(train_data.keys()), size=6, replace = False))
# train_indices = list(set(train_data.keys()).difference(set(val_indices)))

val_indices = [1,5,12,19]
# val_indices = list(np.random.choice(all_train_indices, 3, replace=False))
# train_indices = list(set(all_train_indices).difference(set(val_indices)))
train_indices = [0,2,7,11,13,18]



# train_df = get_samples(train_indices, df = train)
train_df = get_samples(train_indices, df = en_df)
val_df = get_samples(val_indices, df = en_df)

  return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({
  return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({


In [15]:
train_df = remove_hashtag_links(train_df, )
val_df = remove_hashtag_links(val_df)

In [16]:
train_df['EventType'].value_counts(normalize=True)

EventType
1.0    0.567949
0.0    0.432051
Name: proportion, dtype: float64

In [17]:
val_df['EventType'].value_counts(normalize=True)

EventType
1.0    0.558522
0.0    0.441478
Name: proportion, dtype: float64

In [18]:
model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", cache_dir = '/Data', num_labels = 2,ignore_mismatched_sizes=True)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model

LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

In [36]:
# K Fold CV


tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096", cache_dir = '/Data')
# tokenizer.pad_token = tokenizer.eos_token


device = 'cuda'
final_results = []


train_dataset = TextDataset(
    train_df["Tweet"].tolist(), 
    train_df['ID'].tolist(),
    train_df["EventType"].tolist(), 
    tokenizer,
    train_df.index.get_level_values("MatchID").tolist()

)

val_dataset = TextDataset(
    val_df["Tweet"].tolist(), 
    val_df['ID'].tolist(),
    val_df["EventType"].tolist(), 
    tokenizer,
    val_df.index.get_level_values("MatchID").tolist()
)


train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)


model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", cache_dir = '/Data', num_labels = 2,ignore_mismatched_sizes=True)
# for p in base_model.model.parameters():
#     p.requires_grad = False
model.to(device)

# model = get_peft_model(base_model, lora_config)
# for param in model.bert.parameters():
#     param.requires_grad = False

for name, param in model.longformer.named_parameters():
    if "layer.11" in name:  # Unfreeze last two layers
        param.requires_grad = True

    else:
        param.requires_grad = False
    
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

best_model = None
best_acc = -1

n_epochs = 10

labels = train_df["EventType"].tolist()
class_weights = torch.Tensor([0.4,0.6]).to(device)

# Define weighted loss function
loss_fn = torch.nn.CrossEntropyLoss(class_weights)

for epoch in range(n_epochs):
    all_preds = []
    all_labels = []
    epoch_loss = 0

    print(train_indices, val_indices)
    for i, batch in enumerate(tqdm(train_dataloader)):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with torch.autocast( device_type = 'cuda'):
            # outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_feature = count)
            # loss = loss_fn(outputs, labels.squeeze() )

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels = labels)
            loss = loss_fn(outputs.logits, labels)
        loss.backward()

        
        optimizer.step()
        optimizer.zero_grad()

        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        epoch_loss += loss.item()

    acc = accuracy_score(all_labels, all_preds, )
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)


    
    print(f"---------- Epoch {epoch} ------------")
    print(f"Training Loss : {epoch_loss}\n")
    print(f"Training Accuracy : {acc}\n")
    print(f"Training Precision : {precision}\n")
    print(conf_matrix)


    preds, labels = evaluate_model(val_df, val_dataloader, model)

    acc = roc_auc_score(labels, preds)

    if acc > best_acc:
        best_acc = acc
        best_model = deepcopy(model)


# Combine results for this fold
# validation_results = pd.DataFrame({
#     "MatchID": validation_data["MatchID"].values,
#     "true_values": labels,
#     "predictions": preds,
# })

# final_results.append(validation_results)

Validation Accuracy : 0.675564681724846

Validation auc : 0.6652103283173735

[[124  91]
 [ 67 205]]


In [38]:
total_test_df = get_eval_set().set_index(["MatchID", "PeriodID"])
test_df = preprocess_data(total_test_df)




100%|██████████| 4/4 [00:01<00:00,  3.49it/s]


In [25]:
# sample_text = "Threre was a goal, half time, kick-off, full time, penalty, red card, yellow card, or own goal"
# # Define batch size
# batch_size = 1024

# # Initialize lists to store scores
# precisions = []
# recalls = []
# # f1_scores = []

# data = test_df

# # Create a progress bar
# for i in tqdm(range(0, len(data), batch_size), desc="Scoring Batches"):
#     # Slice the batch
#     batch = data.iloc[i:i + batch_size]['Tweet'].tolist()
    
#     # Compute BERTScore for the batch
#     P, R, F1 = scorer.score(batch, [sample_text] * len(batch), )
    
#     # Append scores
#     precisions.extend(P.tolist())
#     recalls.extend(R.tolist())
    # f1_scores.extend(F1.tolist())

In [26]:
# test_df['bertscore'] = f1_scores 

In [84]:
# test_df = test_df.groupby(["MatchID", "PeriodID"], as_index=False).apply(lambda x: x.sort_values("bertscore").iloc[-50:])

In [None]:
# test_df['lan'] = test_df['Tweet'].progress_apply(langid.classify)

100%|██████████| 362397/362397 [04:03<00:00, 1490.06it/s]


In [39]:
# test_df['language'] = test_df['lan'].apply(lambda x: x[0])
test_df['language'] = "en"

test_df_en = test_df.query("language == 'en' ")

In [40]:
processed_test_df = test_df_en.groupby(["MatchID", "PeriodID"]).agg({
    "Tweet":    get_first_texts,
    "ID": len
})

processed_test_df = remove_hashtag_links(processed_test_df)

In [41]:
processed_test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Tweet,ID
MatchID,PeriodID,Unnamed: 2_level_1,Unnamed: 3_level_1
6,0,I Finally get to see Germany play\n \nFascin...,237
6,1,"""In a few minutes of x ...Can't wait""....Waa...",245
6,2,I don't see any team in this World Cup that ca...,254
6,3,Future World Cup champions are about to play.....,344
6,4,Thing is.. Ozil looks like one of my aquarium ...,456
...,...,...,...
16,125,"Someone here just described Group D as ""litera...",344
16,126,Stat comparison between Serbia & Germany s int...,315
16,127,OMFG just lost??? I'm glad I'm at work then! ...,300
16,128,ç›´å‰äºˆæƒ³0-0ã¯å¤–ã‚Œã€‚ã‚»ãƒ«ãƒ“ã‚¢å„ªä½ã...,273


In [42]:
test_dataset = TextDataset(
    processed_test_df["Tweet"].tolist(), 
    processed_test_df['ID'].tolist(), 
    None,
    tokenizer,
    [0] * 516
)

test_dataloader = DataLoader(test_dataset, batch_size=32)

In [52]:
preds, labels, probas = evaluate_model(processed_test_df, test_dataloader, best_model, use_labels=False, return_proba=True)

100%|██████████| 17/17 [00:42<00:00,  2.52s/it]


In [56]:
(np.array(probas) > 0.4).sum()

243

In [71]:
processed_test_df['EventType'] = (np.array(probas) > 0.3).astype(int)

In [72]:
processed_test_df['EventType'].value_counts(normalize = True)

EventType
1    0.563953
0    0.436047
Name: proportion, dtype: float64

In [73]:
example_submission = pd.read_csv("sub-event-detection-in-twitter-streams/challenge_data/logistic_predictions.csv").set_index("ID")

In [None]:
pd.merge(
    total_test_df,
    processed_test_df[["Tweet", "EventType"]],
    left_index=True,
    right_index=True

)[['EventType','ID']]\
    .drop_duplicates("ID")\
    .set_index("ID")\
    .reindex(index = example_submission.index)\
    .to_csv("predictions_2.csv")

: 

EventType
0    0.831609
1    0.168391
Name: proportion, dtype: float64