In [1]:
%load_ext autoreload
%autoreload 2

## Training Transformer

In [2]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split, get_sample_weights, get_eval_set
from src.preprocessing import preprocess_data
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, roc_auc_score
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.utils.class_weight import compute_class_weight
from collections import defaultdict
import transformers
from peft import get_peft_model, LoraConfig, TaskType
import re
from bert_score import BERTScorer

from transformers import BitsAndBytesConfig

tqdm.pandas()

In [3]:
scorer = BERTScorer(model_type='bert-base-uncased',num_layers = 12, device='cuda')



In [4]:
def get_first_texts(x, max_size = 10):
    size = x.apply(lambda x: len(x.split(" ")))\
        .sort_values()
    
    x = x.reindex_like(size)
    mask = size < max_size
    # mask = x.str.lower().str.contains("goal")

    return "\n".join(x[mask])
    # return x[mask].tolist()

In [5]:
train_data, test_data = train_test_split()
def get_samples(indices, frac = 1, df = None):
    all_df = []

    if df is None:
        for id in indices:
            temp_df = train_data[id]

            
            
            all_df.append(temp_df.dropna().sample(frac=frac))

            
        return pd.concat(all_df).groupby(["MatchID", "PeriodID"]).agg({
            "Tweet":    get_first_texts,
            "EventType": np.mean,
            "ID": len
        })
    
    else: 
        return (df).groupby(["MatchID", "PeriodID"]).agg({
            "Tweet":    get_first_texts,
            "EventType": np.mean,
            "ID": len
        })

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:08<00:00,  1.91it/s]


In [6]:
# sample_text = "Threre was a goal, half time, kick-off, full time, penalty, red card, yellow card, or own goal"
# # Define batch size
# batch_size = 1024

# # Initialize lists to store scores
# precisions = []
# recalls = []
# f1_scores = []

# data = pd.concat(train_data.values())

# # Create a progress bar
# for i in tqdm(range(0, len(data), batch_size), desc="Scoring Batches"):
#     # Slice the batch
#     batch = data.iloc[i:i + batch_size]['Tweet'].tolist()
    
#     # Compute BERTScore for the batch
#     P, R, F1 = scorer.score(batch, [sample_text] * len(batch), )
    
#     # Append scores
#     precisions.extend(P.tolist())
#     recalls.extend(R.tolist())
#     f1_scores.extend(F1.tolist())

In [7]:
# all_df = pd.concat(train_data.values())
# all_df['bertscore'] = f1_scores

In [8]:
# train = all_df.groupby(["MatchID", "PeriodID"], as_index=False).apply(lambda x: x.sort_values("bertscore").iloc[-50:])

In [6]:
class BertWithExtraFeature(torch.nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', hidden_size=768, extra_feature_size=1):
        super(BertWithExtraFeature, self).__init__()
        # Load the pre-trained BERT model
        self.bert = AutoModel.from_pretrained(bert_model_name, cache_dir = '/Data')
        self.hidden_size = hidden_size
        self.extra_feature_size = extra_feature_size
        
        self.dropout = torch.nn.Dropout(p=0.5)
        # Fully connected layer to combine BERT output and extra feature
        self.fc = torch.nn.Linear(self.hidden_size + self.extra_feature_size, 2)
    
    def forward(self, input_ids, attention_mask, extra_feature):
        """
        Args:
            input_ids: Tensor of shape (batch_size, seq_len) with token IDs.
            attention_mask: Tensor of shape (batch_size, seq_len) for masking attention.
            extra_feature: Tensor of shape (batch_size, 1) with the additional feature.

        Returns:
            Logits for binary classification.
        """
        # Get BERT output (pooled output)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_output.pooler_output  # Shape: (batch_size, hidden_size)

        pooled_output = self.dropout(pooled_output)
        
        # Concatenate the pooled output with the extra feature
        combined_input = torch.cat((pooled_output, extra_feature), dim=1)  # Shape: (batch_size, hidden_size + extra_feature_size)
        
        # Pass through the fully connected layer
        logits = self.fc(combined_input)  # Shape: (batch_size, 1)
        
        return logits

In [14]:
from torch.cuda.amp import autocast
def evaluate_model(
    val_df: pd.DataFrame, 
    val_dataloader, 
    model, device : str = 'cuda', 
    use_labels = True, 
    sample_weight = None,
    extra_feature : bool = False,
):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        with torch.autocast(device_type = 'cuda'):
            for i,batch in tqdm(enumerate(val_dataloader), total = len(val_dataloader)):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                count = batch['count'].to(device).unsqueeze(dim = -1)

                labels = None
                if  use_labels:
                    labels = batch["label"].to(device)
                # count = batch['count'].to(device).unsqueeze(dim = -1)

                if extra_feature:
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_feature = count)
                    preds = torch.argmax(outputs, dim=1)
                else:
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    preds = torch.argmax(outputs.logits, dim=1)
                    

                all_preds.extend(preds.cpu().numpy())
                if use_labels:
                    all_labels.extend(labels.cpu().numpy())

                # if i % 100 == 0: 
                #     acc = accuracy_score(all_labels, all_preds)
                #     f1 = f1_score(all_labels, all_preds)

                #     clear_output()
                #     print(f"Validation Accuracy : {acc}\n")
                #     print(f"Validation F1 : {f1}\n")
                #     conf_matrix = confusion_matrix(all_labels, all_preds)
                #     print(conf_matrix)

    if use_labels:
        acc = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds)
        conf_matrix = confusion_matrix(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)

        clear_output()
        print(f"Validation Accuracy : {acc}\n")
        print(f"Validation auc : {auc}\n")
        print(conf_matrix)

    return all_preds, all_labels

In [8]:
def compute_class_weights(labels):
    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
    return torch.tensor(class_weights, dtype=torch.float)

In [9]:
def remove_hashtag_links(df):

    df['Tweet'] = df['Tweet'].str.replace(r"#\w+", "", regex=True)

    # Remove links
    df['Tweet'] = df['Tweet'].str.replace(r"http\S+|www\S+", "", regex=True)

    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & map symbols
        u"\U0001F700-\U0001F77F"  # Alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric shapes extended
        u"\U0001F800-\U0001F8FF"  # Supplemental arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        "]+",
        flags=re.UNICODE
    )
    df['Tweet'] = df['Tweet'].str.replace(emoji_pattern, "", regex=True)
    df['Tweet'] = df['Tweet'].str.strip()

    df['Tweet'] = "Is there any event like goal, halftime, fulltime, start of match or cards in any of the following tweets?\n\n" + df['Tweet']

    return df


In [10]:
possible_indices = set(train_data.keys())

In [278]:
df = pd.concat(train_data)
df.sort_values(by = ['MatchID', "PeriodID", "Timestamp"], ascending=[True, True, False]).head(20)

Unnamed: 0,Unnamed: 1,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,239,0_0,0,0,0,1403725860000,Okay honduras. This is your chance
0,212,0_0,0,0,0,1403725855000,Coming up next Ecuador vs France and Honduras ...
0,213,0_0,0,0,0,1403725855000,Sucks that every time Honduras plays I'm stuck...
0,184,0_0,0,0,0,1403725848000,omg LETS GO HONDURAS 🙌👏👏
0,181,0_0,0,0,0,1403725847000,Omar a real Honduras fan foo still got hope😂😂 ...
0,178,0_0,0,0,0,1403725846000,"Switzerland bout to catch this L , lol ⚽️👏 #HO..."
0,179,0_0,0,0,0,1403725846000,My Predicition : Switzerland 2-1 Honduras #F...
0,177,0_0,0,0,0,1403725845000,Ecuador Is The Only Southamerican Team Missing...
0,175,0_0,0,0,0,1403725845000,I want Honduras kicked out just because of wha...
0,145,0_0,0,0,0,1403725835000,So here we are again! Its the last two encount...


In [265]:
test_indices = list(np.random.choice(list(possible_indices), size=3, replace = False,))
# test_indices = [13,1,18]
all_train_indices = list(possible_indices.difference(set(test_indices)))
# val_indices = [5,4,12]
val_indices = list(np.random.choice(all_train_indices, 3, replace=False))
train_indices = list(set(all_train_indices).difference(set(val_indices)))


train_df = get_samples(train_indices,)
# train_df = get_samples(train_indices, df = train)
test_df = get_samples(test_indices)
val_df = get_samples(val_indices)

  return pd.concat(all_df).groupby(["MatchID", "PeriodID"]).agg({


In [266]:
train_df = remove_hashtag_links(train_df)
test_df = remove_hashtag_links(test_df)
val_df = remove_hashtag_links(val_df)

In [259]:
test_indices

[2, 14, 1]

In [267]:
# K Fold CV


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir = '/Data')

device = 'cuda'
final_results = []


train_dataset = TextDataset(
    train_df["Tweet"].tolist(), 
    train_df['ID'].tolist(),
    train_df["EventType"].tolist(), 
    tokenizer
)

val_dataset = TextDataset(
    val_df["Tweet"].tolist(), 
    val_df['ID'].tolist(),
    val_df["EventType"].tolist(), 
    tokenizer
)

test_dataset = TextDataset(
    test_df["Tweet"].tolist(), 
    test_df['ID'].tolist(),
    test_df["EventType"].tolist(), 
    tokenizer
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle = True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", cache_dir = '/Data', num_labels = 2, dropout = 0.4)
# model.resize_position_embeddings(2048)

# model = BertWithExtraFeature(bert_model_name="bert-base-uncased")

# for p in model.distlbert.parameters():
#     p.requires_grad = False
model.to(device)

# model = get_peft_model(base_model, lora_config)
# for param in model.bert.parameters():
#     param.requires_grad = False
    
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4,)

# optimizer = torch.optim.AdamW([
#     {'params': model.bert.parameters(), 'weight_decay': 1e-3},  # Regularize BERT weights
#     {'params': model.fc.parameters(), 'weight_decay': 1e-2}     # Stronger regularization on the classifier
# ], lr=1e-5)

for name, param in model.distilbert.named_parameters():
    if "layer.5" in name or "layer.4" in name:  # Unfreeze last two layers
        param.requires_grad = True

    else:
        param.requires_grad = False

best_model = None
second_best_model = None
best_acc = -1
second_best_acc = -1

n_epochs = 10

labels = train_df["EventType"].tolist()
class_weight = torch.Tensor([0.4, 0.6]).to(device)
# class_weights = compute_class_weights(labels).to(device)

# Define weighted loss function
loss_fn = torch.nn.CrossEntropyLoss(class_weight)

for epoch in range(n_epochs):
    all_preds = []
    all_labels = []
    epoch_loss = 0

    print(train_indices, val_indices)
    for i, batch in enumerate(tqdm(train_dataloader)):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        count = batch['count'].to(device).unsqueeze(dim = -1)

        with torch.autocast( device_type = 'cuda'):
            # outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_feature = count)
            # loss = loss_fn(outputs, labels.squeeze() )

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels = labels)
            loss = loss_fn(outputs.logits, labels)
            # loss = loss_fn(outputs, labels)
        loss.backward()

        
        optimizer.step()
        optimizer.zero_grad()

        preds = torch.argmax(outputs.logits, dim=1)
        # preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        epoch_loss += loss.item()

    acc_train = accuracy_score(all_labels, all_preds, )
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)


    
    print(f"---------- Epoch {epoch} ------------")
    print(f"Training Loss : {epoch_loss}\n")
    print(f"Training Accuracy : {acc_train}\n")
    print(f"Training Precision : {precision}\n")
    print(conf_matrix)


    preds, labels = evaluate_model(val_df, val_dataloader, model, extra_feature=False)

    acc = accuracy_score(labels, preds)

    # if acc_train > 0.8 and acc > 0.68:
    #     best_model = deepcopy(model)

    if acc > best_acc:
        second_best_acc = best_acc
        second_best_model = deepcopy(best_model)  # Promote previous best to second best
        best_acc = acc
        best_model = deepcopy(model)
    # elif acc > second_best_acc:
    #     second_best_acc = acc
    #     second_best_model = deepcopy(model)


# Combine results for this fold
# validation_results = pd.DataFrame({
#     "MatchID": validation_data["MatchID"].values,
#     "true_values": labels,
#     "predictions": preds,
# })

# final_results.append(validation_results)

Validation Accuracy : 0.6931818181818182

Validation auc : 0.6933850129198966

[[154  71]
 [ 64 151]]


In [235]:
# from torch.utils.data import DataLoader, Dataset

# class TweetDataset(Dataset):
#     def __init__(self, tweets, tokenizer, max_length=512):
#         self.tweets = tweets
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.tweets)

#     def __getitem__(self, idx):
#         return self.tokenizer(
#             self.tweets[idx],
#             return_tensors='pt',
#             truncation=True,
#             padding='max_length',
#             max_length=self.max_length
#         )

# # Define a dataset

# def get_X(texts):
#     tweet_dataset = TweetDataset(
#         tweets=texts,
#         tokenizer=tokenizer,
#         max_length=512
#     )

#     # Create a DataLoader for batching
#     batch_size = 32  # Adjust batch size based on available GPU memory
#     data_loader = DataLoader(tweet_dataset, batch_size=batch_size)

#     X_train_list = []  # To store processed embeddings

#     # Process in batches
#     with torch.no_grad():
#         for batch in tqdm(data_loader):
#             # Move tokenized input to GPU
#             b = {k: v.squeeze().to("cuda") for k, v in batch.items()}  # Squeeze to match model input shape
            
#             # Model forward pass
#             outputs = best_model.distilbert(**b)
            
#             # Collect the embeddings (modify based on your use case, e.g., logits, hidden states, etc.)
#             X_train_list.append(outputs.last_hidden_state[:,-1,:].to("cpu"))

#     # Concatenate all batches if needed
#     X = torch.cat(X_train_list, dim=0)
    
#     return X

In [269]:
X_train = get_X(train_df['Tweet'].tolist())
X_val = get_X(val_df['Tweet'].tolist())
X_test = get_X(test_df['Tweet'].tolist())

  0%|          | 0/41 [00:00<?, ?it/s]

100%|██████████| 41/41 [00:12<00:00,  3.22it/s]
100%|██████████| 14/14 [00:04<00:00,  3.03it/s]
100%|██████████| 13/13 [00:04<00:00,  2.94it/s]


In [273]:
clf = RandomForestClassifier(max_depth=8,max_features='log2', class_weight={0:0.4, 1:0.6})

clf.fit(X_train, train_df['EventType'])

y_pred_val = clf.predict(X_val)
y_pred_test = clf.predict(X_test)

accuracy_score(val_df['EventType'],y_pred_val), accuracy_score(test_df['EventType'],y_pred_test)

(0.6295454545454545, 0.7128205128205128)

In [164]:
confusion_matrix(val_df['EventType'],y_pred_val)

array([[ 85,  76],
       [ 40, 189]])

In [268]:
preds, labels = evaluate_model(test_df, test_dataloader, best_model, extra_feature=False)

Validation Accuracy : 0.6846153846153846

Validation auc : 0.6778722030981066

[[105  61]
 [ 62 162]]


## Training the model

In [307]:
val_indices = list(np.random.choice(list(train_data.keys()), size=6, replace = False))
train_indices = list(set(train_data.keys()).difference(set(val_indices)))


# train_df = get_samples(train_indices, df = train)
train_df = get_samples(train_indices, )
val_df = get_samples(val_indices, )

  return pd.concat(all_df).groupby(["MatchID", "PeriodID"]).agg({


In [308]:
train_df = remove_hashtag_links(train_df)
val_df = remove_hashtag_links(val_df)

In [309]:
train_df['EventType'].value_counts(normalize=True)

EventType
1.0    0.537778
0.0    0.462222
Name: proportion, dtype: float64

In [310]:
val_df['EventType'].value_counts(normalize=True)

EventType
1.0    0.542567
0.0    0.457433
Name: proportion, dtype: float64

In [311]:
# K Fold CV


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir = '/Data')
# tokenizer.pad_token = tokenizer.eos_token


device = 'cuda'
final_results = []


train_dataset = TextDataset(
    train_df["Tweet"].tolist(), 
    train_df['ID'].tolist(),
    train_df["EventType"].tolist(), 
    tokenizer
)

val_dataset = TextDataset(
    val_df["Tweet"].tolist(), 
    val_df['ID'].tolist(),
    val_df["EventType"].tolist(), 
    tokenizer
)


train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)
test_dataloader = DataLoader(test_dataset, batch_size=32)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", cache_dir = '/Data', num_labels = 2, dropout = 0.3)


# for p in base_model.model.parameters():
#     p.requires_grad = False
model.to(device)

# model = get_peft_model(base_model, lora_config)
# for param in model.bert.parameters():
#     param.requires_grad = False

for name, param in model.distilbert.named_parameters():
    if "layer.5" in name or "layer.4" in name:  # Unfreeze last two layers
        param.requires_grad = True

    else:
        param.requires_grad = False
    
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

best_model = None
best_acc = -1

n_epochs = 10

labels = train_df["EventType"].tolist()
class_weights = torch.Tensor([0.4,0.6]).to(device)

# Define weighted loss function
loss_fn = torch.nn.CrossEntropyLoss(class_weights)

for epoch in range(n_epochs):
    all_preds = []
    all_labels = []
    epoch_loss = 0

    print(train_indices, val_indices)
    for i, batch in enumerate(tqdm(train_dataloader)):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with torch.autocast( device_type = 'cuda'):
            # outputs = model(input_ids=input_ids, attention_mask=attention_mask, extra_feature = count)
            # loss = loss_fn(outputs, labels.squeeze() )

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels = labels)
            loss = loss_fn(outputs.logits, labels)
        loss.backward()

        
        optimizer.step()
        optimizer.zero_grad()

        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        epoch_loss += loss.item()

    acc = accuracy_score(all_labels, all_preds, )
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)


    
    print(f"---------- Epoch {epoch} ------------")
    print(f"Training Loss : {epoch_loss}\n")
    print(f"Training Accuracy : {acc}\n")
    print(f"Training Precision : {precision}\n")
    print(conf_matrix)


    preds, labels = evaluate_model(val_df, val_dataloader, model)

    acc = roc_auc_score(labels, preds)

    if acc > best_acc:
        best_acc = acc
        best_model = deepcopy(model)


# Combine results for this fold
# validation_results = pd.DataFrame({
#     "MatchID": validation_data["MatchID"].values,
#     "true_values": labels,
#     "predictions": preds,
# })

# final_results.append(validation_results)

Validation Accuracy : 0.6747141041931385

Validation auc : 0.6631863127764768

[[190 170]
 [ 86 341]]
[1, 2, 5, 7, 8, 10, 11, 13, 17, 19] [18, 12, 3, 4, 14, 0]


 45%|████▌     | 10/22 [00:06<00:07,  1.61it/s]


KeyboardInterrupt: 

In [312]:
total_test_df = get_eval_set().set_index(["MatchID", "PeriodID"])
test_df = preprocess_data(total_test_df)




100%|██████████| 4/4 [00:01<00:00,  3.55it/s]


In [291]:
# sample_text = "Threre was a goal, half time, kick-off, full time, penalty, red card, yellow card, or own goal"
# # Define batch size
# batch_size = 1024

# # Initialize lists to store scores
# precisions = []
# recalls = []
# # f1_scores = []

# data = test_df

# # Create a progress bar
# for i in tqdm(range(0, len(data), batch_size), desc="Scoring Batches"):
#     # Slice the batch
#     batch = data.iloc[i:i + batch_size]['Tweet'].tolist()
    
#     # Compute BERTScore for the batch
#     P, R, F1 = scorer.score(batch, [sample_text] * len(batch), )
    
#     # Append scores
#     precisions.extend(P.tolist())
#     recalls.extend(R.tolist())
    # f1_scores.extend(F1.tolist())

In [292]:
# test_df['bertscore'] = f1_scores 

In [293]:
# test_df = test_df.groupby(["MatchID", "PeriodID"], as_index=False).apply(lambda x: x.sort_values("bertscore").iloc[-50:])

In [313]:
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Timestamp,Tweet
MatchID,PeriodID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,0,6_0,1403376600000,I Finally get to see Germany play\n#GER 🇩🇪⚽🏆
6,0,6_0,1403376600000,Fascinated for this #GERvsGHA match. This will...
6,0,6_0,1403376600000,: #GER and #GHA in a few.
6,0,6_0,1403376600000,Our players jooo #GER #WorldCup2014
6,0,6_0,1403376600000,Germany #GER Vs Ghana #GHA now! Come on Germa...
...,...,...,...,...
15,125,15_125,1404064800000,How come Ireland never makes it into the #Worl...
15,125,15_125,1404064800000,#MEX what a team ! #Ochoa World Cup goal keeper !
15,125,15_125,1404064800000,Harsh on Mexico though! #MEX
15,125,15_125,1404064800000,Dutch deserve to be in last 8.Keep their nerve...


In [314]:
test_df = test_df.groupby(["MatchID", "PeriodID"]).agg({
    "Tweet":    get_first_texts,
    "ID": len
})

test_df = remove_hashtag_links(test_df)

In [315]:
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Tweet,ID
MatchID,PeriodID,Unnamed: 2_level_1,Unnamed: 3_level_1
6,0,"Is there any event like goal, halftime, fullti...",237
6,1,"Is there any event like goal, halftime, fullti...",245
6,2,"Is there any event like goal, halftime, fullti...",254
6,3,"Is there any event like goal, halftime, fullti...",344
6,4,"Is there any event like goal, halftime, fullti...",456
...,...,...,...
16,125,"Is there any event like goal, halftime, fullti...",344
16,126,"Is there any event like goal, halftime, fullti...",315
16,127,"Is there any event like goal, halftime, fullti...",300
16,128,"Is there any event like goal, halftime, fullti...",273


In [316]:
test_dataset = TextDataset(
    test_df["Tweet"].tolist(), 
    test_df['ID'].tolist(), 
    None,
    tokenizer
)

test_dataloader = DataLoader(test_dataset, batch_size=32)

In [317]:
preds, labels = evaluate_model(test_df, test_dataloader, best_model, use_labels=False)

  0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:06<00:00,  2.82it/s]


In [319]:
test_df['EventType'] = preds

In [300]:
pd.concat(train_data.values()).drop_duplicates("ID").EventType.value_counts(normalize=True)

EventType
1    0.539541
0    0.460459
Name: proportion, dtype: float64

In [320]:
pd.merge(
    total_test_df,
    test_df[["Tweet", "EventType"]],
    left_index=True,
    right_index=True

)\
    .drop_duplicates("ID")\
    ['EventType'].value_counts(normalize=True)

EventType
0    0.565891
1    0.434109
Name: proportion, dtype: float64

In [217]:
pd.merge(
    total_test_df,
    test_df[["Tweet", "EventType"]],
    left_index=True,
    right_index=True

)[['EventType','ID']]\
    .drop_duplicates("ID")\
    .set_index("ID")\
    .to_csv("predictions_2.csv")

EventType
0    0.831609
1    0.168391
Name: proportion, dtype: float64