In [1]:
%load_ext autoreload
%autoreload 2

## Training Transformer

In [2]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, \
  BitsAndBytesConfig, GPTQConfig
import json

tqdm.pandas()

In [3]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# tokenizer = AutoTokenizer.from_pretrained(
#     "mistralai/Mistral-7B-Instruct-v0.1",
#     cache_dir = "/Data"    
# )

# tunned_model = AutoModelForCausalLM.from_pretrained(
#     "mistralai/Mistral-7B-Instruct-v0.1",
#     quantization_config = quantization_config,
#     device_map="auto",
#     attn_implementation="eager",
#     cache_dir = "/Data" 
# )


tokenizer = AutoTokenizer.from_pretrained(
    "unsloth/Llama-3.2-1B-Instruct",
    cache_dir = "/Data"    
)

model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Llama-3.2-1B-Instruct",
    quantization_config = quantization_config,
    device_map="auto",
    attn_implementation="eager",
    cache_dir = "/Data" 
)

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [4]:
def get_first_texts(x, min_size = 15):
    size = x.apply(lambda x: len(x.split(" ")))\
        .sort_values()
    
    x = x.reindex_like(size)
    mask = size > min_size
    # mask = x.str.lower().str.contains("goal")

    # return "\n".join(x[mask])
    return x[mask].tolist()

In [5]:
train_indices = [0,2,4]
val_indices = [8,11,13]
train_data, test_data = train_test_split()

def get_samples(indices, frac = 1):
    all_df = []
    for id in indices:
        temp_df = train_data[id]
        # for i in range(1,5+1):  
        #     temp_df[f"Tweet"] = temp_df['Tweet'].shift(i) + temp_df['Tweet']
        
        all_df.append(temp_df.dropna().sample(frac=frac))

        
    return pd.concat(all_df)

train_df = get_samples(train_indices)
val_df = get_samples(val_indices)

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:04<00:00,  3.56it/s]


In [6]:
base_prompt = '''You are a helpful assistant.
    I will provide you a tweet that happened during a world cup game and you will return me if this tweet represents a football event such as a goal, half time, kick-off, full time, penalty, red card, yellow card, or
own goal—occurred within that period.

    Here is the text:
    
    {text}

    Answer in the following format (JSON):

    {{
        'event': yes or no (depending on the event),
        'reason' : a short reason (no more than 50 words)
    }}.

    Please do not output anything else than the JSON
'''

In [None]:
generated_samples = []

save_every = 1000

for i, (idx, row) in tqdm(enumerate(train_df.iterrows()), total=len(train_df)):
    text = row['Tweet']

    prompt = base_prompt.format(text = text)
    message = [ {"role": "user", "content": prompt}]

    template = tokenizer.apply_chat_template(
        message,
        tokenize= False
    )

    tokens = tokenizer(
        template,
        return_tensors = 'pt'
    )

    generated_ids = model.generate(
        tokens['input_ids'].to('cuda'),
        # attention_mask = tokens['attention_mask'].to("cuda"),
        max_new_tokens = 100,
        do_sample = False,
        # temperature = 1.
    )


    decoded = tokenizer.batch_decode(generated_ids[:,tokens['input_ids'].shape[1]:])

    generated_t = decoded[0].split("<|end_header_id|>")[1]\
        .split("<|eot_id|>")[0]
    
    try:
        json_obj = json.loads(generated_t)

        json_obj['MatchID'] = row['MatchID']
        json_obj['PeriodID'] = row['PeriodID']
        json_obj['originalTweet'] = text
        json_obj['EventType'] = row['EventType']
        generated_samples.append(json_obj)

    except:
        print("Error in JSON generation")

    if 'yes' in decoded[0]:
        clear_output()
        print(f'''
            generated text : {decoded[0]}
            original text: {text}
            text {i} out of {len(train_df)}
            '''
        )

    if i % save_every == 0:
        # pd.DataFrame(generated_samples).to_csv("data/real_labels.csv")

  0%|          | 0/307901 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  0%|          | 1/307901 [00:01<116:55:26,  1.37s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/307901 [00:02<94:07:24,  1.10s/it] The attention mask and

Error in JSON generation


  0%|          | 6/307901 [00:05<67:56:00,  1.26it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 7/307901 [00:06<88:11:50,  1.03s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 8/307901 [00:07<75:00:28,  1.14it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Error in JSON generation


  0%|          | 9/307901 [00:08<71:03:40,  1.20it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 10/307901 [00:08<66:57:21,  1.28it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 11/307901 [00:09<71:39:09,  1.19it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 12/307901 [00:10<69:29:21,  1.23it/s]The attention mask and the pa

KeyboardInterrupt: 

In [36]:
pd.DataFrame(generated_samples)

Unnamed: 0,event,reason,MatchID,PeriodID,originalTweet
0,no,The tweet does not represent a specific footba...,0,41,Honduras is so gay like can u score pls
1,yes,"Goal scored by Shaqiri, Switzerland leading 3-0",0,100,"Shaqiri again, 3-0 for #SUI. Now #ECU need two..."
2,no,The tweet does not represent a specific footba...,0,9,"I love my country , my weird Spanish accent ....."
3,no,The tweet does not describe a specific footbal...,0,81,Damn.. Nice ass play from Honduras tho
4,yes,"The tweet announces the end of the match, whic...",0,122,final match\n#Ecuador 0 #France 0 \n#Honduras...
5,no,The tweet is a general encouragement to Hondur...,0,6,Come on Honduras get this win! #HON
6,no,This tweet does not represent a specific footb...,0,101,wanted Honduras to win 😒
7,no,The tweet does not describe a specific footbal...,0,47,Honduras are a fairly limited footballing side...
8,yes,The tweet mentions Honduras scoring against Ec...,0,126,It's crazy to think that Honduras scored again...
9,yes,The tweet suggests that Switzerland has scored...,0,100,"Ahh Switzerland 3 , Honduras still 0 , Loool s..."


In [33]:
json_obj

{'event': 'no',
 'reason': "The tweet does not represent a specific football event, it's a fan expressing their national pride and cultural identity."}

In [29]:
decoded

['<|start_header_id|>assistant<|end_header_id|>\n\n{\n    "event": "no",\n    "reason": "The tweet does not describe a specific football event, but rather a general statement about a team\'s performance in the World Cup."\n}<|eot_id|>']

In [21]:
generated_ids.shape

torch.Size([1, 168])

153