In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Llama 3 8B Instruct (4-bit): binary tweet classification

In [1]:
import os
# Step up one directory at a time until the working-directory path no longer
# contains the substring `notebooks`, so that the relative `data/...` paths and
# the `src` imports used below resolve (presumably from the project root).
while os.getcwd().find('notebooks') != -1:
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.utils.class_weight import compute_class_weight

from transformers import BitsAndBytesConfig


tqdm.pandas()

In [3]:
# Hub checkpoint and local download cache shared by the tokenizer and the model.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
CACHE_DIR = "/Data"

# 4-bit NF4 weights with double quantisation; computation runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer setup. The Llama 3 tokenizer ships without a padding token, so any
# call with `padding=...` would raise; reuse EOS as the pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Sequence-classification head on top of the quantised backbone. The head
# (`score.weight`) is freshly initialised and must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=2,  # binary target (0 = no event, 1 = event); matches the library default
    quantization_config=quantization_config,
    device_map="auto",
    attn_implementation="eager",
    cache_dir=CACHE_DIR,
)
# The classifier pools the last non-padding token, so it must know the pad id
# to handle padded or batched inputs.
model.config.pad_token_id = tokenizer.pad_token_id



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<generator object Module.parameters at 0x7f810b506180>

In [8]:
# Freeze the backbone (`model.model`) so that only the classification head
# outside it stays trainable. The flag has to be set on each parameter:
# assigning `requires_grad` on the module object itself only creates an
# unrelated attribute and freezes nothing.
for param in model.model.parameters():
    param.requires_grad = False

In [10]:
# Fixed seed so the shuffle (and therefore the train/validation split) is
# identical on every Restart & Run All.
SEED = 42
# Number of shuffled rows used for training; the remainder is validation.
N_TRAIN = 20_000

LABEL_FILES = [
    "data/real_labels_1.csv",
    "data/real_labels_2.csv",
    "data/real_labels_3.csv",
]

# Collapse the annotated event types into a binary target.
EVENT_TO_LABEL = {
    "no": 0,
    "yes": 1,
    "penalty": 1,
    "goal": 1,
}

labelled_df = (
    pd.concat([pd.read_csv(path, index_col=0) for path in LABEL_FILES])
    .drop_duplicates("originalTweet")
    .assign(label=lambda df: df["event"].map(EVENT_TO_LABEL))
    # Drops rows with a missing value in any column, including rows whose
    # `event` is not in EVENT_TO_LABEL (their mapped label is NaN).
    .dropna()
    # `map` yields floats whenever NaNs were present; restore integer class ids.
    .astype({"label": "int64"})
    .sample(frac=1.0, replace=False, random_state=SEED)
)

assert len(labelled_df) > N_TRAIN, "not enough labelled rows to leave a validation set"

train_df = labelled_df.iloc[:N_TRAIN]
val_df = labelled_df.iloc[N_TRAIN:]

In [20]:
device = 'cuda'

# Plain (unweighted) cross-entropy over the two classes.
loss_fn = torch.nn.CrossEntropyLoss()

# Only parameters that can receive gradients are optimised; frozen or
# quantised weights never get a gradient and would be skipped anyway.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-5)
n_epochs = 5

for epoch in range(n_epochs):
    # ---- training: one tweet per optimisation step (batch size 1) ----
    model.train()
    for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
        # A single sequence needs no padding.
        tokens = tokenizer(row['originalTweet'], return_tensors="pt")
        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)

        # Shape (1,) class index, matching the (1, num_labels) logits.
        label = torch.tensor([int(row['label'])], dtype=torch.long, device=device)

        # bfloat16 matches the quantisation compute dtype and, unlike the
        # float16 autocast default, does not need a GradScaler.
        with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute the loss in float32 for numerical stability, then backpropagate.
        loss = loss_fn(outputs.logits.float(), label)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

    # ---- validation: same tokenisation as training, no gradients ----
    model.eval()
    y_true_val = []
    y_pred_val = []

    with torch.no_grad():
        for idx, row in tqdm(val_df.iterrows(), total=len(val_df)):
            batch = tokenizer(row['originalTweet'], return_tensors="pt")
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Predicted class id. Calling `.item()` on the 2-element softmax
            # vector would raise, and accuracy needs a hard label anyway.
            pred = torch.argmax(outputs.logits, dim=-1)

            y_true_val.append(int(row['label']))
            y_pred_val.append(pred.item())

    acc_val = accuracy_score(y_true_val, y_pred_val)
    print(f"epoch {epoch + 1}/{n_epochs} - validation accuracy: {acc_val:.4f}")

  0%|          | 0/20000 [00:00<?, ?it/s]

  1%|          | 173/20000 [00:51<1:37:25,  3.39it/s]


KeyboardInterrupt: 

In [27]:
# Standalone validation pass: predict one tweet at a time and collect the hard
# class predictions next to the reference labels.
model.eval()  # make sure inference-time behaviour is active
y_true_val = []
y_pred_val = []

with torch.no_grad():
    for idx, row in tqdm(val_df.iterrows(), total=len(val_df)):
        # A single sequence needs no padding.
        batch = tokenizer(row['originalTweet'], return_tensors="pt")
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # bfloat16 matches the compute dtype chosen in the quantisation config.
        with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Arg-max over the class dimension of the (1, num_labels) logits.
        pred = torch.argmax(outputs.logits, dim=-1)

        y_true_val.append(int(row['label']))
        y_pred_val.append(pred.item())

acc_val = accuracy_score(y_true_val, y_pred_val)
print(acc_val)

  0%|          | 0/14781 [00:00<?, ?it/s]

  1%|          | 79/14781 [00:14<44:10,  5.55it/s]


KeyboardInterrupt: 

In [30]:
# Precision of the positive class on the collected validation predictions.
# NOTE(review): the result is stored in `acc_val` although it is a precision,
# not an accuracy. The recorded run emitted a scikit-learn warning and a value
# of 0.0, presumably because no positive class was predicted. Confirm.
acc_val = precision_score(y_true=y_true_val, y_pred=y_pred_val)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# Display the value assigned by the previous cell. Despite its name, `acc_val`
# holds the precision score at this point.
acc_val

0.0