In [None]:
import random
import pandas as pd

# Define some synthetic sequences and labels
sequences = [
    "The weather today is sunny and warm.",
    "I just finished reading a great book.",
    "The movie last night was very exciting.",
    "Artificial intelligence is transforming the world.",
    "I love hiking in the mountains.",
    "Cooking new recipes is a lot of fun.",
    "The concert last weekend was amazing.",
    "Python is a versatile programming language.",
    "I enjoy watching sports on weekends.",
    "Machine learning is a fascinating field."
]

# Generate synthetic labels (0 or 1) for binary classification.
# A dedicated, seeded generator makes the labels identical on every run and
# leaves the global `random` state untouched.
LABEL_SEED = 42
label_rng = random.Random(LABEL_SEED)
labels = [label_rng.randint(0, 1) for _ in sequences]

# Create a DataFrame with one row per sequence
data = pd.DataFrame({'sequence': sequences, 'label': labels})

# Display the synthetic dataset
data


In [None]:
import numpy as np 
import re 
# Relative path of the toy CSV loaded in this cell.
data_csv = './../../toy-data/exp2/data_1.csv'

def remove_newlines(text):
    """Return `text` with every newline character removed.

    Runs of consecutive newlines are deleted outright (nothing is inserted in
    their place), so the text on either side of a line break is joined directly.
    """
    pieces = re.split(r'\n+', text)
    return ''.join(pieces)

# Load the raw table and strip embedded newlines from each description.
df_raw = pd.read_csv(data_csv)
df_raw['Description'] = df_raw['Description'].apply(remove_newlines)


# Subsample Observations
# A seeded generator makes the subsample and the synthetic labels identical on
# every "Restart & Run All"; the unseeded global RNGs gave a different dataset
# on each run.
SUBSAMPLE_SIZE = 32
SUBSAMPLE_SEED = 0
rng = np.random.default_rng(SUBSAMPLE_SEED)

# Never request more rows than exist: sampling without replacement raises
# ValueError when `size` exceeds the population.
n_rows = min(SUBSAMPLE_SIZE, len(df_raw))
indices = rng.choice(df_raw.index.to_numpy(), size=n_rows, replace=False)
df = df_raw.loc[indices].reset_index(drop=True)

# Placeholder binary labels (0 or 1), one per sampled row.
df['label'] = rng.integers(0, 2, size=len(df))
df.head()

In [None]:
# Show only the text column next to its label.
df.loc[:, ['Description', 'label']]

In [None]:
# `data` is rebound to the labelled frame `df`; `var` names its text column.
data = df
var = "Description"

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader


# Checkpoint whose tokenizer is loaded. Other ids the author left commented out:
# "meta-llama/Meta-Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-8B".
model_id = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad with the end-of-sequence token. Assigning `pad_token` is sufficient:
# `pad_token_id` is derived from it, so a separate id assignment is redundant.
tokenizer.pad_token = tokenizer.eos_token

def tokenizer_function(example):
    """Tokenize the `var` column of `example` with the module-level tokenizer.

    Truncation is enabled with a maximum length of 512.
    """
    texts = example[var]
    return tokenizer(texts, truncation=True, max_length=512)

# Build a Hugging Face dataset from the text and label columns.
# `Dataset.from_pandas` is the constructor intended for DataFrames (`from_dict`
# expects a plain mapping of column name -> values); `preserve_index=False`
# keeps the pandas index from being stored as an extra column.
dataset = Dataset.from_pandas(data[[var, 'label']], preserve_index=False)

# Tokenize in batches and drop the raw text column in the same pass, leaving
# the tokenizer outputs plus 'label'.
tokenized_dataset = dataset.map(tokenizer_function, batched=True, remove_columns=[var])

# Batches of 4, in dataset order; the collator pads the examples of each batch.
data_loader = DataLoader(
    tokenized_dataset,
    batch_size=4,
    collate_fn=DataCollatorWithPadding(tokenizer),
    shuffle=False,
)


In [None]:
import torch
from transformers import Phi3ForSequenceClassification, AdamW
import importlib.util

model_id = "microsoft/phi-3-mini-4k-instruct"

# FlashAttention-2 needs a CUDA device and the optional `flash_attn` package.
# Request it only when both are present; otherwise let transformers pick its
# default attention implementation instead of failing at load time.
load_kwargs = {}
if torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None:
    load_kwargs["attn_implementation"] = "flash_attention_2"

# Load the model with a 2-class classification head, in bfloat16
model = Phi3ForSequenceClassification.from_pretrained(
    model_id,
    device_map='auto',
    num_labels=2,
    torch_dtype=torch.bfloat16,
    **load_kwargs,
)

# Keep the model's padding id in step with the tokenizer that pads the batches,
# so the classification head and the collator agree on which tokens are padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

In [None]:
# Number of full passes over `data_loader`.
NUM_EPOCHS = 20

# Mean training loss of each epoch, in order.
losses = []

# The criterion holds no per-batch state, so build it once rather than on
# every iteration.
loss_fct = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(NUM_EPOCHS):
    batch_loss = []
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Calculate the loss on float32 logits: the model is loaded in
        # bfloat16, and the upcast keeps the softmax/log from being computed
        # at that reduced precision. Gradients still flow through the cast.
        logits = outputs.logits.float()
        loss = loss_fct(logits, labels)
        batch_loss.append(loss.item())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Average over the batches seen in this epoch.
    losses.append(sum(batch_loss) / len(batch_loss))

    print(f"Epoch {epoch + 1}, Loss: {losses[-1]}")

In [None]:
import matplotlib.pyplot as plt 
# Plot the mean loss of each epoch. Epochs are numbered from 1 so the x axis
# matches the "Epoch N" lines printed during training.
fig, ax = plt.subplots()
ax.plot(range(1, len(losses) + 1), losses, marker="o")
ax.set(title="Training loss per epoch", xlabel="Epoch", ylabel="Mean cross-entropy loss")
plt.show()