In [8]:
!pip install transformers datasets peft accelerate bitsandbytes deepseek torch



In [2]:
import pandas as pd

# Load CSV file (fields are separated by '~' rather than commas)
file_path = "CUMTOWN.csv" #change based on data location
df = pd.read_csv(file_path, sep='~')

# Preview data. A bare `df.head()` is only rendered when it is the last
# expression in a cell; here more statements follow, so print it explicitly
# or the preview is silently discarded.
print(df.head())

# Check for missing data (null count per column)
print(df.isnull().sum())

# Inspect unique values in the 'episode_id' column
print(df['episode_id'].unique())


episode_id    0
text          0
dtype: int64
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' 'P1' '18' 'P2' '19' 'P3' '20' 'P4' '21' 'P5' '22' 'P6' '23' 'P7'
 '24' 'P8' '25' 'P9' '26' 'P10' '27' 'P11' '28' 'P12' '29' 'P14' 'P13'
 '30' 'P15' '31' 'P16' '32' 'P17' '33' 'P18' '34' 'P19' '35' 'P20' '36'
 'P21' '37' 'P22' '38' 'P23' '39' 'P24' '40' 'P25' '41']


In [3]:
# Preprocess data
# Normalize text: trim surrounding whitespace and lowercase every row.
df['text'] = df['text'].str.strip().str.lower()

# Render each row as a "Speaker / Dialogue" record terminated by a '---' line.
# Zipping the two columns yields the same values as iterrows() without
# building a temporary Series for every row.
formatted_data = "\n".join(
    f"Speaker: Episode {episode_id}\nDialogue: {text}\n---"
    for episode_id, text in zip(df['episode_id'], df['text'])
)

# Save to text file. The encoding is pinned to UTF-8 so the write does not
# depend on the platform's default encoding (which can fail on non-ASCII text).
with open("formatted_dialogue.txt", "w", encoding="utf-8") as f:
    f.write(formatted_data)

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Configure quantization: load the model weights in 8-bit
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Model name. This is a gated Hugging Face repo: the account behind the stored
# token must have been granted access on the model page, otherwise loading
# fails with a 403 "gated repo" error regardless of the code below.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load tokenizer and model with quantization.
# `token=True` (the replacement for the deprecated `use_auth_token=True`) uses
# the locally stored Hugging Face login token; it is passed to both calls so
# the tokenizer and the weights are fetched with the same credentials.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map="auto",
    token=True,
)

# Test the model
prompt = "Create a joke script about [INSERT CURRENT EVENT HERE]"
# Send the inputs to wherever device_map="auto" placed the model instead of
# assuming a device named "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# **inputs forwards the attention mask along with the input ids.
# do_sample=True is required for `temperature` to have any effect; without it
# generation is greedy and the temperature setting is ignored.
outputs = model.generate(**inputs, max_length=100, do_sample=True, temperature=0.7)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)




OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
403 Client Error. (Request ID: Root=1-676e849a-001ff6da475ec0fb0a44db42;2a30f961-f9da-4e14-abd2-b64e2bad74f6)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-2-7b-chat-hf to ask for access.

In [None]:
from datasets import load_dataset

# Load dataset. The 'text' loader reads the file line by line, so every line
# of formatted_dialogue.txt becomes one example in the 'train' split.
dataset = load_dataset('text', data_files={'train': 'formatted_dialogue.txt'})

# padding="max_length" below needs a pad token. Some tokenizers (Llama-2 among
# them) ship without one, which makes the tokenizer raise when asked to pad;
# fall back to the end-of-sequence token in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize
def tokenize_function(examples):
    """Tokenize a batch of examples for training.

    Args:
        examples: Batch mapping passed by `dataset.map(..., batched=True)`;
            only its 'text' entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 512 tokens.
    """
    return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import TrainingArguments, Trainer

from transformers import DataCollatorForLanguageModeling

# Training configuration. Periodic evaluation is intentionally left at its
# default (disabled): no eval_dataset is passed to the Trainer below, and
# requesting evaluation "steps" without one makes the Trainer fail.
training_args = TrainingArguments(
    output_dir="./llama_comedy",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_dir='./logs',
    learning_rate=5e-5,
)

# The tokenized examples carry input_ids/attention_mask but no labels, and a
# causal LM only returns a loss when labels are supplied. With mlm=False this
# collator copies input_ids into labels for each batch and masks pad positions
# out of the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): `model` is loaded with load_in_8bit=True earlier in the
# notebook. Trainer refuses to fine-tune a purely quantized model unless
# trainable adapters (e.g. LoRA via peft) are attached first - confirm that is
# done before running this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Prompt in the same "Speaker / Dialogue / ---" layout as the training text,
# left open after "Dialogue:" so the model writes the guest's reply.
prompt = "Speaker: Host\nDialogue: Why did the chicken cross the road?\n---\nSpeaker: Guest\nDialogue:"
# The tokenizer returns CPU tensors; move them to the model's device so
# generate() does not receive inputs on a different device than the weights.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# do_sample=True is required for `temperature` to take effect; without it the
# output is greedy and the temperature setting is ignored.
output = model.generate(**inputs, max_length=100, do_sample=True, temperature=0.7)

print(tokenizer.decode(output[0], skip_special_tokens=True))