In [None]:
!pip install transformers datasets peft accelerate bitsandbytes deepseek torch



In [None]:
import pandas as pd

# Load CSV file
file_path = "CUMTOWN.csv" #change based on data location
df = pd.read_csv(file_path, sep='~')

# Preview data
df.head()
# Check for missing data
print(df.isnull().sum())

# Inspect unique values in the 'episode_id' column
print(df['episode_id'].unique())


episode_id    0
text          0
dtype: int64
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' 'P1' '18' 'P2' '19' 'P3' '20' 'P4' '21' 'P5' '22' 'P6' '23' 'P7'
 '24' 'P8' '25' 'P9' '26' 'P10' '27' 'P11' '28' 'P12' '29' 'P14' 'P13'
 '30' 'P15' '31' 'P16' '32' 'P17' '33' 'P18' '34' 'P19' '35' 'P20' '36'
 'P21' '37' 'P22' '38' 'P23' '39' 'P24' '40' 'P25' '41']


In [None]:
# Preprocess data
df['text'] = df['text'].str.strip().str.lower()  # Normalize text
formatted_data = "\n".join([f"Speaker: Episode {row['episode_id']}\nDialogue: {row['text']}\n---" for _, row in df.iterrows()])

# Save to text file
with open("formatted_dialogue.txt", "w") as f:
    f.write(formatted_data)

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Configure quantization
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Model name
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load tokenizer and model with quantization
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype="auto",
    device_map="auto"
)

# Test the model
prompt = "Create a joke script about [INSERT CURRENT EVENT HERE]"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(inputs.input_ids, max_length=100, temperature=0.7)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Create a joke script about [INSERT CURRENT EVENT HERE]

Here is a joke script about a current event:

Title: "The Great Toilet Paper Shortage of 2023"

Characters:

* President Joe Biden
* Mike Pence
* Kim Jong-un
* Elon Musk

(The scene opens with President Biden sitting at his desk, looking wor


In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Define the model name and tokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assign a pad token if not defined
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the dataset
dataset = load_dataset('text', data_files={'train': 'formatted_dialogue.txt'})

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=512)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Verify the tokenized dataset
print(tokenized_dataset)
print(tokenized_dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 198
    })
})
{'text': 'Speaker: Episode 1', 'input_ids': [1, 5013, 5790, 29901, 382, 12907, 29871, 29896, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 3200

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import torch

# Clear GPU memory
torch.cuda.empty_cache()

# Model and tokenizer setup
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token without resizing embeddings
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load model with quantization or gradient checkpointing
from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Load dataset
dataset = load_dataset('text', data_files={'train': 'formatted_dialogue.txt'})

def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch")

# Training arguments
training_args = TrainingArguments(
    output_dir="./llama_comedy",
    evaluation_strategy="steps",
    per_device_train_batch_size=1,  # Reduce batch size
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
    learning_rate=5e-5,
    fp16=True  # Mixed precision
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Start training
trainer.train()

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
# Define the prompt
prompt = "Write a joke about Patriotic Brexiteers wanting to deport the King of England for being German:"

# Tokenize the input
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Ensure input is on GPU if using a GPU

# Generate text
output = model.generate(
    **inputs,
    max_length=10000,       # Set max length for the response
    temperature=0.7,      # Control creativity
    top_p=0.9,            # Nucleus sampling for more natural outputs
    repetition_penalty=1.2  # Discourage repetitive phrases
)

# Decode and print the output
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)
