In [None]:
# Required Imports
import pandas as pd
import re
import random
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# Data Preprocessing

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9 ]+", " ", text)
    return text


cleaned_data = {}

with open('movie_titles_metadata.txt', 'r', encoding='utf-8', errors='replace') as titles, \
     open('movie_lines.txt', 'r', encoding='utf-8', errors='replace') as lines:

    movie_titles = {}
    for title in titles:
        parts = title.strip().split(' +++$+++ ')
        if len(parts) >= 5:
            movie_id = parts[0]
            movie_titles[movie_id] = {
                'title': parts[1],
                'year': parts[2],
                'genres': parts[5].strip('[]').replace("'", "").split(', ')
            }

    for line in lines:
        parts = line.strip().split(' +++$+++ ')
        if len(parts) >= 5:
            line_id = parts[0]
            character_name = parts[3]
            movie_id = parts[2]
            dialogue = clean_text(parts[4])

            if movie_id in movie_titles:
                movie_data = movie_titles[movie_id]
                cleaned_data[line_id] = {
                    'movie': movie_data['title'],
                    'year': movie_data['year'],
                    'genres': movie_data['genres'],
                    'character': character_name,
                    'line': dialogue
                }

print(f"Processed {len(cleaned_data)} lines.")

sample_size = int(len(cleaned_data) * 0.5)

random.seed(42)
reduced_data = random.sample(list(cleaned_data.values()), sample_size)

with open('reduced_movie_dialogues.txt', 'w', encoding='utf-8') as f:
    for entry in reduced_data:
        movie = entry['movie']
        year = entry['year']
        genres = ', '.join(entry['genres'])
        character = entry['character']
        line = entry['line']

        f.write(f"{movie} ({year} - [{genres}]): {character}: {line}\n")

print(f"Sampled {sample_size} lines and saved to 'reduced_movie_dialogues.txt'.")

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu" # Using GPU for available resources

# Pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Tokenizer Configuration
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))  # Adjust token embeddings for padding

# Function to clean and tokenize input
def tokenize_input(user_input):
    input_ids = tokenizer.encode(user_input, return_tensors='pt', truncation=True, max_length=512)
    return input_ids

# Load and preprocess dataset for fine-tuning
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

# Create Data Collator for Language Modeling
def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )


# Splitting dataset into train and evaluation sets
def load_and_split_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    train_size = int(0.8 * len(dataset))
    eval_size = len(dataset) - train_size

    train_dataset, eval_dataset = torch.utils.data.random_split(
        dataset, [train_size, eval_size]
    )
    return train_dataset, eval_dataset

train_dataset, eval_dataset = load_and_split_dataset("reduced_movie_dialogues.txt", tokenizer)

# Training Parameters
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned-movie-dialogues',
    overwrite_output_dir=True,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    logging_dir='./logs',
    logging_steps=500,
)

# Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=create_data_collator(tokenizer),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]


    )

def chat_bot():
    # System prompt used once to define behavior
    system_prompt = (
        "You are a knowledgeable movie expert specializing in famous films, characters, quotes, and plots. "
        "Your goal is to answer movie-related questions with concise, single-sentence answers. "
        "Example: User: Who is Luke? Chatbot: Luke is a Jedi Knight and son of Anakin Skywalker."
    )

    print('Chatbot: Hi! I am a movie bot for AAI 520. What do you want to talk about? ("exit" to end conversation)')

    # Conversation history to track context
    conversation_history = []

    def format_conversation():
        """Format the recent conversation for input to the model."""
        return "\n".join(conversation_history[-6:])

    while True:
        try:
            user_input = input('You: ')

            if user_input.lower() == 'exit':
                print('Chatbot: Cheers!')
                break

            # Add user input to conversation history
            conversation_history.append(f"User: {user_input}")

            # Prepare input for the model
            input_text = f"{system_prompt}\n{format_conversation()}"
            input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=1024, truncation=True).to(device)

            # Generate a response with adjusted parameters
            output = model.generate(
                input_ids,
                max_new_tokens=40,  # Limit response length
                num_return_sequences=1,
                no_repeat_ngram_size=3,  # Avoiding repetition
                top_k=40,  # Token Diversity
                top_p=0.60,  # Balance Variation
                temperature=0.4,  # Randomness
                pad_token_id=tokenizer.eos_token_id,
                attention_mask=input_ids.ne(tokenizer.pad_token_id),
                do_sample=True
            )

            # Decode and validate the response
            chatbot_response = tokenizer.decode(output[0], skip_special_tokens=True).strip()
            conversation_history.append(f"Chatbot: {chatbot_response}")
            print(f'Chatbot: {chatbot_response}')

        except Exception as e:
            print(f'An error occurred: {e}')



#trainer.train()

# Start the chatbot
chat_bot()




Chatbot: Hi! I am a movie bot for AAI 520. What do you want to talk about? ("exit" to end conversation)
You: what did Darth Vader say To Luke Skywalker?
Chatbot: You are a knowledgeable movie expert specializing in famous films, characters, quotes, and plots. Your goal is to answer movie-related questions with concise, single-sentence answers. Example: User: Who is Luke? Chatbot: Luke is a Jedi Knight and son of Anakin Skywalker.
User: what did Darth Vader say To Luke Skywalker? ChatBot: Darth Vader is a Sith Lord and son-in-law of Darth Vader.
user: what was the first time you saw a movie? ChatBots: The first time I
You: What did Yoda tell Luke Skywalker?
Chatbot: You are a knowledgeable movie expert specializing in famous films, characters, quotes, and plots. Your goal is to answer movie-related questions with concise, single-sentence answers. Example: User: Who is Luke? Chatbot: Luke is a Jedi Knight and son of Anakin Skywalker.
User: what did Darth Vader say To Luke Skywalker?
Chat