In [3]:
import pandas as pd
import re
import random
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments
import torch







In [8]:
def clean_text(text):
    """Normalise a dialogue string for the training corpus.

    The text is lower-cased, then every run of characters that is not an
    ASCII letter, a digit or a space is replaced by a single space.
    Existing spaces are kept, so consecutive spaces can appear in the result.

    Args:
        text: Raw string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    return re.sub(r"[^a-zA-Z0-9 ]+", " ", lowered)


# Join every dialogue line with its movie's metadata, then write a 10% random
# sample to 'reduced_movie_dialogues.txt' (one formatted line per sample).
cleaned_data = {}

with open('movie_titles_metadata.txt', 'r', encoding='utf-8', errors='replace') as titles, \
     open('movie_lines.txt', 'r', encoding='utf-8', errors='replace') as lines:

    # movie id -> {'title', 'year', 'genres'}
    movie_titles = {}
    for title in titles:
        parts = title.strip().split(' +++$+++ ')
        # The genre list is read from index 5, so a usable row needs at least
        # six fields. The previous `>= 5` guard let a five-field row through
        # and raised IndexError on parts[5].
        if len(parts) >= 6:
            movie_id = parts[0]
            movie_titles[movie_id] = {
                'title': parts[1],
                'year': parts[2],
                # "['a', 'b']" -> ['a', 'b']
                'genres': parts[5].strip('[]').replace("'", "").split(', ')
            }

    for line in lines:
        parts = line.strip().split(' +++$+++ ')
        # Fields read below: 0 = line id, 2 = movie id, 3 = character, 4 = text.
        if len(parts) >= 5:
            line_id = parts[0]
            character_name = parts[3]
            movie_id = parts[2]
            dialogue = clean_text(parts[4])

            # Lines whose movie has no usable metadata row are skipped.
            if movie_id in movie_titles:
                movie_data = movie_titles[movie_id]
                cleaned_data[line_id] = {
                    'movie': movie_data['title'],
                    'year': movie_data['year'],
                    'genres': movie_data['genres'],
                    'character': character_name,
                    'line': dialogue
                }

print(f"Processed {len(cleaned_data)} lines.")

# Keep 10% of the records.
sample_size = int(len(cleaned_data) * 0.1)

# Fixed seed so the same sample is drawn on every run.
random.seed(42)
reduced_data = random.sample(list(cleaned_data.values()), sample_size)

with open('reduced_movie_dialogues.txt', 'w', encoding='utf-8') as f:
    for entry in reduced_data:
        movie = entry['movie']
        year = entry['year']
        genres = ', '.join(entry['genres'])
        character = entry['character']
        line = entry['line']

        f.write(f"{movie} ({year} - [{genres}]): {character}: {line}\n")

print(f"Sampled {sample_size} lines and saved to 'reduced_movie_dialogues.txt'.")


Processed 304446 lines.
Sampled 30444 lines and saved to 'reduced_movie_dialogues.txt'.


In [9]:
# Load the pretrained 'gpt2' tokenizer and language-model head from the same checkpoint name.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Put the model in evaluation mode; as the cell's last expression this also
# displays the module tree.
model.eval()




GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
# Tokenizing function
def tokenize_input(user_input):
    """Encode `user_input` with the module-level `tokenizer`.

    Args:
        user_input: Text to encode.

    Returns:
        Whatever `tokenizer.encode(..., return_tensors='pt')` returns
        (PyTorch tensors are requested via `'pt'`).
    """
    return tokenizer.encode(user_input, return_tensors='pt')


In [11]:
# Load the pretrained 'gpt2' tokenizer that is handed to the dataset and collator builders below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def load_dataset(file_path, tokenizer, block_size=128):
    """Create a `TextDataset` over a plain-text file.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Tokenizer forwarded to `TextDataset`.
        block_size: Forwarded to `TextDataset` as `block_size` (default 128).

    Returns:
        The constructed `TextDataset`.
    """
    dataset_kwargs = {
        'tokenizer': tokenizer,
        'file_path': file_path,
        'block_size': block_size,
    }
    return TextDataset(**dataset_kwargs)

def create_data_collator(tokenizer):
    """Build the batch collator used for fine-tuning.

    Args:
        tokenizer: Tokenizer forwarded to the collator.

    Returns:
        A `DataCollatorForLanguageModeling` created with `mlm=False`
        (masked-language-modelling turned off).
    """
    collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)
    return collator
            
# Build the training dataset from the sampled dialogue file (default block_size of 128)
# and the collator that will batch it for the Trainer.
train_dataset = load_dataset("reduced_movie_dialogues.txt", tokenizer)
data_collator = create_data_collator(tokenizer)




In [12]:
# Fine-tune a fresh copy of the pretrained 'gpt2' weights on the dialogue dataset.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Training parameters
training_args = TrainingArguments(
    # Where outputs go
    output_dir='./gpt2-finetuned-movie-dialogues',
    overwrite_output_dir=True,
    logging_dir='./logs',
    # Schedule: a single pass over the data, 32 examples per device per step
    num_train_epochs=1,
    per_device_train_batch_size=32,
    # Checkpoint / logging intervals, in optimisation steps. The recorded run
    # finished in 104 steps, far below either interval.
    save_steps=999_000,
    save_total_limit=2,
    logging_steps=190000,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
trainer.train()


  0%|          | 0/104 [00:00<?, ?it/s]

{'train_runtime': 1967.7473, 'train_samples_per_second': 1.678, 'train_steps_per_second': 0.053, 'train_loss': 4.04705810546875, 'epoch': 1.0}


TrainOutput(global_step=104, training_loss=4.04705810546875, metrics={'train_runtime': 1967.7473, 'train_samples_per_second': 1.678, 'train_steps_per_second': 0.053, 'total_flos': 215696572416000.0, 'train_loss': 4.04705810546875, 'epoch': 1.0})

In [18]:


def _generate_reply(prompt, max_prompt_tokens=100, max_new_tokens=50):
    """Sample a continuation of `prompt` from the module-level `model`.

    Args:
        prompt: Conversation text to continue.
        max_prompt_tokens: Maximum number of prompt tokens fed to the model;
            when the prompt is longer, the most recent tokens are kept.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        Only the newly generated text, with surrounding whitespace stripped.
    """
    # No padding: a single prompt needs none, and right-padding a decoder-only
    # model makes it continue after a run of pad tokens.
    encoded = tokenizer(prompt, return_tensors='pt')

    # Truncate from the left so the newest user turn is never cut off.
    input_ids = encoded['input_ids'][:, -max_prompt_tokens:]
    attention_mask = encoded['attention_mask'][:, -max_prompt_tokens:]

    # The Trainer may have moved the model to an accelerator; keep inputs on
    # the same device.
    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            top_p=0.85,
            temperature=0.3,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            do_sample=True,
        )

    # `output[0]` is prompt + continuation; drop the prompt so the reply does
    # not echo the conversation back to the user.
    new_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def chat_bot():
    """Run an interactive console chat with the module-level `model`.

    Reads user turns with `input()` until the user types "exit" (or input
    reaches end-of-file), generates a reply for each turn and prints it.
    A rolling transcript of at most 300 characters is used as the prompt.
    Errors raised while generating a reply are printed and the loop continues.
    """
    conversation_history = ""

    # The model is left in training mode after fine-tuning; switch to eval
    # mode so dropout is disabled while generating.
    model.eval()

    print('Chatbot: Hi! I am a movie bot for AAI 520, what do you want to talk about? ("exit" to end conversation)')

    while True:
        try:
            user_input = input('You: ')
        except EOFError:
            # No more input is available; without this the loop would spin
            # forever printing the same error.
            print('Chatbot: Cheers!')
            break

        if user_input.strip().lower() == 'exit':
            print('Chatbot: Cheers!')
            break

        try:
            conversation_history += f"User: {user_input}\n"
            chatbot_response = _generate_reply(conversation_history)

            # Append response to conversation history and print it
            conversation_history += f"Chatbot: {chatbot_response}\n"
            print(f"Chatbot: {chatbot_response}")

            # Limit conversation history to the last 300 characters to avoid overload
            conversation_history = conversation_history[-300:]

        except Exception as e:
            print(f'An error occurred: {e}')

# Start the interactive chat loop (reads from input(); type "exit" to stop).
chat_bot()



Chatbot: Hi! I am a movie bot for AAI 520, what do you want to talk about? ("exit" to end conversation)
Chatbot: Cheers!


https://realpython.com/build-a-chatbot-python-chatterbot/
https://huggingface.co/docs/transformers/en/main_classes/data_collator
NLP with transformers