In [1]:
import pandas as pd
import re
import random
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments







### Text Preprocessing

In [2]:
lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text)
    return text

cleaned_data = {key: clean_text(value) for key, value in id2line.items()}


In [3]:
output_file = "cleaned_movie_dialogues.txt"

with open(output_file, 'w', encoding='utf-8') as f:
    for value in cleaned_data.values():  # Loop through the values of the dictionary
        f.write(f"{value}\n")  # Write each cleaned dialogue on a new line


In [4]:
dialogue_list = list(cleaned_data.values())

# Sample Size
sample_size = int(len(dialogue_list) * 0.01)

# Randomly sample the lines
reduced_data = random.sample(dialogue_list, sample_size)

# Save to new dataset
with open("reduced_movie_dialogues.txt", 'w', encoding='utf-8') as f:
    for dialogue in reduced_data:
        f.write(f"{dialogue}\n")


### Model Selection

In [5]:
model_name = 'gpt2' 
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model.eval()




GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# Tokenizing function
def tokenize_input(user_input):

    input_ids = tokenizer.encode(user_input, return_tensors='pt')
    return input_ids


In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size 
    )

def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

train_dataset = load_dataset("reduced_movie_dialogues.txt", tokenizer)
data_collator = create_data_collator(tokenizer)




In [8]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Training Paraneters
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned-movie-dialogues',
    overwrite_output_dir=True,                   
    num_train_epochs=3,                           
    per_device_train_batch_size=4,               
    save_steps=10_000,                           
    save_total_limit=2,                        
    logging_dir='./logs',                       
    logging_steps=200,                            
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()


  0%|          | 0/2478 [00:00<?, ?it/s]

{'loss': 4.0813, 'grad_norm': 8.913236618041992, 'learning_rate': 4.596448748991122e-05, 'epoch': 0.24}
{'loss': 3.9622, 'grad_norm': 4.991582870483398, 'learning_rate': 4.192897497982244e-05, 'epoch': 0.48}
{'loss': 3.9014, 'grad_norm': 4.049081802368164, 'learning_rate': 3.789346246973366e-05, 'epoch': 0.73}
{'loss': 3.8867, 'grad_norm': 3.6358044147491455, 'learning_rate': 3.3857949959644874e-05, 'epoch': 0.97}
{'loss': 3.702, 'grad_norm': 3.787088632583618, 'learning_rate': 2.9822437449556095e-05, 'epoch': 1.21}
{'loss': 3.6928, 'grad_norm': 3.6753780841827393, 'learning_rate': 2.5786924939467316e-05, 'epoch': 1.45}
{'loss': 3.6728, 'grad_norm': 4.0121588706970215, 'learning_rate': 2.175141242937853e-05, 'epoch': 1.69}
{'loss': 3.655, 'grad_norm': 3.865846633911133, 'learning_rate': 1.7715899919289748e-05, 'epoch': 1.94}
{'loss': 3.5793, 'grad_norm': 3.809112787246704, 'learning_rate': 1.3680387409200971e-05, 'epoch': 2.18}
{'loss': 3.5517, 'grad_norm': 4.223584175109863, 'learning

TrainOutput(global_step=2478, training_loss=3.7263157238779385, metrics={'train_runtime': 6995.9533, 'train_samples_per_second': 1.416, 'train_steps_per_second': 0.354, 'total_flos': 647089717248000.0, 'train_loss': 3.7263157238779385, 'epoch': 3.0})

In [9]:
def chat_bot():
    conversation_history = ""
    
    print('Chatbot: Hi! I am a movie bot for AAI 520, what do you want to talk about?("exit" to end conversation)')
    
    while True:
        try:
            user_input = input('You: ')
            
            if user_input.lower() == 'exit':
                print('Chatbot: Cheers!')
                break
            
            conversation_history += f'User: {user_input}\n'
            input_ids = tokenizer.encode(conversation_history, return_tensors='pt', max_length=1024, truncation=True)
            
            output = model.generate(
                input_ids,
                max_length=200, 
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                top_p=0.95,
                temperature=0.7
            )
            
            # Decode the model's response and add it to the conversation history
            chatbot_response = tokenizer.decode(output[0], skip_special_tokens=True)
            conversation_history += f'Chatbot: {chatbot_response}\n'
            
            # Limit conversation history to last 1000 characters
            conversation_history = conversation_history[-1000:]
            print(f'Chatbot: {chatbot_response}')
        
        except Exception as e:
            print(f'An error occurred: {e}')

chat_bot()



Chatbot: Hi! I am a movie bot for AAI 520, what do you want to talk about?("exit" to end conversation)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Chatbot: User: darth vader
i m not sure what you re talking about 
you re not going to be able to get out of here what are you going ianne ernstein urnstein s a great place to live ertrand ichard ennie enzo is that all 


what 


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: User: darth vader
Chatbot: User: darth vader
i m not sure what you re talking about 
you re not going to be able to get out of here what are you going ianne ernstein urnstein s a great place to live ertrand ichard ennie enzo is that all 


what 
User: what did darth vader said to luke?
it s not like i m a virgin irl what are we going on 
Chatbot: Cheers!


https://realpython.com/build-a-chatbot-python-chatterbot/
https://huggingface.co/docs/transformers/en/main_classes/data_collator
NLP with transformers