In [6]:
import pandas as pd
from google.colab import drive

# Step 1: Mount Google Drive and Load the Data
drive.mount('/content/drive')

# Updated paths to your files
movie_lines_file = '/content/drive/My Drive/Collab_finalprj/movie_lines.txt'
movie_conversations_file = '/content/drive/My Drive/Collab_finalprj/movie_conversations.txt'

# Load the movie lines data with encoding handling
movie_lines = pd.read_csv(movie_lines_file, sep=r" \+\+\+\$\+\+\+ ", header=None, engine="python",
                          names=["lineID", "characterID", "movieID", "character", "text"], encoding="ISO-8859-1")

# Load the movie conversations data with encoding handling
movie_conversations = pd.read_csv(movie_conversations_file, sep=r" \+\+\+\$\+\+\+ ", header=None, engine="python",
                                  names=["character1ID", "character2ID", "movieID", "utteranceIDs"], encoding="ISO-8859-1")  # Corrected this line

# Step 2: Clean Text and Tokenize
def clean_text(text):
    return text.lower().strip() if isinstance(text, str) else ""

# Apply cleaning to movie lines
movie_lines['cleaned_text'] = movie_lines['text'].apply(clean_text)

# Tokenized movie lines (using simple space-based tokenization)
movie_lines['tokens'] = movie_lines['cleaned_text'].apply(lambda x: x.split())

# Display sample results
print("Sample movie lines:")
print(movie_lines[['lineID', 'characterID', 'movieID', 'character', 'text']].head())
print("\nSample movie conversations:")
print(movie_conversations[['character1ID', 'character2ID', 'movieID', 'utteranceIDs']].head())
print("\nCleaned movie lines:")
print(movie_lines[['text', 'cleaned_text']].head())
print("\nTokenized movie lines:")
print(movie_lines[['cleaned_text', 'tokens']].head())

# Step 3: Prepare the Sequences for the Seq2Seq Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def prepare_data_for_seq2seq(movie_lines, movie_conversations):
    # Create a dictionary to map lineIDs to text
    line_to_text = dict(zip(movie_lines['lineID'], movie_lines['cleaned_text']))

    input_texts = []
    target_texts = []

    # Iterate through the conversations
    for _, row in movie_conversations.iterrows():
        utterance_ids = eval(row['utteranceIDs'])  # Convert string representation of list back to list
        for i in range(len(utterance_ids) - 1):
            input_line = line_to_text.get(utterance_ids[i], "")
            target_line = line_to_text.get(utterance_ids[i + 1], "")
            if input_line and target_line:
                input_texts.append(input_line)
                target_texts.append(target_line)

    # Tokenize the input and target texts
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(input_texts + target_texts)

    input_sequences = tokenizer.texts_to_sequences(input_texts)
    target_sequences = tokenizer.texts_to_sequences(target_texts)

    # Pad sequences to the same length
    max_sequence_length = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
    target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

    return input_sequences, target_sequences, tokenizer

# Prepare the data for Seq2Seq
input_sequences, target_sequences, tokenizer = prepare_data_for_seq2seq(movie_lines, movie_conversations)

# Get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

print("Vocabulary size:", vocab_size)
print("Sample input sequence:", input_sequences[0])
print("Sample target sequence:", target_sequences[0])

# Step 4: Define the Seq2Seq model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

def define_seq2seq_model(vocab_size, max_sequence_length):
    # Encoder
    encoder_inputs = Input(shape=(max_sequence_length,))
    encoder_embedding = Embedding(vocab_size, 256)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(max_sequence_length,))
    decoder_embedding = Embedding(vocab_size, 256)(decoder_inputs)
    decoder_lstm = LSTM(256, return_sequences=True, return_state=False)(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax')(decoder_lstm)

    # Seq2Seq model
    model = Model([encoder_inputs, decoder_inputs], decoder_dense)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Step 5: Train the Seq2Seq Model
# Instantiate the model
max_sequence_length = input_sequences.shape[1]
seq2seq_model = define_seq2seq_model(vocab_size, max_sequence_length)

# Print the model summary
seq2seq_model.summary()

# Reshape the target_sequences to match the output shape for training
target_sequences = target_sequences.reshape(target_sequences.shape[0], target_sequences.shape[1], 1)

# Train the model (adjust epochs and batch_size for memory handling)
history = seq2seq_model.fit([input_sequences, target_sequences], target_sequences, epochs=10, batch_size=32)  # Reduced batch size

# Save the model after training
seq2seq_model.save('seq2seq_movie_chatbot.h5')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Sample movie lines:
  lineID characterID movieID character          text
0  L1045          u0      m0    BIANCA  They do not!
1  L1044          u2      m0   CAMERON   They do to!
2   L985          u0      m0    BIANCA    I hope so.
3   L984          u2      m0   CAMERON     She okay?
4   L925          u0      m0    BIANCA     Let's go.

Sample movie conversations:
  character1ID character2ID movieID                      utteranceIDs
0           u0           u2      m0  ['L194', 'L195', 'L196', 'L197']
1           u0           u2      m0                  ['L198', 'L199']
2           u0           u2      m0  ['L200', 'L201', 'L202', 'L203']
3           u0           u2      m0          ['L204', 'L205', 'L206']
4           u0           u2      m0                  ['L207', 'L208']

Cleaned movie lines:
           text  cleaned_text
0  They do not!  they do not!
1 

Epoch 1/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m873s[0m 126ms/step - accuracy: 0.9805 - loss: 0.2845
Epoch 2/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m871s[0m 126ms/step - accuracy: 0.9981 - loss: 0.0175
Epoch 3/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m872s[0m 126ms/step - accuracy: 0.9992 - loss: 0.0070
Epoch 4/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m872s[0m 126ms/step - accuracy: 0.9996 - loss: 0.0032
Epoch 5/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m873s[0m 126ms/step - accuracy: 0.9998 - loss: 0.0019
Epoch 6/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m873s[0m 126ms/step - accuracy: 0.9999 - loss: 0.0012
Epoch 7/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m873s[0m 126ms/step - accuracy: 1.0000 - loss: 3.3280e-04
Epoch 8/10
[1m6916/6916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m873s[0m 126ms/step - accuracy: 1.0000 - l

