<a href="https://colab.research.google.com/github/maryamhgf/LLM-Mastery/blob/master/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Load Packages**

In [29]:
!pip install kaggle

import pandas as pd
import re
import ast
import numpy as np
import pickle
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.chat.util import Chat, reflections
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD



In [6]:
import tensorflow as tf

# List every physical device TensorFlow can see, whatever its type.
devices = tf.config.list_physical_devices()
print("Available devices:", devices)

# Query the GPUs separately to report whether an accelerator is available.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU devices found. TensorFlow is using the CPU.")
else:
    print("TensorFlow is using the following GPU devices:")
    # One device per line, exactly as a print() per element would give.
    print(*gpus, sep="\n")

Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow is using the following GPU devices:
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


# **Download Dataset**

In [3]:
!mkdir -p ~/.kaggle
!kaggle datasets download -d rajathmc/cornell-moviedialog-corpus
!unzip /content/cornell-moviedialog-corpus.zip -d cornell-moviedialog-corpus

Dataset URL: https://www.kaggle.com/datasets/rajathmc/cornell-moviedialog-corpus
License(s): CC0-1.0
cornell-moviedialog-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  /content/cornell-moviedialog-corpus.zip
replace cornell-moviedialog-corpus/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [7]:
# Both corpus files separate their fields with the literal token "+++$+++".
# The separator is a regular expression, so it is written as a raw string: in a
# plain string "\+" and "\$" are invalid escape sequences, which recent Python
# versions warn about. The pattern handed to pandas is unchanged.
# NOTE(review): the blanks surrounding the token appear to stay inside the parsed
# fields (the line-ID lookup later strips spaces) - confirm before relying on it.
lines = pd.read_csv(
    '/content/cornell-moviedialog-corpus/movie_lines.txt',
    sep=r'\+\+\+\$\+\+\+',
    engine='python',  # regex separators are only supported by the python engine
    names=["lineID", "characterID", "movieID", "character", "text"],
    encoding='ISO-8859-1',
)

conversations = pd.read_csv(
    '/content/cornell-moviedialog-corpus/movie_conversations.txt',
    sep=r'\+\+\+\$\+\+\+',
    engine='python',
    names=["character1ID", "character2ID", "movieID", "utteranceIDs"],
    encoding='ISO-8859-1',
)

In [5]:
# Preview the parsed movie lines (pandas abbreviates long frames when rendering).
lines

Unnamed: 0,lineID,characterID,movieID,character,text
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
...,...,...,...,...,...
304708,L666371,u9030,m616,DURNFORD,Lord Chelmsford seems to want me to stay back...
304709,L666370,u9034,m616,VEREKER,I'm to take the Sikali with the main column t...
304710,L666369,u9030,m616,DURNFORD,"Your orders, Mr Vereker?"
304711,L666257,u9030,m616,DURNFORD,"Good ones, yes, Mr Vereker. Gentlemen who can..."


In [6]:
# Preview the parsed conversations: one row per exchange between two characters.
conversations

Unnamed: 0,character1ID,character2ID,movieID,utteranceIDs
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"
...,...,...,...,...
83092,u9028,u9031,m616,"['L666324', 'L666325', 'L666326', 'L666327']"
83093,u9028,u9031,m616,"['L666575', 'L666576']"
83094,u9030,u9034,m616,"['L666256', 'L666257']"
83095,u9030,u9034,m616,"['L666369', 'L666370', 'L666371', 'L666372']"


# **Preprocess**

In [19]:
def analyze_data(data):
    """Print how many missing (NaN) entries each column of ``data`` contains.

    Args:
        data: object exposing ``isna()`` whose result supports ``sum()``
            (a pandas DataFrame in this notebook).

    Returns:
        None. The per-column counts are written to standard output.
    """
    # Count first, so nothing is printed if the computation itself fails.
    na_counts = data.isna().sum()
    print("Missing values per column:", na_counts, sep="\n")

In [20]:
# Report the missing values of both raw tables, lines first, then conversations.
for frame in (lines, conversations):
    analyze_data(frame)

Missing values per column:
lineID           0
characterID      0
movieID          0
character        0
text           267
dtype: int64
Missing values per column:
character1ID    0
character2ID    0
movieID         0
utteranceIDs    0
dtype: int64


In [21]:
# Discard every row that has a missing value in any column, in both tables.
lines, conversations = lines.dropna(), conversations.dropna()

In [22]:
def clean_text(text):
    """Normalise one utterance for the chatbot.

    The text is lower-cased and every run of characters that are not ASCII
    letters or digits is collapsed into a single space. Punctuation at either
    end therefore leaves a leading/trailing space in the result.

    Args:
        text: the raw utterance (must be a ``str``).

    Returns:
        The normalised string.
    """
    return re.sub(r"[^a-zA-Z0-9]+", " ", text.lower())

# assign() returns a new frame with the cleaned column, so the frame previously
# bound to `lines` is left untouched.
lines = lines.assign(text=lines['text'].apply(clean_text))

In [11]:
# Re-inspect the lines after cleaning: text is lower-cased and punctuation replaced by spaces.
lines

Unnamed: 0,lineID,characterID,movieID,character,text
0,L1045,u0,m0,BIANCA,they do not
1,L1044,u2,m0,CAMERON,they do to
2,L985,u0,m0,BIANCA,i hope so
3,L984,u2,m0,CAMERON,she okay
4,L925,u0,m0,BIANCA,let s go
...,...,...,...,...,...
304708,L666371,u9030,m616,DURNFORD,lord chelmsford seems to want me to stay back...
304709,L666370,u9034,m616,VEREKER,i m to take the sikali with the main column t...
304710,L666369,u9030,m616,DURNFORD,your orders mr vereker
304711,L666257,u9030,m616,DURNFORD,good ones yes mr vereker gentlemen who can ri...


In [12]:
# At this point each utteranceIDs entry is still the textual form of a list;
# it is parsed into a real list with ast.literal_eval further down.
conversations['utteranceIDs']

Unnamed: 0,utteranceIDs
0,"['L194', 'L195', 'L196', 'L197']"
1,"['L198', 'L199']"
2,"['L200', 'L201', 'L202', 'L203']"
3,"['L204', 'L205', 'L206']"
4,"['L207', 'L208']"
...,...
83092,"['L666324', 'L666325', 'L666326', 'L666327']"
83093,"['L666575', 'L666576']"
83094,"['L666256', 'L666257']"
83095,"['L666369', 'L666370', 'L666371', 'L666372']"


In [23]:
# Map each line ID (with its blanks removed) to the corresponding cleaned text.
# Walking the two columns in parallel gives the same dictionary as iterrows()
# without building a Series object for each of the ~300k rows.
id2line = {
    line_id.replace(" ", ""): text
    for line_id, text in zip(lines['lineID'], lines['text'])
}

In [24]:
# Turn the textual list representation (e.g. "['L194', 'L195']") into a real list.
# Only strings are parsed: ast.literal_eval rejects values that are already lists,
# so without the guard this cell would fail when it is run a second time.
# strip() removes any padding left around the value by the field separator.
conversations['utteranceIDs'] = conversations['utteranceIDs'].apply(
    lambda ids: ast.literal_eval(ids.strip()) if isinstance(ids, str) else ids
)

In [25]:
# Build (input, response) training pairs: inside a conversation every utterance
# is paired with the utterance that follows it.
pairs = []
errors = 0  # number of pairs skipped because one of the two line IDs is unknown
for conv in conversations['utteranceIDs']:
    for prompt_id, reply_id in zip(conv, conv[1:]):
        try:
            pairs.append((id2line[prompt_id], id2line[reply_id]))
        except KeyError:
            # The ID has no entry in id2line (e.g. its row was removed by the
            # earlier dropna). Skip the pair, but keep track of how many are lost
            # instead of silently hiding every kind of exception.
            errors += 1
pairs_df = pd.DataFrame(pairs, columns=['input', 'response'])

In [16]:
# Preview the (input, response) pairs built from consecutive utterances.
pairs_df

Unnamed: 0,input,response
0,can we make this quick roxanne korrine and an...,well i thought we d start with pronunciation ...
1,well i thought we d start with pronunciation ...,not the hacking and gagging and spitting part...
2,not the hacking and gagging and spitting part...,okay then how bout we try out some french cui...
3,you re asking me out that s so cute what s yo...,forget it
4,no no it s my fault we didn t have a proper i...,cameron
...,...,...
221277,your orders mr vereker,i m to take the sikali with the main column t...
221278,i m to take the sikali with the main column t...,lord chelmsford seems to want me to stay back...
221279,lord chelmsford seems to want me to stay back...,i think chelmsford wants a good man on the bo...
221280,well i assure you sir i have no desire to cre...,and i assure you you do not in fact i d be ob...


# **Sample Data (for experiments)**

In [17]:
# Keep a fixed-seed random subset so the experiments stay small and repeatable.
# min() prevents sample() from raising when fewer than 10000 pairs are available.
pairs_df = pairs_df.sample(n=min(10000, len(pairs_df)), random_state=42)

# **Seq2Seq Model**

## Tokenization

In [18]:
# Prompts and replies as plain Python lists, in DataFrame row order.
input_texts = pairs_df['input'].tolist()
response_texts = pairs_df['response'].tolist()

# One tokenizer (and therefore one vocabulary) is fitted on prompts and replies
# together. filters='' disables the tokenizer's own character filtering.
tokenizer = Tokenizer(filters='', lower=True)
tokenizer.fit_on_texts(input_texts + response_texts)

encoder_input_sequences = tokenizer.texts_to_sequences(input_texts)
decoder_input_sequences = tokenizer.texts_to_sequences(response_texts)

# Encoder and decoder share the vocabulary, hence the identical sizes.
# The +1 presumably accounts for index 0, which word_index does not use - TODO confirm.
num_decoder_tokens = num_encoder_tokens = len(tokenizer.word_index) + 1

## Padding the sequences

In [19]:
# The longest sequence on each side determines the padded width.
max_encoder_seq_length = max(map(len, encoder_input_sequences))
max_decoder_seq_length = max(map(len, decoder_input_sequences))

# padding='post' appends the padding after the tokens of each sequence.
encoder_input_data = pad_sequences(
    encoder_input_sequences,
    padding='post',
    maxlen=max_encoder_seq_length,
)
decoder_input_data = pad_sequences(
    decoder_input_sequences,
    padding='post',
    maxlen=max_decoder_seq_length,
)

In [20]:
# Number of pairs kept after sampling.
len(pairs_df)

10000

## Create Data

In [21]:
import numpy as np
from scipy.sparse import csr_matrix

# Build the one-hot decoder targets as a sparse matrix with one row per
# (sample, timestep) pair, flattened in row-major order:
#     row = sample_index * max_decoder_seq_length + timestep
# The target at timestep t-1 is the decoder token at timestep t, so the first
# token of every sequence is dropped and the last timestep row stays all-zero.
# Index arithmetic on whole arrays replaces the nested Python loop; the entries
# and their order are the same.
shifted_tokens = decoder_input_data[:, 1:]
sample_offsets = np.arange(shifted_tokens.shape[0])[:, None] * max_decoder_seq_length
timesteps = np.arange(shifted_tokens.shape[1])[None, :]

rows = (sample_offsets + timesteps).ravel()
cols = shifted_tokens.ravel()  # the token id selects the column that is set
data = np.ones(rows.shape[0])

num_samples = len(pairs_df)
num_rows = num_samples * max_decoder_seq_length
num_cols = num_decoder_tokens

sparse_decoder_target_data = csr_matrix((data, (rows, cols)), shape=(num_rows, num_cols))

In [22]:
# The sparse matrix has one row per (sample, timestep) pair, so toarray() alone
# yields a 2-D array of shape (samples * timesteps, vocabulary). The decoder emits
# one distribution per timestep, so the targets are reshaped to
# (samples, timesteps, vocabulary); rows were laid out sample-major, which makes
# a plain reshape sufficient.
# NOTE(review): the dense array can be very large for a big vocabulary.
dense_decoder_target_data = sparse_decoder_target_data.toarray().reshape(
    num_samples, max_decoder_seq_length, num_decoder_tokens
)

In [None]:
# Reference only: this whole cell is a single string literal, so none of the
# code inside it is executed.
'''
#If you have enough GPU RAM:
decoder_target_data = np.zeros((len(pairs_df), max_decoder_seq_length, num_decoder_tokens), dtype='float16')

for i, seq in enumerate(decoder_input_data):
    for t, token in enumerate(seq):
        if t > 0:
            decoder_target_data[i, t-1, token] = 1.0
'''

In [None]:
# Needs proper GPU RAM
# Imported here because this is the only cell that uses it.
from tensorflow.keras.layers import Embedding

latent_dim = 64

# encoder_input_data / decoder_input_data contain padded integer word ids of
# shape (samples, timesteps). The inputs are therefore declared 2-D and an
# Embedding layer maps every id to a dense vector, giving the LSTM the 3-D
# input it needs. Declaring the inputs as (None, vocabulary) would only fit
# one-hot encoded data, which this notebook never builds for the inputs.
encoder_inputs = Input(shape=(None,))
encoder_embedded = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedded)
# Only the final hidden and cell states are kept; they initialise the decoder.
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedded = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded,
                                     initial_state=encoder_states)
# One softmax over the vocabulary per decoder timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# The targets must be (samples, timesteps, vocabulary) to line up with the decoder
# output. The dense array comes from a matrix with one row per (sample, timestep),
# so it is reshaped here; this is a no-op when it already has that shape.
decoder_target_data = dense_decoder_target_data.reshape(
    len(decoder_input_data), max_decoder_seq_length, num_decoder_tokens
)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=16,
          epochs=100,
          validation_split=0.2)

# **Using NLTK Chat**

In [26]:
# Build the (pattern, responses) pairs expected by nltk's Chat.
# - Chat chooses its answer with random.choice(response), so every response has
#   to be a list of candidate strings; a bare string would be answered with a
#   single random character.
# - clean_text() turns punctuation into spaces, so texts can carry leading or
#   trailing blanks; they are stripped because Chat matches each pattern from
#   the start of the user's input.
# - Prompts that are empty after stripping are dropped: an empty pattern would
#   match every input.
pairs = [
    (prompt.strip(), [reply.strip()])
    for prompt, reply in pairs_df.itertuples(index=False, name=None)
    if prompt.strip()
]

In [None]:
# Create the NLTK Chat instance from the pairs and NLTK's `reflections` table.
chatbot = Chat(pairs, reflections)

In [None]:
# Simple console loop: read a line, answer it, stop when the user types "quit".
print("Hello! I am a chatbot. Type 'quit' to exit. (not a powerful chatbut, this is just a test to see the performance)")
while True:
    user_input = input('You: ')
    # Surrounding whitespace is ignored so that "quit " still ends the session.
    if user_input.strip().lower() == 'quit':
        print('Chatbot: Bye! Take care!')
        break
    response = chatbot.respond(user_input)
    # respond() gives None when no stored pattern matches the input; show a
    # readable fallback instead of printing "None".
    if response is None:
        response = "Sorry, I don't have an answer for that."
    print('Chatbot:', response)