# Movie Lines Chat Bot
## Import libraries

In [1]:
import spacy
import pandas as pd
import nltk
import numpy as np
import string
import collections
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import math
nlp = spacy.load("en_core_web_sm")

## Import database

In [3]:
# import lines 
url_movie_line = "https://raw.githubusercontent.com/maximecharriere/movie-chatbot/master/MovieLineChatBot/data/outLines.txt"
movie_lines = pd.read_csv(url_movie_line, sep='\+{3}\$\+{3}', engine='python', names=("First line","Reply", "Film", "Character"))

## Functions to process text

In [4]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# remove punctuation
def remove_punctuation(text):
    cleanedLines = "".join([c for c in text if c not in string.punctuation])
    return cleanedLines

# remove stop words
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

# lemmatizer
def word_lemmatizer(text):
    cleanedText = [lemmatizer.lemmatize(i) for i in text]
    return cleanedText

# Join words
def join_text(text):
    stem_text = " ".join(text)
    return stem_text

## Cleaning the text and extracting features

In [5]:
dataLen = len(movie_lines["First line"])
linesVectors = np.zeros((dataLen, 96))

for i in range(dataLen):
    if(not pd.isnull(movie_lines["First line"][i])):
        parsedText = remove_punctuation(movie_lines["First line"][i])
        parsedText = tokenizer.tokenize(parsedText.lower())
        parsedText = remove_stopwords(parsedText)
        parsedText = join_text(parsedText)
        line = nlp(parsedText)
        if(len(line.vector) != 0):
            linesVectors[i][:] = line.vector

## Unsupervised clustering of the text vectors

In [6]:
km_model = KMeans(n_clusters=600, init='random', max_iter=3000, algorithm='full')
km_model.fit(linesVectors)

KMeans(algorithm='full', copy_x=True, init='random', max_iter=3000,
       n_clusters=600, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

## Process user text, predict and reply

In [70]:
message  = "Spare me you excuses, why are you such a disappointment?"
parsedText = remove_punctuation(message)
parsedText = tokenizer.tokenize(parsedText.lower())
parsedText = remove_stopwords(parsedText)
parsedText = join_text(parsedText)
line = nlp(parsedText)

clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
        
clusterIndex = km_model.predict([line.vector])[0]
maxIndex = len(clustering[clusterIndex])
randIndex = np.random.randint(0, maxIndex)
lineIndex = clustering[clusterIndex][randIndex]

print("User : ", message)
print("Bot  :", movie_lines["Reply"][lineIndex], " -",  movie_lines["Character"][lineIndex], " from", movie_lines["Film"][lineIndex])

User :  Spare me you excuses, why are you such a disappointment?
Bot  :  I'm a photographer, remember?  -  PETER  from  spider-man 
