<a href="https://colab.research.google.com/github/maximecharriere/movie-chatbot/blob/master/movie-chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Chatbot cinéphile - WELCOME**
*Par Dylan **Morocutti** et Maxime **Charrière**.*

# Importing libraries

In [8]:
import numpy as np
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import treebank, nps_chat
from nltk.cluster import KMeansClusterer
import pandas as pd
import gensim
from gensim.models import Word2Vec
import sklearn
from sklearn import cluster
from sklearn import metrics
import spacy

# Report the version of every main dependency so a run can be reproduced later.
print("Libraries version:")
for lib_label, lib_module in (
    ("Numpy", np),
    ("Matplotlib", matplotlib),
    ("Tensorflow", tf),
    ("NLTK", nltk),
    ("Pandas", pd),
    ("Gensim", gensim),
    ("Sklearn", sklearn),
    ("Spacy", spacy),
):
    # Pad "<label>:" to 12 columns so the version numbers line up.
    heading = f"{lib_label}:"
    print(f"{heading:<12}{lib_module.__version__}")

Libraries version:
Numpy:      1.18.4
Matplotlib: 3.2.1
Tensorflow: 2.2.0
NLTK:       3.2.5
Pandas:     1.0.4
Gensim:     3.6.0
Sklearn:    0.22.2.post1
Spacy:      2.2.4


# Importation des data
- Source: http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [0]:
# Load the pre-parsed movie dialogue pairs straight from the project repository.
url_movie_line = "https://raw.githubusercontent.com/maximecharriere/movie-chatbot/master/data/parsed_movie_dialogue.txt"
# The two fields of each row are split on the literal marker "+++$+++".
# `sep` is interpreted as a regular expression, so it is written as a raw string:
# "\+" and "\$" are invalid escape sequences in a normal string literal and
# trigger a warning there. A regex separator requires the python parsing engine.
movie_line = pd.read_csv(
    url_movie_line,
    sep=r"\+{3}\$\+{3}",
    engine="python",
    names=("First line", "Reply"),
)

In [0]:
# Sanity check of the load: show the "First line" entry at index 2.
movie_line["First line"][2]

'We saw it. All craft prepare to retreat.'

# Building the deep learning **model**

In [0]:
# Small fully connected classifier: two ReLU hidden layers followed by a
# 2-unit softmax output. No Flatten layer is added and no input shape is
# given here, so the expected input size is not fixed by this cell.
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(2, activation="softmax"),
    ]
)

# NLTK Test

### Download data

In [5]:
# Fetch the NLTK data packages used by the experiments in this section:
# a tokenizer, a POS tagger, a named-entity chunker with its word list,
# and two corpora (treebank, nps_chat).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('treebank')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('nps_chat')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


True

In [0]:
# Run a sample piece of dialogue through NLTK: tokenize -> POS-tag -> NE-chunk.
sentence = (
    "Your insight serves you well. "
    "Bury your feelings deep down, Luke. "
    "They do you credit. "
    "But they could be made to serve the Emperor."
)
tokens = nltk.word_tokenize(sentence)  # split into word / punctuation tokens
tagged = nltk.pos_tag(tokens)          # attach a part-of-speech tag to each token
entities = nltk.ne_chunk(tagged)       # group tagged tokens into named-entity chunks
# Disabled experiment: render a treebank parse tree to a PostScript file.
# t = treebank.parsed_sents('wsj_0001.mrg')[0]
# nltk.draw.tree.TreeView(t)._cframe.print_to_file('output.ps')

In [0]:
# Number of tagged tokens produced for the sample sentence.
len(tagged)

29

In [6]:
# List the file identifiers available in the NPS Chat corpus.
nps_chat.fileids()

['10-19-20s_706posts.xml',
 '10-19-30s_705posts.xml',
 '10-19-40s_686posts.xml',
 '10-19-adults_706posts.xml',
 '10-24-40s_706posts.xml',
 '10-26-teens_706posts.xml',
 '11-06-adults_706posts.xml',
 '11-08-20s_705posts.xml',
 '11-08-40s_706posts.xml',
 '11-08-adults_705posts.xml',
 '11-08-teens_706posts.xml',
 '11-09-20s_706posts.xml',
 '11-09-40s_706posts.xml',
 '11-09-adults_706posts.xml',
 '11-09-teens_706posts.xml']

# SpaCy test

In [15]:
# Load the small English spaCy pipeline and list the named entities it
# detects in a sample sentence, one "<text> <label>" pair per line.
nlp = spacy.load("en_core_web_sm")
sample_text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
doc = nlp(sample_text)

for entity in doc.ents:
    print(entity.text, entity.label_)

Sebastian NORP
Google ORG
2007 DATE


In [18]:
# Process the sample sentence again and inspect the single vector spaCy
# exposes for the whole document: first its shape, then its values.
mango = nlp('When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.')
doc_vector = mango.vector
print(doc_vector.shape)
print(doc_vector)

(96,)
[ 1.2547495   0.5018356  -0.10489882 -0.8144302   1.030638   -0.43493682
  1.4772629   0.30886003  0.7389851   1.7634985  -1.2179859  -0.06692379
 -0.29719985 -0.8395235  -1.4615259  -0.62463915 -0.07981872  0.00929824
 -0.24031505  0.25372627 -0.3475497  -0.5162379   0.10827694  0.09177113
 -0.38051963  0.52912337 -1.1400261  -0.32192612  0.6927819  -0.80070204
  0.6862533  -0.02631976 -0.24994369 -0.63626266 -0.03331006 -1.1234782
  0.01468814 -0.710911   -1.4692793  -0.32182044  0.5871417   0.1246916
  0.02204137 -0.7638084   0.32105687 -0.22490372 -0.09796767 -0.07184676
 -0.33491376  0.24576561  0.67671084 -0.19045591  0.20345376 -0.9203198
 -1.3877374   0.68930554  0.08902735  0.9616394   0.8992842   0.2098153
  0.55452806 -0.5523305   0.890179    0.5652993   0.27227864 -0.54219174
  0.5560992  -2.2867904  -0.08910669  1.7222291   1.1124057  -1.0997052
  0.50205594 -0.23024462  0.00424102 -0.27917016  1.7747836  -0.80378944
 -1.0076889  -0.577863   -0.42148054 -0.08094498  