<a href="https://colab.research.google.com/github/maximecharriere/movie-chatbot/blob/master/movie-chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Chatbot cinéphile - WELCOME**
*Par Dylan **Morocutti** et Maxime **Charrière**.*

# Importing libraries

In [28]:
import numpy as np
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import treebank, nps_chat
from nltk.cluster import KMeansClusterer
import pandas as pd
import gensim
from gensim.models import Word2Vec
import sklearn
from sklearn import cluster
from sklearn import metrics
import spacy
from google.colab import files

# Report the version of every major dependency, one aligned line per library.
print("Libraries version:")
for label, module in (
    ("Numpy", np),
    ("Matplotlib", matplotlib),
    ("Tensorflow", tf),
    ("NLTK", nltk),
    ("Pandas", pd),
    ("Gensim", gensim),
    ("Sklearn", sklearn),
    ("Spacy", spacy),
):
    # "<label>:" is left-justified to 12 characters so the versions line up.
    print(f"{label + ':':<12}{module.__version__}")

Libraries version:
Numpy:      1.18.4
Matplotlib: 3.2.1
Tensorflow: 2.2.0
NLTK:       3.2.5
Pandas:     1.0.4
Gensim:     3.6.0
Sklearn:    0.22.2.post1
Spacy:      2.2.4


# Importing the data
- Source: http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [0]:
# Load the parsed movie dialogue pairs: each row holds a line and its reply.
url_movie_line = "https://raw.githubusercontent.com/maximecharriere/movie-chatbot/master/data/parsed_movie_dialogue.txt"
# The two fields are separated by the literal text "+++$+++". The separator is a
# regular expression, so it is written as a raw string: in a normal string
# `\+` and `\$` are invalid escape sequences and trigger a warning.
movie_line = pd.read_csv(
    url_movie_line,
    sep=r'\+{3}\$\+{3}',
    engine='python',
    names=("First line", "Reply"),
)

In [0]:
# Peek at one sample: the "First line" text of the row labelled 2.
movie_line.loc[2, "First line"]

'We saw it. All craft prepare to retreat.'

# Building the deep learning **model**

In [0]:
# Stack of three fully-connected layers: two ReLU hidden layers (256 and 128
# units) followed by a 2-unit softmax output. No input layer or input shape is
# declared here.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax"),
])

# NLTK Test

### Download data

In [0]:
# NLTK resources fetched for the experiments in this section, in download order.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'treebank',
    'nps_chat',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


True

In [0]:
# Sample text pushed through the NLTK pipeline: tokenise, POS-tag, then chunk
# named entities.
sentence = (
    "Your insight serves you well. "
    "Bury your feelings deep down, Luke. "
    "They do you credit. "
    "But they could be made to serve the Emperor."
)
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)

# Disabled experiment: draw a treebank parse tree and save it to 'output.ps'.
# t = treebank.parsed_sents('wsj_0001.mrg')[0]
# nltk.draw.tree.TreeView(t)._cframe.print_to_file('output.ps')

In [0]:
# Number of entries in `tagged`, the nltk.pos_tag output for the sample sentence.
len(tagged)

29

In [0]:
# List the file identifiers exposed by the NPS Chat corpus reader.
nps_chat.fileids()

['10-19-20s_706posts.xml',
 '10-19-30s_705posts.xml',
 '10-19-40s_686posts.xml',
 '10-19-adults_706posts.xml',
 '10-24-40s_706posts.xml',
 '10-26-teens_706posts.xml',
 '11-06-adults_706posts.xml',
 '11-08-20s_705posts.xml',
 '11-08-40s_706posts.xml',
 '11-08-adults_705posts.xml',
 '11-08-teens_706posts.xml',
 '11-09-20s_706posts.xml',
 '11-09-40s_706posts.xml',
 '11-09-adults_706posts.xml',
 '11-09-teens_706posts.xml']

# SpaCy test

In [75]:
# Load the movie-plot CSV (wiki_movie_plots_deduped.csv) and keep only the plot
# text and the raw genre label.
url_movie_wiki = "https://raw.githubusercontent.com/maximecharriere/movie-chatbot/master/data/movie_wiki_infos/wiki_movie_plots_deduped.csv"
movies = pd.read_csv(url_movie_wiki, sep=',', engine='python')[['Plot','Genre']]

# Only the last expression of a cell is displayed: a bare `movies.shape` placed
# before `movies.head()` is evaluated and thrown away, so print it explicitly.
print(f"movies shape: {movies.shape}")
movies.head()

Unnamed: 0,Plot,Genre
0,"A bartender is working at a saloon, serving dr...",unknown
1,"The moon, painted with a smiling face hangs ov...",unknown
2,"The film, just over a minute long, is composed...",unknown
3,Lasting just 61 seconds and consisting of two ...,unknown
4,The earliest known adaptation of the classic f...,unknown


In [76]:
# Frequency of each raw 'Genre' value (the output below is ordered from most to least frequent).
movies['Genre'].value_counts()

unknown                                                   6083
drama                                                     5964
comedy                                                    4379
horror                                                    1167
action                                                    1098
                                                          ... 
biju menon, krishnashankar, samskruti shenoy                 1
adventure, romance                                           1
action / science fiction                                     1
animation, musical                                           1
drama based on a novel and screenplay by emma donoghue       1
Name: Genre, Length: 2265, dtype: int64

In [0]:
# Ordered (old, new) substring substitutions that turn the free-text 'Genre'
# column into a small, consistent vocabulary.
#
# ORDER MATTERS: every rule is applied to the output of the previous ones, so a
# longer pattern must come before any shorter pattern it contains
# (e.g. 'movies' before 'movie', 'world war ii' before 'world war i').
# Matching is on plain substrings, not whole words, so a short pattern such as
# 'war' or 'sex' is also replaced when it occurs inside a longer word.
GENRE_SUBSTITUTIONS = [
    # Treat '/' as a separator between genres
    ('/', ' '),

    # Delete unwanted categories
    ('unknown', ''),
    ('[not in citation given]', ''),
    ('avant-garde', ''),
    ('period', ''),
    ('national', ''),
    ('board', ''),
    ('costume', ''),
    ('movies', ''),
    ('movie', ''),
    ('films', ''),
    ('film', ''),

    # Rename categories
    ('bio-pic', 'biography'),
    ('biopic', 'biography'),
    ('biographical', 'biography'),
    ('biodrama', 'biography drama'),
    ('bio-drama', 'biography drama'),
    ('biographic', 'biography'),
    ('animated', 'animation'),
    ('anime', 'animation'),
    ('disaster', 'action'),
    ('comedey', 'comedy'),
    ('dramedy', 'comedy drama'),
    ('mockumentary', 'documentary'),
    # 'prehistoric' must run before 'historic': otherwise 'historic' rewrites it
    # to 'prehistory' first and this rule can never match.
    ('prehistoric', 'history'),
    ('historical', 'history'),
    ('historic', 'history'),
    ('mythology', 'history'),
    ('romantic', 'romance'),
    ('3-d', 'animation'),
    ('3d', 'animation'),
    ('sci-fi', 'science_fiction'),
    ('ttriller', 'thriller'),
    ('sci fi', 'science_fiction'),
    ('science fiction', 'science_fiction'),
    ('kung fu', 'martial_arts'),
    ('kung-fu', 'martial_arts'),
    ('martial arts', 'martial_arts'),
    ('bruceploitation', 'martial_arts'),
    ('comedy-drama adaptation of the mordecai richler novel', 'comedy drama'),
    ('stop-motion', 'animation'),
    ('coming of age', 'coming_of_age'),
    ('coming-of-age', 'coming_of_age'),
    ('drama about child soldiers', 'drama'),
    ('melodrama', 'drama'),
    ('tv miniseries', 'serial'),
    ('television miniseries', 'serial'),
    ('war-time', 'war'),
    ('wartime', 'war'),
    ('world war ii', 'war'),
    ('world war i', 'war'),
    ('tokusatsu', 'action'),
    ('ww1', 'war'),
    ('wwii', 'war'),
    ('superheroes', 'superhero'),
    ('superheroe', 'superhero'),
    ('familya', 'family'),
    ('familydrama', 'family drama'),
    # NOTE(review): the replacement drops the trailing space of the pattern, so
    # the following word is glued on ('famil x' -> 'familyx'). Kept as-is;
    # confirm whether 'family ' was intended.
    ('famil ', 'family'),
    ('espionage', 'spy'),
    ('noir', 'black'),
    ('dark', 'black'),
    ('psychological', 'psycho'),
    ('rom-com', 'romance'),
    ('gun fu', 'martial_arts'),
    ('docudrama', 'documentary drama'),
    ('football', 'sports'),
    ('martial art', 'martial_arts'),
    ('erotica', 'adult'),
    ('erotic', 'adult'),
    ('musical', 'music'),
    ('gangster', 'action'),
    ('swashbuckler', 'action'),
    ('police', 'action'),
    ('dance', 'sport'),
    ('sports', 'sport'),
    ('muslim', 'religious'),
    ('propaganda', 'political'),
    ('samurai', 'action'),
    ('sexploitation', 'adult'),
    ('homosexual', 'adult'),
    ('socio', 'social'),
    ('sexual', 'adult'),
    ('sex', 'adult'),

    # Merge some categories
    ('family', 'child'),
    ('martial_arts', 'action'),
    ('horror', 'thriller'),
    ('war', 'action'),
    ('adventure', 'action'),
    ('science_fiction', 'action'),
    ('spy', 'action'),
    ('superhero', 'action'),
    ('suspense', 'action'),
    ('thriller', 'action'),
    ('epic', 'action'),
    ('crime', 'action'),
    ('western', 'action'),
    ('music', 'cultural'),
    ('sport', 'cultural'),
    ('biography', 'cultural'),
    ('history', 'cultural'),
    ('documentary', 'cultural'),
    ('political', 'cultural'),
    ('science', 'cultural'),
    ('psycho', 'cultural'),
    # NOTE(review): this runs after 'thriller' -> 'action', so 'mystery' ends up
    # as 'thriller' while 'horror' ends up as 'action'. Kept as-is; confirm.
    ('mystery', 'thriller'),
]


def normalize_genres(genres, substitutions=GENRE_SUBSTITUTIONS):
    """Apply the ordered substring substitutions to a Series of genre strings.

    Parameters
    ----------
    genres : pandas.Series
        Raw genre labels (string values).
    substitutions : iterable of (str, str)
        `(old, new)` pairs, applied one after another in the given order.

    Returns
    -------
    pandas.Series
        A new Series with every substitution applied; `genres` is not modified.
    """
    corrected = genres
    for old, new in substitutions:
        # regex=False: every pattern is plain text. Being explicit keeps the
        # result independent of the pandas version's default for `regex`.
        corrected = corrected.str.replace(old, new, regex=False)
    return corrected


# Always start again from the raw 'Genre' column so re-running the cell gives
# the same result.
movies['GenreCorrected'] = normalize_genres(movies['Genre'])


In [92]:
nlp = spacy.load("en_core_web_sm")


def lemmatize_genre(genre):
    """Return the sorted, de-duplicated lemmas of one corrected genre string.

    The string is run through the spaCy pipeline `nlp`; punctuation, digits,
    whitespace and stop-word tokens are dropped, and the remaining lemmas are
    lower-cased. The result is the array produced by `np.unique` (empty when no
    token is left).
    """
    doc = nlp(genre)
    lemmas = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_digit or token.is_space or token.is_stop)
    ]
    return np.unique(np.array(lemmas))


# Process each distinct genre string once, then look the result up per row.
# The previous version rescanned the whole column for every distinct genre and
# wrote rows back using positions from np.where as `.at` labels, which is only
# correct when the index is the default 0..n-1 range.
# Rows with the same genre string share the same array object.
lemmas_by_genre = {
    genre: lemmatize_genre(genre)
    for genre in movies['GenreCorrected'].unique()
}
movies['GenreListed'] = movies['GenreCorrected'].map(lemmas_by_genre.get)

movies.head(20)



Unnamed: 0,Plot,Genre,GenreCorrected,GenreListed
0,"A bartender is working at a saloon, serving dr...",unknown,,[]
1,"The moon, painted with a smiling face hangs ov...",unknown,,[]
2,"The film, just over a minute long, is composed...",unknown,,[]
3,Lasting just 61 seconds and consisting of two ...,unknown,,[]
4,The earliest known adaptation of the classic f...,unknown,,[]
5,"Alice follows a large white rabbit down a ""Rab...",unknown,,[]
6,The film opens with two bandits breaking into ...,western,action,[action]
7,The film is about a family who move to the sub...,comedy,comedy,[comedy]
8,The opening scene shows the interior of the ro...,unknown,,[]
9,Scenes are introduced using lines of the poem....,unknown,,[]


In [0]:
# Save the raw, corrected and tokenised genre columns to a CSV file, then hand
# that file to google.colab's `files.download`.
complete_genres_csv = 'completGenres.csv'
genre_columns = ['Genre', 'GenreCorrected', 'GenreListed']
movies[genre_columns].to_csv(complete_genres_csv)
files.download(complete_genres_csv)

In [0]:
# Flatten the per-movie genre arrays of 'GenreListed' into one 1-D array.
# Calling np.concatenate inside a loop copies the whole accumulated array on
# every iteration; gather the pieces first and concatenate once.
# Empty arrays are skipped since they contribute no elements.
non_empty_genres = [genre for genre in movies['GenreListed'] if len(genre) > 0]

# np.concatenate rejects an empty list, so fall back to an empty array.
genres_array = np.concatenate(non_empty_genres) if non_empty_genres else np.array([])

In [0]:
# Count how many times each genre token occurs: give every token a count of 1,
# then sum the counts per genre.
genres = pd.DataFrame({'Genre': genres_array})
genres['Count'] = 1
genre_counts = genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum()

# Save the per-genre totals to a CSV file and pass it to `files.download`.
genre_counts.to_csv('uniqueGenres.csv')
files.download('uniqueGenres.csv')