<a href="https://colab.research.google.com/github/maximecharriere/movie-chatbot/blob/master/movie-chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Chatbot cinéphile - WELCOME**
*Par Dylan **Morocutti** et Maxime **Charrière**.*

# Libraries importation

In [40]:
import numpy as np
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import treebank, nps_chat
from nltk.cluster import KMeansClusterer
import pandas as pd
import gensim
from gensim.models import Word2Vec
import sklearn
from sklearn import cluster
from sklearn import metrics
import spacy
from google.colab import files

print("Libraries version:")
print(f"Numpy:      {np.__version__}")
print(f"Matplotlib: {matplotlib.__version__}")
print(f"Tensorflow: {tf.__version__}")
print(f"NLTK:       {nltk.__version__}")
print(f"Pandas:     {pd.__version__}")
print(f"Gensim:     {gensim.__version__}")
print(f"Sklearn:    {sklearn.__version__}")
print(f"Spacy:      {spacy.__version__}")

Libraries version:
Numpy:      1.18.4
Matplotlib: 3.2.1
Tensorflow: 2.2.0
NLTK:       3.2.5
Pandas:     1.0.4
Gensim:     3.6.0
Sklearn:    0.22.2.post1
Spacy:      2.2.4


# Importation des data
- Source: http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [0]:
url_movie_line = "https://raw.githubusercontent.com/maximecharriere/movie-chatbot/master/data/parsed_movie_dialogue.txt"
movie_line = pd.read_csv(url_movie_line, sep='\+{3}\$\+{3}', engine='python', names=("First line","Reply"))

In [0]:
movie_line["First line"][2]

'We saw it. All craft prepare to retreat.'

# Building DeepLearning **model**

In [0]:
# Flatten
model = tf.keras.models.Sequential()
# Add layers
model.add(tf.keras.layers.Dense(256, activation="relu"))
model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Dense(2, activation="softmax"))

# NLTK Test

### Download data

In [0]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('treebank')
nltk.download('nps_chat')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


True

In [0]:
sentence = """Your insight serves you well. Bury your feelings deep down, Luke. They do you credit. But they could be made to serve the Emperor."""
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)
# t = treebank.parsed_sents('wsj_0001.mrg')[0]
# nltk.draw.tree.TreeView(t)._cframe.print_to_file('output.ps')

In [0]:
len(tagged)

29

In [0]:
nps_chat.fileids()

['10-19-20s_706posts.xml',
 '10-19-30s_705posts.xml',
 '10-19-40s_686posts.xml',
 '10-19-adults_706posts.xml',
 '10-24-40s_706posts.xml',
 '10-26-teens_706posts.xml',
 '11-06-adults_706posts.xml',
 '11-08-20s_705posts.xml',
 '11-08-40s_706posts.xml',
 '11-08-adults_705posts.xml',
 '11-08-teens_706posts.xml',
 '11-09-20s_706posts.xml',
 '11-09-40s_706posts.xml',
 '11-09-adults_706posts.xml',
 '11-09-teens_706posts.xml']

# SpaCy test

In [174]:
url_movie_wiki = "https://raw.githubusercontent.com/maximecharriere/movie-chatbot/master/data/movie_wiki_infos/wiki_movie_plots_deduped.csv"
movies = pd.read_csv(url_movie_wiki, sep=',', engine='python')[['Genre','Plot']]
movies.shape
movies.head()

Unnamed: 0,Genre,Plot
0,unknown,"A bartender is working at a saloon, serving dr..."
1,unknown,"The moon, painted with a smiling face hangs ov..."
2,unknown,"The film, just over a minute long, is composed..."
3,unknown,Lasting just 61 seconds and consisting of two ...
4,unknown,The earliest known adaptation of the classic f...


In [175]:
movies['Genre'].value_counts()

unknown                             6083
drama                               5964
comedy                              4379
horror                              1167
action                              1098
                                    ... 
comedy, musical, romance, family       1
action drama, bio-pic, thriller        1
wuxia/historical/romance               1
horror, action comedy                  1
drama, patriotic, social               1
Name: Genre, Length: 2265, dtype: int64

In [0]:
# harmonization
movies['GenreCorrected'] =movies['Genre'] 
# movies['GenreCorrected']=movies['GenreCorrected'].str.strip()
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' - ', '|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' / ', '|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('/', '|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' & ', '|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(', ', '|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('; ', '|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('unknown', '')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bio-pic', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biopic', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biographical', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biodrama', 'biography | drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bio-drama', 'biography | drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biographic', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(film genre\)', '')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animated','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('anime','animation')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('children\'s','children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedey','comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\[not in citation given\]','')


movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' set 4,000 years ago in the canadian arctic','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historical','history')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romantic','romance')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('3-d','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('3d','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sci-fi','science_fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('ttriller','thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('.','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sci fi','science_fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science fiction','science_fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('kung fu','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('kung-fu','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial arts','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('world war ii','war')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('world war i','war')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('spy film','spy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('avant-garde','avant_garde')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('2-reeler','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('films','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('film','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('british-german co-production','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bruceploitation','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedy-drama adaptation of the mordecai richler novel','comedy drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movies by the mob\|knkspl','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movies','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movie','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('coming of age','coming_of_age')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('coming-of-age','coming_of_age')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('drama about child soldiers','drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( based).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( co-produced).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( adapted).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( about).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('musical b','musical')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' period','period')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('drama loosely','drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace("yogesh dattatraya gosavi's directorial debut \[9\]",'')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace("war-time","war")
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace("wartime","war")
movies['GenreCorrected']=movies['GenreCorrected'].str.replace("tokusatsu","action")

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace("ww1","war")
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('unknown','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace("wwii","war")
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('psychological','psycho')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('rom-coms','romance')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('true crime','crime')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|007','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('slice of life','slice_of_life')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('computer animation','animation')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('gun fu','martial_arts')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('j-horror','horror')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(shogi|chess\)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('afghan war drama','war drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|6 separate stories','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(30min\)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' (road bicycle racing)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' v-cinema','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('tv miniseries','tv_miniseries')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|docudrama','\|documentary|drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' in animation','|animation')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adaptation).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adaptated).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adapted).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( on ).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('american football','sports')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dev\|nusrat jahan','sports')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('television miniseries','tv_miniseries')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(artistic\)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \|direct-to-dvd','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('history dram','history drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial art','martial_arts')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('psycho thriller,','psycho thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|1 girl\|3 suitors','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(road bicycle racing\)','')
# filterE = movies['GenreCorrected']=="ero"
# movies.loc[filterE,'GenreCorrected']="adult"
# filterE = movies['GenreCorrected']=="music"
# movies.loc[filterE,'GenreCorrected']="musical"
# filterE = movies['GenreCorrected']=="-"
# movies.loc[filterE,'GenreCorrected']=''
# filterE = movies['GenreCorrected']=="comedy–drama"
# movies.loc[filterE,'GenreCorrected'] = "comedy|drama"
# filterE = movies['GenreCorrected']=="comedy–horror"
# movies.loc[filterE,'GenreCorrected'] = "comedy|horror"
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' ','|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace(',','|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('-','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionadventure','action|adventure')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actioncomedy','action|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actiondrama','action|drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionlove','action|love')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionmasala','action|masala')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionchildren','action|children')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasychildren\|','fantasy|children')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasycomedy','fantasy|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasyperiod','fantasy|period')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('cbctv_miniseries','tv_miniseries')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramacomedy','drama|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramacomedysocial','drama|comedy|social')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramathriller','drama|thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedydrama','comedy|drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramathriller','drama|thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedyhorror','comedy|horror')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sciencefiction','science_fiction')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventurecomedy','adventure|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animationdrama','animation|drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\|','|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('muslim','religious')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('thriler','thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('crimethriller','crime|thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantay','fantasy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionthriller','action|thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedysocial','comedy|social')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martialarts','martial_arts')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\(children\|poker\|karuta\)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('epichistory','epic|history')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('erotica','adult')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('erotic','adult')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((\|produced\|).+)','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('chanbara','chambara')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedythriller','comedy|thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biblical','religious')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biblical','religious')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('colour\|yellow\|productions\|eros\|international','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|directtodvd','')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('liveaction','live|action')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('melodrama','drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superheroes','superheroe')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('gangsterthriller','gangster|thriller')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('heistcomedy','comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('heist','action')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historic','history')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historydisaster','history|disaster')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('warcomedy','war|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('westerncomedy','western|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('ancientcostume','costume')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('computeranimation','animation')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramatic','drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familya','family')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familya','family')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramedy','drama|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramaa','drama')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('famil\|','family')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superheroe','superhero')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biogtaphy','biography')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('devotionalbiography','devotional|biography')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('docufiction','documentary|fiction')

movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familydrama','family|drama')

movies['GenreCorrected']=movies['GenreCorrected'].str.replace('espionage','spy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('supeheroes','superhero')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancefiction','romance|fiction')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('horrorthriller','horror|thriller')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('suspensethriller','suspense|thriller')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('musicaliography','musical|biography')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('triller','thriller')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\(fiction\)','|fiction')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romanceaction','romance|action')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancecomedy','romance|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancehorror','romance|horror')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romcom','romance|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('rom\|com','romance|comedy')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('satirical','satire')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science_fictionchildren','science_fiction|children')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('homosexual','adult')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sexual','adult')

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('mockumentary','documentary')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('periodic','period')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romanctic','romantic')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('politics','political')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('samurai','martial_arts')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('tv_miniseries','series')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('serial','series')

# filterE = movies['GenreCorrected']=="musical–comedy"
# movies.loc[filterE,'GenreCorrected'] = "musical|comedy"

# filterE = movies['GenreCorrected']=="roman|porno"
# movies.loc[filterE,'GenreCorrected'] = "adult"


# filterE = movies['GenreCorrected']=="action—masala"
# movies.loc[filterE,'GenreCorrected'] = "action|masala"


# filterE = movies['GenreCorrected']=="horror–thriller"
# movies.loc[filterE,'GenreCorrected'] = "horror|thriller"

movies['GenreCorrected']=movies['GenreCorrected'].str.replace('family','children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial_arts','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('horror','thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('war','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventure','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science_fiction','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('western','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('noir','black')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('spy','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superhero','action')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('social','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('suspense','action')
movies['GenreCorrected']=movies['GenreCorrected'].replace(to_replace=r'\d+',value=' ', regex=True)

# filterE = movies['GenreCorrected']=="drama|romance|adult|children"
# movies.loc[filterE,'GenreCorrected'] = "drama|romance|adult"

# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|–\|','|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.strip(to_strip='\|')
# movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionner','action')
# movies['GenreCorrected']=movies['GenreCorrected'].str.strip()

In [187]:
nlp = spacy.load("en_core_web_sm")

movies['GenreListed'] = movies['GenreCorrected']

for genre in movies['GenreCorrected'].unique():
  tokens = nlp(genre)
  listed_genre = [token.lemma_ for token in tokens if not token.is_punct and not token.is_digit]
  for i in np.where(movies['GenreCorrected']==genre)[0]:
    movies.at[i,'GenreListed'] = np.array(listed_genre)



movies.head(20)



Unnamed: 0,Genre,Plot,GenreCorrected,GenreListed
0,unknown,"A bartender is working at a saloon, serving dr...",,[]
1,unknown,"The moon, painted with a smiling face hangs ov...",,[]
2,unknown,"The film, just over a minute long, is composed...",,[]
3,unknown,Lasting just 61 seconds and consisting of two ...,,[]
4,unknown,The earliest known adaptation of the classic f...,,[]
5,unknown,"Alice follows a large white rabbit down a ""Rab...",,[]
6,western,The film opens with two bandits breaking into ...,action,[action]
7,comedy,The film is about a family who move to the sub...,comedy,[comedy]
8,unknown,The opening scene shows the interior of the ro...,,[]
9,unknown,Scenes are introduced using lines of the poem....,,[]


In [196]:
x = np.array(movies.loc[10:15,'GenreListed'])
np.where(x == ['short'])

  


(array([], dtype=int64),)

In [0]:
genres_array = np.array([])

for genre in movies['GenreListed']:
    genres_array = np.concatenate((genres_array, genre ))
    


In [0]:
genres = pd.DataFrame({'Genre':genres_array})
genres['Count']=1
genres[['Genre','Count']].groupby(['Genre'], as_index=False).sum().to_csv('genres.csv')

In [0]:
files.download('genres.csv') 

In [130]:
# movies_lite.loc[(movies_lite['GenreCorrected']=='unknown'),'Genre'] = ['tablea1','tab2','dsa3']
# 
x = np.empty(3,dtype=object)

x[1] = ['asd','asd']
x

array([None, list(['asd', 'asd']), None], dtype=object)

In [102]:
x = np.arange(10, dtype=object)
# x[2:4] = ['asd','sadfgsdf','sdfg','456ssd','sdfg']

for i, cell in enumerate(x[2:4]):
  x[i] = ['asd','sadfgsdf','sdfg','456ssd','sdfg']

x

array([list(['asd', 'sadfgsdf', 'sdfg', '456ssd', 'sdfg']),
       list(['asd', 'sadfgsdf', 'sdfg', '456ssd', 'sdfg']), 2, 3, 4, 5, 6,
       7, 8, 9], dtype=object)

In [0]:
files.download('GenreCorrected.csv') 

In [0]:
movies['GenreSplit']=movies['GenreCorrected'].str.split('|')
# movies['GenreSplit']= movies['GenreSplit'].apply(np.sort).apply(np.unique)

In [0]:
movies['GenreSplit']

0                [unknown]
1                [unknown]
2                [unknown]
3                [unknown]
4                [unknown]
               ...        
34881            [unknown]
34882             [comedy]
34883             [comedy]
34884    [romantic comedy]
34885           [romantic]
Name: GenreSplit, Length: 34886, dtype: object

In [0]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, world. Here are two sentences.")
print([t.text for t in doc])


['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']


In [0]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I'd like to tell you that I love you. I can't go to the party. Goole is very good..")
print([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])


['like', 'tell', 'love', 'party', 'Goole', 'good']


In [0]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("science_fiction")
print([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

['science_fiction']


In [0]:
df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
                   'B': [5, 6, 0, 8, 9],
                   'C': ['a', 'b', 'c', 'd', 'e']})
df['C'].replace('a', 5, inplace = True)
df

Unnamed: 0,A,B,C
0,0,5,5
1,1,6,b
2,2,0,c
3,3,8,d
4,4,9,e
