In [28]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
import pandas as pd
import numpy as np

In [30]:
# text mining
movies_metadata = pd.read_csv('/content/drive/MyDrive/Data cinéma/movies_metadata.csv',low_memory=False,usecols=['title','imdb_id','release_date','popularity','vote_average','vote_count','overview','genres','belongs_to_collection'])
m  = movies_metadata['vote_count'].quantile(0.6)
movies_metadata = movies_metadata.copy()[movies_metadata['vote_count'] >= m]
movies_metadata.drop_duplicates(inplace=True)

# filtre les films qui n'ont pas de résumé
movies_metadata_filtre=movies_metadata[~movies_metadata['overview'].isna()]
print("On perd",len(movies_metadata)-len(movies_metadata_filtre),"films sur",len(movies_metadata))
movies_metadata_filtre.reset_index(drop=True,inplace=True)

movies_metadata_filtre['overview'].head(5)


On perd 130 films sur 18517


0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [10]:
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

tokenizer = nltk.RegexpTokenizer(r'\w+')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()
tfidf = TfidfVectorizer(stop_words='english')

def tokenize_mot_vide_lemm(a):
  # Etape 1 : tokenize
  Sac_de_mots=tokenizer.tokenize(a)
  # Etape 2 : suppression des mots vides et Etape 3 : lemmatizer
  F2 = [lemmatizer.lemmatize(x) for x in Sac_de_mots if x not in stopwords]
  return (" ".join(F2))

base_toke_vide_lemm = [tokenize_mot_vide_lemm(x) for x in movies_metadata_filtre['overview']]
tfidf_matrix = tfidf.fit_transform(base_toke_vide_lemm)
tfidf_matrix.shape

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


(18387, 40472)

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
# tfidf = TfidfVectorizer(stop_words='english')
# tfidf_matrix = tfidf.fit_transform(movies_metadata_filtre['overview'])
# tfidf_matrix.shape


In [14]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(movies_metadata_filtre.index, index=movies_metadata_filtre['title'])

In [23]:
def get_recommendations(title):
    idx = indices[title]
    df = cosine_sim[idx]
    if len(df) < len(indices):
      df = np.mean(df, axis=0)
    sim_scores = list(enumerate(df))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:(10+len(idx))]
    movie_indices = [i[0] for i in sim_scores if i[0] not in list(idx)]
    # Return the top 10 most similar movies
    return movies_metadata_filtre[['title','release_date','vote_average','vote_count']].iloc[movie_indices]

In [22]:
get_recommendations(['The Dark Knight Rises'])

Unnamed: 0,title,release_date,vote_average,vote_count
8487,The Dark Knight,2008-07-16,8.3,12269.0
111,Batman Forever,1995-06-16,5.2,1529.0
963,Batman Returns,1992-06-19,6.6,1706.0
9998,Batman: Under the Red Hood,2010-07-27,7.6,459.0
446,Batman,1989-06-23,7.0,2145.0
6408,Batman Beyond: Return of the Joker,2000-12-12,7.5,152.0
12123,Batman Unmasked: The Psychology of the Dark Kn...,2008-07-15,8.0,17.0
11601,"Batman: The Dark Knight Returns, Part 1",2012-09-06,7.7,410.0
10857,Batman: Year One,2011-09-27,7.1,255.0
4355,Q & A,1990-04-27,6.6,22.0


In [27]:
get_recommendations(['Superman','Batman','The Dark Knight Rises'])

Unnamed: 0,title,release_date,vote_average,vote_count
6408,Batman Beyond: Return of the Joker,2000-12-12,7.5,152.0
9998,Batman: Under the Red Hood,2010-07-27,7.6,459.0
111,Batman Forever,1995-06-16,5.2,1529.0
8487,The Dark Knight,2008-07-16,8.3,12269.0
12213,Batman: Mystery of the Batwoman,2003-10-21,6.6,87.0
963,Batman Returns,1992-06-19,6.6,1706.0
6376,The Batman Superman Movie: World's Finest,1998-08-18,7.1,53.0
1065,Batman & Robin,1997-06-20,4.2,1447.0
12095,Lego Batman: The Movie - DC Super Heroes Unite,2013-05-20,6.4,81.0
12123,Batman Unmasked: The Psychology of the Dark Kn...,2008-07-15,8.0,17.0


In [None]:
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

tokenizer = nltk.RegexpTokenizer(r'\w+')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def test_1():
  Sac_de_mots=tokenizer.tokenize(a)
  F1=[]
  for i in Sac_de_mots:
    if i not in stopwords:
      F1.append(lemmatizer.lemmatize(i))
  return (F1)

def test_2():
  Sac_de_mots=tokenizer.tokenize(a)
  F2 = [lemmatizer.lemmatize(x) for x in Sac_de_mots if x not in stopwords]
  return (F2)

F1 = test_1()
F2 = test_2()

print(F1)
print(F2)

#import timeit

#print(timeit.timeit("test_1()",setup="from __main__ import test_1",number=10000))
#print(timeit.timeit("test_2()",setup="from __main__ import test_2",number=10000))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['Babe', 'little', 'pig', 'quite', 'know', 'place', 'world', 'With', 'bunch', 'odd', 'friend', 'like', 'Ferdinand', 'duck', 'think', 'rooster', 'Fly', 'dog', 'call', 'mom', 'Babe', 'realizes', 'making', 'become', 'greatest', 'sheep', 'pig', 'time', 'Farmer', 'Hogget', 'know', 'With', 'help', 'sheep', 'dog', 'Babe', 'learns', 'pig', 'anything', 'want']
['Babe', 'little', 'pig', 'quite', 'know', 'place', 'world', 'With', 'bunch', 'odd', 'friend', 'like', 'Ferdinand', 'duck', 'think', 'rooster', 'Fly', 'dog', 'call', 'mom', 'Babe', 'realizes', 'making', 'become', 'greatest', 'sheep', 'pig', 'time', 'Farmer', 'Hogget', 'know', 'With', 'help', 'sheep',