In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import re

In [2]:
def filter_correct_id(word): # MEILLEURE VERSION ICI
    if not isinstance(word, str) or re.fullmatch(r'[0-9]+', word):
        return word
    return "wrong_id"

In [42]:
movies = pd.read_csv("movies_metadata.csv")
ratings = pd.read_csv("ratings_small.csv")

In [43]:
movies = movies[~movies.id.duplicated()]
movies.id = movies.id.apply(filter_correct_id)
movies = movies[movies.id != "wrong_id"]
movies.id = movies.id.astype('int64')
movies.budget = movies.budget.astype('int64')

In [44]:
movies = movies.rename(columns={'id' : 'movieId'})
ratings = ratings.movieId.drop_duplicates()
movies = movies.merge(ratings, how='inner')

On sélectionne les attributs de films qui semblent pertinents pour différencier les films sur leur contenu.
Ces choix sont arbitraires et on pourra être amenés à réfléchir dessus et à les modifier.

In [46]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'movieId', 'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [47]:
cluster_features = movies[['genres', 'release_date', 'production_countries', 'original_language', 'runtime', 'budget']]
cluster_features = cluster_features.dropna()

In [50]:
cluster_features.head()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",1995-12-15,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,170.0,60000000
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",1995-11-16,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,130.0,58000000
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",1995-12-22,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,119.0,98000000
3,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1995-11-22,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,178.0,52000000
4,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1995-12-13,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,136.0,16500000


On modifie les attributs afin de pouvoir facilement comparer les films

In [51]:
def vectorize_genres(genres):
    if isinstance(genres, str):
        pattern = re.compile(r"'id': [0-9]*")
        return np.array([int(w[6:]) for w in pattern.findall(genres)])
    return genres

In [52]:
cluster_features.genres = cluster_features.genres.apply(vectorize_genres)

In [53]:
cluster_features.head()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
0,"[28, 80, 18, 53]",1995-12-15,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,170.0,60000000
1,"[12, 28, 53]",1995-11-16,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,130.0,58000000
2,"[28, 12]",1995-12-22,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,119.0,98000000
3,"[18, 80]",1995-11-22,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,178.0,52000000
4,"[18, 10749]",1995-12-13,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,136.0,16500000


In [54]:
def simplify_date(date):
    if isinstance(date, str):
        return int(date[:4])
    return date

In [55]:
cluster_features.release_date = cluster_features.release_date.apply(simplify_date)

In [56]:
cluster_features.head()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
0,"[28, 80, 18, 53]",1995,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,170.0,60000000
1,"[12, 28, 53]",1995,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,130.0,58000000
2,"[28, 12]",1995,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,119.0,98000000
3,"[18, 80]",1995,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,178.0,52000000
4,"[18, 10749]",1995,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,136.0,16500000


In [57]:
def simplify_countries(countries):
    if isinstance(countries, str):
        pattern = re.compile(r"'iso_3166_1': ...")
        return [w[15:] for w in pattern.findall(countries)]
    return genres

In [58]:
cluster_features.production_countries = cluster_features.production_countries.apply(simplify_countries)

In [134]:
cluster_features.tail()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
2825,[18],2009,[ID],en,121.0,0
2826,"[27, 878]",1967,[GB],en,92.0,0
2827,"[10749, 18]",2007,[RU],ru,97.0,0
2828,"[35, 10749]",2012,[RU],ru,91.0,0
2829,"[14, 28, 53]",1900,[FR],fr,1.0,0


On va maintenant définir une distance sur les films

In [60]:
MAX_YEAR = max(cluster_features.release_date) - min(cluster_features.release_date)

In [61]:
cluster_features.runtime.describe()

count    2826.000000
mean      105.440906
std        27.537345
min         0.000000
25%        92.000000
50%       103.000000
75%       118.000000
max       320.000000
Name: runtime, dtype: float64

In [62]:
cluster_features.budget.describe()

count    2.826000e+03
mean     1.396692e+07
std      3.119462e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.300000e+07
max      3.800000e+08
Name: budget, dtype: float64

In [63]:
def similarity_vect(m1, m2):
    simi_vect = m1.copy()
    g1, g2 = m1.genres, m2.genres
    if list(g1) and list(g2):
        g3 = np.append(g1, g2)
        simi_vect.genres = 2 * len(np.unique(g3)) / len(g3) - 1 # CHANGER PEUT ETRE
    else:
        simi_vect.genres = 1
    
    simi_vect.release_date = abs(m1.release_date - m2.release_date) / MAX_YEAR
    
    p1, p2 = m1.production_countries, m2.production_countries
    if list(p1) and list(p2):
        p3 = []
        p3.extend(p1)
        p3.extend(p2)
        simi_vect.production_countries = 2 * len(np.unique(p3)) / len(p3) - 1 # Changer aussi peut etre
    else:
        simi_vect.production_countries = 1
    
    simi_vect.original_language = int(not m1.original_language == m2.original_language)
    
    r1, r2 = m1.runtime, m2.runtime
    if r1 == 0 or r2 == 0:
        simi_vect.runtime = 1 # Changer peut etre
    else:
        simi_vect.runtime = min(1, abs(r1 - r2) / 55) # Choix 2 * std
    
    b1, b2 = m1.budget, m2.budget
    if b1 == 0 or b2 == 0:
        simi_vect.budget = 1 # Changer peut etre
    else:
        simi_vect.budget = min(1, abs(b1 - b2) / (3 * 1.1e+07)) # La aussi (3 * ecart type ~ arbitraire)
    
    return simi_vect

In [64]:
def weight_vect(simi_vect, w_gen=3, w_rel=2, w_pro=1, w_ori=1, w_run=1, w_bud=1):
    weighted_vect = simi_vect.copy()
    weighted_vect.genres *= w_gen
    weighted_vect.release_date *= w_rel
    weighted_vect.production_countries *= w_pro
    weighted_vect.original_language *= w_ori
    weighted_vect.runtime *= w_run
    weighted_vect.budget *= w_bud
    
    return weighted_vect

In [65]:
def movie_distance(m1, m2, distance=lambda x: np.linalg.norm(x, ord=1)):
    simi_vect = similarity_vect(m1, m2)
    weighted_vect = weight_vect(simi_vect)
    return distance(weighted_vect)

In [84]:
def compute_dist_matrix(clu_fea):
    dim = len(clu_fea)
    dist_mat = 100 * np.eye((dim))
    for i in range(1, dim):
        for j in range(0, i):
            dist_mat[i, j] = movie_distance(clu_fea.iloc[i], clu_fea.iloc[j])
    return dist_mat + dist_mat.T

In [76]:
#m = compute_dist_matrix(cluster_features) # environ 1 heure

In [291]:
def clusterize(dist_mat, clu_fea):
    total_nb_movies = dist_mat.shape[0]
    for cpt in range(total_nb_movies - 1):
        index1 = np.argmin(dist_mat.min(0))
        index2 = np.argmin(dist_mat[index1])
        size_mat = dist_mat.shape[0]
        new_dist_values = np.zeros(size_mat)
        for i in range(size_mat):
            new_dist_values[i] = min(movie_distance(clu_fea.iloc[i], clu_fea.iloc[index1]),
                                     movie_distance(clu_fea.iloc[i], clu_fea.iloc[index2])) # min --> Choix modifiable
        dist_mat[index1] = new_dist_values
        dist_mat[:, index1] = new_dist_values
        dist_mat[index1, index1] = 200
        dist_mat = np.delete(dist_mat, index2, 0)
        dist_mat = np.delete(dist_mat, index2, 1)

In [287]:
clu_test = cluster_features.iloc[:4].copy()
dist_test = compute_dist_matrix(test)
clu_test

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
0,"[28, 80, 18, 53]",1995,[US],en,170.0,60000000
1,"[12, 28, 53]",1995,"[GB, US]",en,130.0,58000000
2,"[28, 12]",1995,"[FR, DE, IT, US]",en,119.0,98000000
3,"[18, 80]",1995,"[FR, US]",en,178.0,52000000


In [290]:
clusterize(dist_test, clu_test)

0
1
2
