In [20]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt, cm as cm

from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn import cluster, metrics, manifold
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from sklearn import decomposition
from sklearn.metrics import silhouette_samples, silhouette_score

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go

In [2]:
# Location of the input CSV file.
# NOTE(review): both paths are absolute and specific to one Windows account;
# the notebook will not run elsewhere without editing them.
_FICHIER = 'C:\\Users\\Toni\\Desktop\\pas_synchro\\p3_bdd_clean.csv'
# NOTE(review): presumably the folder where figures are saved - it is not used
# in the cells visible here; confirm against the rest of the notebook.
_DOSSIERTRAVAIL = 'C:\\Users\\Toni\\python\\python\\Projet_3\\images'

In [6]:
def count_word(data, ref_col, liste):
    """
    Count how often each keyword appears in a '|'-separated text column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to scan.
    ref_col : str
        Name of a string column whose cells look like ``'a|b|c'``.
    liste : iterable of hashable
        Keywords to count. A keyword that never occurs keeps a count of 0.
        Tokens of the column that are not in ``liste`` are ignored.

    Returns
    -------
    keyword_occurences : list of [keyword, count]
        Pairs sorted by decreasing count; keywords with equal counts keep the
        iteration order of ``liste``.
    keyword_count : dict
        Mapping keyword -> count, in the iteration order of ``liste``.
    """
    keyword_count = {word: 0 for word in liste}

    for liste_keywords in data[ref_col].str.split('|'):
        # A cell that could not be split (missing value such as NaN or None)
        # is not list-like and cannot be iterated: skip it.
        if not pd.api.types.is_list_like(liste_keywords):
            continue
        for word in liste_keywords:
            # Dict lookup instead of scanning ``liste`` for every token; it
            # also keeps working when ``liste`` is a one-shot iterator.
            if word in keyword_count:
                keyword_count[word] += 1

    # Convert the dictionary to a list so the keywords can be sorted by frequency.
    keyword_occurences = [[word, count] for word, count in keyword_count.items()]
    keyword_occurences.sort(key=lambda pair: pair[1], reverse=True)

    return keyword_occurences, keyword_count

def comptabiliser(data, valeur_cherchee):
    """
    List the distinct tokens of a '|'-separated column with their frequency.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to scan.
    valeur_cherchee : str
        Name of a string column whose cells look like ``'a|b|c'``.

    Returns
    -------
    list of [token, count]
        One pair per distinct token, sorted by decreasing count (as returned
        by ``count_word``). Tokens with equal counts are in alphabetical order.
    """
    # Collect every distinct token found in the column.
    listing = set()

    for mots in data[valeur_cherchee].str.split('|'):
        # Cells that could not be split (missing values) are not list-like.
        if not pd.api.types.is_list_like(mots):
            continue
        # In-place update: no new set is built for every row.
        listing.update(mots)

    # Count the occurrences of these tokens. The set is sorted first so the
    # order of tied tokens does not depend on set iteration order.
    listing_compte, _ = count_word(data, valeur_cherchee, sorted(listing))

    return listing_compte

def affichage_kmeans(datanum, vmin, vmax, random_state=None):
    """
    Run K-means for each cluster count in [vmin, vmax] and plot every result
    on a 2-D t-SNE map of the standardised data.

    Parameters
    ----------
    datanum : pandas.DataFrame
        Features to cluster, one row per sample. Side effect: a 'labels'
        column is written to this frame; after the call it holds the labels
        of the last run (n_clusters = vmax). An existing 'labels' column is
        treated as a previous output and is not used as a feature.
    vmin, vmax : int
        Inclusive bounds of the numbers of clusters to try.
    random_state : int or None, optional
        Seed forwarded to both TSNE and KMeans. The default (None) leaves
        them unseeded, so the figures differ from one run to the next.

    Returns
    -------
    None
        One figure per cluster count is displayed with ``plt.show()``.
    """
    # A 'labels' column left by an earlier call must not be scaled and
    # clustered as if it were a feature.
    features = datanum.drop('labels', axis=1, errors='ignore')

    # The data must be scaled before the dimensionality reduction.
    std_scale = preprocessing.StandardScaler().fit(features)
    X_scaled = std_scale.transform(features)

    # t-SNE reduction
    print("Computing t-SNE embedding")
    if not range(vmin, vmax + 1):
        # Nothing to plot: skip the expensive embedding.
        return

    # NOTE(review): recent scikit-learn releases rename n_iter to max_iter -
    # confirm against the installed version.
    tsne = manifold.TSNE(n_components=2, perplexity=50, n_iter=500,
                         random_state=random_state)
    # The embedding depends only on X_scaled, not on the number of clusters:
    # compute it once and reuse it, so all figures share the same map.
    x_tsne = tsne.fit_transform(X_scaled)
    vis_x = x_tsne[:, 0]
    vis_y = x_tsne[:, 1]

    for i in range(vmin, vmax + 1):
        # Build i clusters from the scaled data.
        kmeans = KMeans(n_clusters=i, random_state=random_state)
        kmeans.fit(X_scaled)
        # Column holding the cluster assigned to each row by K-means.
        datanum['labels'] = kmeans.labels_

        # Scatter plot of the embedding, coloured by cluster.
        plt.figure(figsize=(10, 6))
        plt.scatter(vis_x, vis_y, c=datanum['labels'], cmap=plt.cm.get_cmap("jet", i))
        plt.colorbar(ticks=range(i))
        plt.grid(True)
        Titre = 'Affichage du clustering déduit par le Kmeans n=%s ' % i
        plt.title(Titre)
        plt.show()

def transpose_bool(data, colon):
    """
    Expand a '|'-separated categorical column into one 0/1 column per token.

    The frame is modified in place:
    * missing values of ``data[colon]`` are replaced by the string 'vide';
    * for every distinct token of the column, a column named after the token
      is added (or overwritten if it already exists) holding 1 where the row
      contains that token and 0 elsewhere.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify.
    colon : str
        Name of the '|'-separated string column to expand.

    Returns
    -------
    None
    """
    # Replace missing values. Assigning the result back is reliable, whereas
    # an in-place fillna on ``data[colon]`` may only change a temporary copy.
    data[colon] = data[colon].fillna('vide')

    # Distinct tokens of the column, most frequent first.
    listing = comptabiliser(data, colon)

    # Split once and reuse for every token. Iterating the split values also
    # avoids label-based ``data[colon][i]`` lookups, which fail when the index
    # is not 0..n-1.
    tokens_par_ligne = data[colon].str.split('|')

    for mot, _compte in listing:
        # Exact token membership: a substring test on the raw string would
        # let a token such as 'Music' also flag rows that only hold 'Musical'.
        data[mot] = pd.Series(
            [1 if pd.api.types.is_list_like(mots) and mot in mots else 0
             for mots in tokens_par_ligne],
            index=data.index,
            dtype='int64',
        )

In [8]:
# Load the dataset and discard its 'Unnamed: 0' column.
data = pd.read_csv(_FICHIER, encoding="ISO-8859-1")
data = data.drop('Unnamed: 0', axis=1)

datanum = data.copy()
# NOTE: not the last expression of the cell, so this summary is computed but
# never displayed.
datanum.describe()

# Missing data: number of nulls per column and percentage of filled cells.
missing_data = (
    datanum.isnull()
    .sum(axis=0)
    .rename_axis('column_name')
    .reset_index(name='missing_count')
)
missing_data['filling_factor'] = (len(datanum) - missing_data['missing_count']) / len(datanum) * 100
# NOTE: the sorted table is not assigned; missing_data itself stays unsorted.
missing_data.sort_values('filling_factor').reset_index(drop=True)

# Expand each categorical column into one 0/1 column per value.
transpose_bool(datanum, 'genres')
transpose_bool(datanum, 'language')
transpose_bool(datanum, 'country')
transpose_bool(datanum, 'color')
transpose_bool(datanum, 'content_rating')

# Remove everything that is not numeric.
datanum = datanum.drop(
    [
        'color', 'director_name', 'actor_1_name', 'genres', 'movie_title',
        'actor_2_name', 'actor_3_name',
        'plot_keywords', 'movie_imdb_link', 'language', 'country', 'content_rating',
    ],
    axis=1,
)

# Remaining missing values become 0.
datanum = datanum.fillna(0)

In [9]:
# Numbers of clusters to evaluate - presumably consumed by a later cell that
# is not visible here; TODO confirm.
range_n_clusters = [2, 3, 4, 5, 6]

In [22]:
# Separate copy of the prepared frame, so changes made to X leave datanum untouched.
X = datanum.copy()

In [24]:
# Plain-text dump of the prepared frame (wide frames are wrapped across several column groups).
print(datanum)

      num_critic_for_reviews  duration  director_facebook_likes  \
0                      723.0     178.0                      0.0   
1                      302.0     169.0                    563.0   
2                      602.0     148.0                      0.0   
3                      813.0     164.0                  22000.0   
4                        0.0       0.0                    131.0   
5                      462.0     132.0                    475.0   
6                      392.0     156.0                      0.0   
7                      324.0     100.0                     15.0   
8                      635.0     141.0                      0.0   
9                      375.0     153.0                    282.0   
10                     673.0     183.0                      0.0   
11                     434.0     169.0                      0.0   
12                     403.0     106.0                    395.0   
13                     313.0     151.0                    563.