In [7]:
import sys
sys.path.append("..")
from reader import generic_reader

import numpy as np
import pandas as pd

from sklearn.cluster import  MeanShift, estimate_bandwidth
from scipy.spatial.distance import cdist
from sklearn import metrics
import ast
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
import matplotlib.animation

In [8]:
df = generic_reader.read_csv_file_to_data_frame("movie_data/danish_movies_cleaned.csv")

In [9]:
df.head

<bound method NDFrame.head of            id                                              title release_date  \
0      195981                                      The Execution   1903-05-22   
1      778036                                      Female Rivals   1906-01-01   
2     1341178                            The Spiritist Faustinus   1906-02-15   
3      237871                        Fishing Life in the Nordics   1906-09-15   
4      232590                      The Anarchist's Mother-in-Law   1906-10-23   
...       ...                                                ...          ...   
3985  1407803                       Verdensmænd - Bobos surprise   2024-12-26   
3986  1410638                                 Gud bevare Danmark   2024-12-30   
3987  1409581                                Frank Hvam - Nobody   2024-12-30   
3988  1412034  Dan Andersen - Nedsat Hørelse, Nedsat Sædkvali...   2024-12-31   
3989  1410006                       Jan Hellesøe - Forudsigelser   2024-12-31  

In [10]:
df.shape

(3990, 8)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3990 entries, 0 to 3989
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               3990 non-null   int64  
 1   title            3990 non-null   object 
 2   release_date     3990 non-null   object 
 3   runtime          3990 non-null   int64  
 4   vote_average     3990 non-null   float64
 5   vote_count       3990 non-null   int64  
 6   genre_ids        3990 non-null   object 
 7   cast_person_ids  3990 non-null   object 
dtypes: float64(1), int64(3), object(4)
memory usage: 249.5+ KB


In [None]:
#Husk at dobbelt tjek koden når opgaven samles, jeg tror ikke der er behov for genre_ids og convert release_date

#Transforming the genre_ids to a python list
df['genre_ids'] = df['genre_ids'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

#Looping through all movie to find the unique ids
all_genres = sorted({g for sublist in df['genre_ids'] for g in sublist})

#Making a one-hot dataframe with as many rows and the df (3990 rows as seen above), all rows have a value of 0
genre_df = pd.DataFrame(0, index=df.index,
                        columns=[f'genre_{g}' for g in all_genres])

#Looping through all the genre rows and changing the 0 to 1 cells which match the genre id
for i, genres in enumerate(df['genre_ids']):
    genre_df.loc[i, [f'genre_{g}' for g in genres]] = 1


df_encoded = pd.concat([df, genre_df], axis=1)

#Changing release_date to a date-time format and making a new column, release_year, and picks year from the date time as an int
df_encoded['release_date'] = pd.to_datetime(df_encoded['release_date'], errors='coerce')
df_encoded['release_year'] = df_encoded['release_date'].dt.year

#Scaling the years to not dominate the clustering 
numerical_features = ['release_year']
X_num = df_encoded[numerical_features]
X_num_scaled = (X_num - X_num.mean()) / X_num.std(ddof=0)
X = np.hstack([X_num_scaled.to_numpy(), genre_df.values])

In [13]:
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
bandwidth 

np.float64(1.1986095585447263)

In [14]:
msmodel = MeanShift(bandwidth=bandwidth, bin_seeding=True)
msmodel.fit(X)

In [15]:
labels = msmodel.labels_
labels

array([1, 1, 1, ..., 0, 0, 0])

In [16]:
labels_unique = np.unique(labels)
labels_unique

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
n_clusters_ = len(labels_unique)
n_clusters_

10

In [18]:
cluster_centers = msmodel.cluster_centers_
cluster_centers

array([[ 6.41869190e-01,  9.54653938e-04,  9.54653938e-04,
         3.02147971e-01,  1.57517900e-02,  2.38663484e-03,
         2.56801909e-01,  0.00000000e+00,  0.00000000e+00,
         1.14558473e-02,  4.29594272e-03,  4.29594272e-03,
         1.43198091e-03,  2.52983294e-02,  2.38663484e-03,
         1.71837709e-02,  4.77326969e-04,  4.77326969e-04],
       [-1.65581279e+00,  1.43472023e-03,  0.00000000e+00,
         2.66857963e-01,  4.30416069e-03,  1.43472023e-03,
         3.15638451e-01,  1.43472023e-03,  0.00000000e+00,
         2.86944046e-03,  4.30416069e-03,  0.00000000e+00,
         2.86944046e-03,  0.00000000e+00,  1.43472023e-03,
         1.00430416e-02,  1.43472023e-03,  0.00000000e+00],
       [ 5.72106965e-01,  0.00000000e+00,  6.66666667e-01,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.66666667e-01,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e

In [19]:
Y = msmodel.predict(X)
Y

array([1, 1, 1, ..., 0, 0, 0])

In [20]:
len(msmodel.labels_)

3990

In [21]:
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print(f"Estimated number of clusters = {n_clusters_}")

Estimated number of clusters = 10
