In [40]:
import sys
sys.path.append("..")
from reader import generic_reader

import numpy as np
import pandas as pd

from sklearn.cluster import  MeanShift, estimate_bandwidth
from scipy.spatial.distance import cdist
from sklearn import metrics
import ast
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
import matplotlib.animation

In [41]:
df = generic_reader.read_csv_file_to_data_frame("movie_data/danish_movies.csv")

In [42]:
print(df.columns)

Index(['id', 'title', 'release_date', 'runtime', 'vote_average', 'vote_count',
       'genre_ids', 'cast_person_ids'],
      dtype='object')


In [43]:
df.head

<bound method NDFrame.head of            id                                title release_date  runtime  \
0       77133                            The Abyss   1910-09-12       41   
1      786475            In the Hands of Imposters   1911-01-23       47   
2      292179          Temptations of a Great City   1911-03-06       41   
3      629224                      A Homeless Bird   1911-05-08       42   
4      283512          The Girl Behind the Counter   1911-08-12       52   
...       ...                                  ...          ...      ...   
1618  1398305             BROS - Så Blev der Mørkt   2024-12-06       74   
1619  1263071                          Bloodsucker   2024-12-17       94   
1620  1407564              Martin Kanstrup - TRÆLS   2024-12-21      119   
1621  1408690  Christopher - A Beautiful Live Tour   2024-12-22       72   
1622  1409581                  Frank Hvam - Nobody   2024-12-30       56   

      vote_average  vote_count        genre_ids  \
0     

In [44]:
df.shape

(1623, 8)

In [59]:
df.describe().round(2)

Unnamed: 0,id,runtime,vote_average,vote_count
count,1623.0,1623.0,1623.0,1623.0
mean,367510.86,90.9,4.98,30.18
std,380150.0,27.16,2.53,175.2
min,102.0,40.0,0.0,0.0
25%,56479.0,81.0,4.3,2.0
50%,238264.0,90.0,5.7,4.0
75%,542817.5,100.0,6.53,11.0
max,1538957.0,579.0,10.0,4472.0


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1623 entries, 0 to 1622
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1623 non-null   int64  
 1   title            1623 non-null   object 
 2   release_date     1623 non-null   object 
 3   runtime          1623 non-null   int64  
 4   vote_average     1623 non-null   float64
 5   vote_count       1623 non-null   int64  
 6   genre_ids        1623 non-null   object 
 7   cast_person_ids  1623 non-null   object 
dtypes: float64(1), int64(3), object(4)
memory usage: 101.6+ KB


Converting 

In [None]:
#Husk at dobbelt tjek koden når opgaven samles, jeg tror ikke der er behov for genre_ids og convert release_date

#Transforming the genre_ids to a python list
df['genre_ids'] = df['genre_ids'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

#Looping through all movie ids to find the unique ids
all_genres = sorted({g for sublist in df['genre_ids'] for g in sublist})

#Making a one-hot dataframe with as many rows and the df (3990 rows as seen above), all rows have a value of 0
genre_df = pd.DataFrame(0, index=df.index,
                        columns=[f'genre_{g}' for g in all_genres])

#Looping through all the genre rows and changing the 0 to 1 cells which match the genre id
for i, genres in enumerate(df['genre_ids']):
    genre_df.loc[i, [f'genre_{g}' for g in genres]] = 1


df_encoded = pd.concat([df, genre_df], axis=1)

#Changing release_date to a date-time format and making a new column, release_year, and picks year from the date time as an int
df_encoded['release_date'] = pd.to_datetime(df_encoded['release_date'], errors='coerce')
df_encoded['release_year'] = df_encoded['release_date'].dt.year

#Scaling the years to not dominate the clustering 
numerical_features = ['release_year', 'runtime']
X_num = df_encoded[numerical_features]
X_num_scaled = (X_num - X_num.mean()) / X_num.std(ddof=0)
X = np.hstack([X_num_scaled.to_numpy(), genre_df.values])

In [58]:
X_num_scaled.mean(), X_num_scaled.std(ddof=0)

(release_year    2.661799e-15
 runtime         2.539216e-16
 dtype: float64,
 release_year    1.0
 runtime         1.0
 dtype: float64)

In [47]:
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
bandwidth 

np.float64(1.7103733502032255)

In [48]:
msmodel = MeanShift(bandwidth=bandwidth, bin_seeding=True)
msmodel.fit(X)

In [49]:
labels = msmodel.labels_
labels

array([6, 6, 6, ..., 0, 0, 0])

In [50]:
labels_unique = np.unique(labels)
labels_unique

array([0, 1, 2, 3, 4, 5, 6])

In [51]:
n_clusters_ = len(labels_unique)
n_clusters_

7

In [52]:
cluster_centers = msmodel.cluster_centers_
cluster_centers

array([[ 3.87737531e-01,  1.30680907e-02,  1.61290323e-02,
         5.37634409e-03,  4.64157706e-01,  3.49462366e-02,
         2.32974910e-02,  4.98207885e-01,  8.06451613e-03,
         2.68817204e-03,  7.07885305e-02,  6.27240143e-02,
         8.96057348e-03,  8.96057348e-03,  2.24014337e-02,
         7.88530466e-02,  1.67562724e-01,  1.07526882e-02,
         8.06451613e-03],
       [ 7.27126538e-01,  7.08274674e+00,  0.00000000e+00,
         0.00000000e+00,  2.00000000e-01,  0.00000000e+00,
         0.00000000e+00,  8.00000000e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 2.79781603e-01,  6.87280217e+00,  0.00000000e+00,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  5.00000000e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
    

In [53]:
Y = msmodel.predict(X)
Y

array([6, 6, 6, ..., 0, 0, 0])

In [54]:
len(msmodel.labels_)

1623

In [55]:
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print(f"Estimated number of clusters = {n_clusters_}")

Estimated number of clusters = 7
