In [699]:
import sys
sys.path.append("..")
from reader import generic_reader

import numpy as np
import pandas as pd

from sklearn.cluster import  MeanShift, estimate_bandwidth
from scipy.spatial.distance import cdist
from sklearn import metrics

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
import matplotlib.animation

In [700]:
data = generic_reader.read_csv_file_to_data_frame("movie_data/danish_movies.csv")

In [701]:
data.isnull().sum()

adult                        0
backdrop_path             3175
budget                       0
homepage                  4889
id                           0
imdb_id                   1445
origin_country               0
original_language            0
original_title               0
overview                  1239
popularity                   0
poster_path               1045
production_countries         0
release_date                 0
revenue                      0
runtime                      0
spoken_languages             0
status                       0
tagline                   4825
title                        0
video                        0
vote_average                 0
vote_count                   0
genre_ids                    0
production_company_ids       0
collection_id             4871
cast_person_ids              0
cast_credit_ids              0
crew_person_ids              0
crew_credit_ids              0
dtype: int64

In [702]:
# Droppping columns which won't be neeeded, delete when merging with cleaned data
columns_to_drop = ['adult', 'backdrop_path', 'homepage', 'id', 'imdb_id', 'origin_country', 'original_language', 'original_title', 
                   'overview', 'poster_path', 'production_countries', 'production_company_ids', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 
                   'video', 'production_company_ids', 'collection_id', 'cast_credit_ids', 'crew_credit_ids', 'budget', 'revenue', 'vote_count']
data = data.drop(columns=columns_to_drop)

In [703]:
data.head()

Unnamed: 0,popularity,release_date,vote_average,genre_ids,cast_person_ids,crew_person_ids
0,0.0725,1897-01-01,5.3,[99],[2452695],"[1171313, 1171313]"
1,0.0671,1897-12-25,3.0,[99],[],[1171313]
2,0.0261,1899-05-20,4.0,[99],"[4658231, 4658234]",[1171313]
3,0.0071,1899-09-02,4.5,[99],[],[1171313]
4,0.0363,1899-11-25,4.8,[99],[],[1171313]


In [704]:
data.shape

(5134, 6)

In [705]:
# Make sure we operate on the same dataframe
# Remove 99 from genre_ids lists
df['genre_ids'] = df['genre_ids'].apply(lambda x: [g for g in x if g != 99])

# One-hot encode remaining genres
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_onehot = mlb.fit_transform(df['genre_ids'])
genre_df = pd.DataFrame(genre_onehot, columns=[f'genre_{g}' for g in mlb.classes_])

# Merge back to the main dataframe
df_encoded = pd.concat([df, genre_df], axis=1)

# Convert release_date to datetime and extract year
df_encoded['release_date'] = pd.to_datetime(df_encoded['release_date'], errors='coerce')
df_encoded['release_year'] = df_encoded['release_date'].dt.year

# Numerical features (use release_year instead of release_date)
numerical_features = ['release_year', 'vote_average', 'popularity']
X_num = df_encoded[numerical_features]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

# Combine numerical + genre one-hot
import numpy as np
X = np.hstack([X_num_scaled, genre_df.values])


In [706]:
bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=500)
bandwidth 

np.float64(1.779335957811354)

In [707]:
msmodel = MeanShift(bandwidth=bandwidth, bin_seeding=True)
msmodel.fit(X)

In [708]:
labels = msmodel.labels_
labels

array([2, 2, 2, ..., 1, 0, 0])

In [None]:
labels_unique = np.unique(labels)
labels_unique

array([0, 1, 2, 3, 4])

In [None]:
n_clusters_ = len(labels_unique)
n_clusters_

5

In [711]:
cluster_centers = msmodel.cluster_centers_
cluster_centers

array([[ 5.57725144e-01, -8.33869240e-01, -4.83329838e-01,
         0.00000000e+00,  0.00000000e+00,  3.38218715e-03,
         1.64036077e-01,  1.24013529e-02,  9.75197294e-02,
         1.12739572e-03,  9.75197294e-02,  2.31116122e-02,
         1.57835400e-02,  1.45997745e-01,  2.94250282e-01,
         1.00000000e+00,  1.00000000e+00],
       [ 4.49259857e-01,  1.02543736e+00, -1.85759060e-01,
         0.00000000e+00,  0.00000000e+00,  5.90551181e-03,
         3.04133858e-01,  8.85826772e-03,  3.11023622e-01,
         0.00000000e+00,  3.11023622e-01,  1.47637795e-02,
         8.85826772e-03,  2.99212598e-01,  1.99803150e-01,
         1.00000000e+00,  1.00000000e+00],
       [-1.49737727e+00, -8.10854112e-01, -4.71676770e-01,
         0.00000000e+00,  0.00000000e+00,  5.98802395e-03,
         1.96107784e-01,  4.49101796e-03,  7.33532934e-02,
         0.00000000e+00,  7.33532934e-02,  3.14371257e-02,
         4.49101796e-03,  1.70658683e-01,  2.55988024e-01,
         1.00000000e+00,  1.0

In [712]:
Y = msmodel.predict(X)
Y

array([2, 2, 2, ..., 1, 0, 0])

In [713]:
len(msmodel.labels_)

5134

In [None]:
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print(f"Estimated number of clusters = {n_clusters_}")

Estimated number of clusters = 5
