In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
# Read the three CSV tables from the current working directory, one DataFrame per file.
movies_data, ratings_data, tags_data = (
    pd.read_csv(csv_path)
    for csv_path in ('movies.csv', 'ratings.csv', 'tags.csv')
)

In [3]:
# Restrict the analysis to movies and users that each have more than 15 ratings.
# Both counts are taken over the full ratings table, before any filtering.
ratings_per_movie = ratings_data['movieId'].value_counts()
popular_movies_ids = ratings_per_movie[ratings_per_movie > 15].index
popular_movies = movies_data[movies_data['movieId'].isin(popular_movies_ids)]

ratings_per_user = ratings_data['userId'].value_counts()
active_users_ids = ratings_per_user[ratings_per_user > 15].index

# Keep a rating only when both its movie and its user passed the thresholds.
is_popular_movie = ratings_data['movieId'].isin(popular_movies_ids)
is_active_user = ratings_data['userId'].isin(active_users_ids)
ratings_reduced = ratings_data[is_popular_movie & is_active_user]

In [4]:
# Build a user x genre table: each column holds, per user, the mean rating
# (rounded to 2 decimals) over the popular movies whose `genres` string
# contains that genre name. Users with no rating in a genre get NaN there.
genres = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
data = pd.DataFrame()
for genre in genres:
    in_genre = popular_movies['genres'].str.contains(genre)
    genre_movie_ids = popular_movies.loc[in_genre, 'movieId']
    genre_ratings = ratings_reduced[ratings_reduced['movieId'].isin(genre_movie_ids)]
    user_genre_mean = genre_ratings.groupby('userId')['rating'].mean().round(2)
    # Columns are appended one genre at a time; users first seen in a later
    # genre are added as new rows at the bottom of the table.
    data = pd.concat([data, user_genre_mean], axis=1)
# Every appended column is named 'rating'; relabel them with the genre names.
data.columns = genres

In [11]:
# A user with no ratings in a genre has NaN in that column; replace those gaps
# with 0 so the table is fully numeric before it is passed to KMeans.
# NOTE(review): this makes "never rated" look like an average rating of 0 —
# confirm that is the intended encoding for clustering.
data = data.fillna(value=0)
print(data)

        Action  Adventure  Animation  Children  Comedy  Crime  Documentary  \
1         4.12       3.73       4.00      3.83    3.87   4.06         2.00   
2         3.70       3.91       3.62      3.66    3.32   3.14         0.00   
3         3.64       3.69       3.98      3.71    3.45   3.89         3.17   
4         3.19       3.07       3.48      3.21    3.61   3.97         4.38   
5         3.72       3.86       3.75      3.33    3.57   4.14         0.00   
...        ...        ...        ...       ...     ...    ...          ...   
152497    0.00       0.00       0.00      0.00    4.00   0.00         0.00   
153119    0.00       0.00       0.00      0.00    4.50   0.00         0.00   
156909    0.00       0.00       0.00      0.00    5.00   0.00         0.00   
160453    0.00       0.00       0.00      0.00    3.92   3.83         0.00   
156588    0.00       0.00       0.00      0.00    0.00   3.50         0.00   

        Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Ro

In [12]:
# Group users by taste: cluster the rows of the user x genre table `data`.
# random_state is fixed so the cluster assignment is reproducible.
k_means = KMeans(n_clusters=5, random_state=0)
labels = k_means.fit_predict(data)

# clusters[i] lists the index labels of `data` (the userIds) assigned to
# cluster i. The number of groups is read back from the fitted estimator, so
# changing n_clusters above is the only edit needed to change the cluster count.
clusters = [
    list(data.index[labels == cluster_label])
    for cluster_label in range(k_means.n_clusters)
]

In [21]:
# For every user cluster, print the genres with the highest and the lowest
# median of per-user mean ratings (per-user means rounded to 2 decimals).
#
# This cell deliberately does not rebind the notebook-level `data` or `genres`:
# the clustering cell reads `data`, so overwriting it here would make a re-run
# of that cell fit on the wrong (NaN-containing, single-cluster) table.

# Which movies belong to each genre does not depend on the cluster, so the
# lookup is resolved once instead of once per cluster.
genre_movie_ids = {
    genre: popular_movies.loc[popular_movies['genres'].str.contains(genre), 'movieId']
    for genre in genres
}

for cluster in clusters:
    # Ratings given by the users of this cluster only.
    cluster_ratings = ratings_reduced[ratings_reduced['userId'].isin(cluster)]

    genre_medians = {}
    for genre, movie_ids in genre_movie_ids.items():
        per_user_mean = (
            cluster_ratings[cluster_ratings['movieId'].isin(movie_ids)]
            .groupby('userId')['rating']
            .mean()
            .round(2)
        )
        # Users with no rating in this genre are simply absent from
        # per_user_mean, so they do not influence the median.
        genre_medians[genre] = per_user_mean.median()

    worst = min(genre_medians, key=genre_medians.get)
    best = max(genre_medians, key=genre_medians.get)
    print(f'best: {best} ({genre_medians[best]}), worst: {worst} ({genre_medians[worst]})')

best: Documentary (4.0), worst: Horror (3.5)
best: War (3.88), worst: Animation (2.0)
best: Animation (4.0), worst: Western (1.5)
best: War (4.0), worst: Film-Noir (1.5)
best: Film-Noir (4.0), worst: Documentary (1.17)
