In [2]:
import datetime
import json
import requests
import itertools
import pickle

import pandas as pd
import numpy as np 
from scipy.sparse import hstack

from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder

from elasticsearch import Elasticsearch

In [3]:
df = pd.read_csv('movies.csv')
df['title'] = df['title'].map(lambda x: x.strip())
df['genres'] = df['genres'].map(lambda x: x.split('|'))
# df['year'] = df['title'].map(lambda x: x[-6:][1:-1])
df = df.astype({'title': 'str'})

df_movies = df

df = pd.read_csv('ratings.csv')
df['datetime'] = df['timestamp'].map(lambda x: datetime.datetime.fromtimestamp(x))

df_ratings = df

In [4]:
userIds = df_ratings['userId'].unique()
movieIds = df_ratings['movieId'].unique()

ratings = df_ratings['rating'].unique()

genres = set()

movie_genre_map = {}

for idx, row in df_movies.iterrows():
    movie_genre_map[row['movieId']] = row['genres']
    genres.update(row['genres'])
genres

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [5]:
user_genre_map = {u: {g: 0 for g in genres} for u in userIds}
user_genre_count = {u: {g: 0 for g in genres} for u in userIds}

for idx, row in df_ratings.iterrows():
    _genres = movie_genre_map[row['movieId']]
    r = row['rating']
    u = row['userId']
    for g in genres:
        user_genre_count[u][g] += 1
        user_genre_map[u][g] += r

for u in userIds:
    for g in genres:
        user_genre_map[u][g] = user_genre_map[u][g] / user_genre_count[u][g]

In [14]:
X = np.array(list(user_genre_map.keys()))
X.shape

(671,)

In [22]:
[x for x in list(user_genre_map.values())]

or': 4.14,
  'Drama': 4.14,
  'Comedy': 4.14,
  'Mystery': 4.14,
  'Film-Noir': 4.14,
  'Animation': 4.14,
  'War': 4.14,
  'Western': 4.14,
  'Thriller': 4.14,
  'Documentary': 4.14,
  'IMAX': 4.14,
  'Fantasy': 4.14,
  'Sci-Fi': 4.14,
  'Musical': 4.14,
  'Action': 4.14},
 {'Crime': 3.0,
  'Children': 3.0,
  '(no genres listed)': 3.0,
  'Romance': 3.0,
  'Adventure': 3.0,
  'Horror': 3.0,
  'Drama': 3.0,
  'Comedy': 3.0,
  'Mystery': 3.0,
  'Film-Noir': 3.0,
  'Animation': 3.0,
  'War': 3.0,
  'Western': 3.0,
  'Thriller': 3.0,
  'Documentary': 3.0,
  'IMAX': 3.0,
  'Fantasy': 3.0,
  'Sci-Fi': 3.0,
  'Musical': 3.0,
  'Action': 3.0},
 {'Crime': 3.381818181818182,
  'Children': 3.381818181818182,
  '(no genres listed)': 3.381818181818182,
  'Romance': 3.381818181818182,
  'Adventure': 3.381818181818182,
  'Horror': 3.381818181818182,
  'Drama': 3.381818181818182,
  'Comedy': 3.381818181818182,
  'Mystery': 3.381818181818182,
  'Film-Noir': 3.381818181818182,
  'Animation': 3.381818181

In [24]:
def flatten_dict(x):
    return [x[e] for e in genres]

Y = np.array([flatten_dict(x) for x in user_genre_map.values()])
Y.shape, len(genres)

((671, 20), 20)

In [27]:
user_one_hot = OneHotEncoder()

user_one_hot.fit(X.reshape(-1, 1))

X_kmeans = user_one_hot.transform(X.reshape(-1, 1))

X_kmeans.shape

(671, 671)

In [None]:
# optional hyperparam search
# scores = {}
# for k in range(len(genres), len(genres)*3):
#     kmeans = KMeans(n_clusters=k)
#     kmeans.fit(X_kmeans, Y)

#     scores[k] = kmeans.inertia_

# best k = min(scores.items(), key=operator.itemgetter(1))[0]

In [33]:
cluster_num = len(genres) * 2
kmeans = KMeans(n_clusters=cluster_num)
kmeans.fit(X_kmeans, Y.reshape(-1, 1))

KMeans(n_clusters=40)

In [34]:
clust_idx = kmeans.predict(X_kmeans)
clust_idx.shape

(671,)

In [39]:
cluster_user_map = {k: [] for k in range(0, cluster_num)}
user_cluster_map = {}
for cluster, u in zip(clust_idx, list(user_genre_map.keys())):
    user_cluster_map[u] = cluster
    cluster_user_map[cluster].append(u)

In [40]:
pickle.dump(cluster_user_map, open('cluster_user_map.pkl', 'wb'))
pickle.dump(user_cluster_map, open('user_cluster_map.pkl', 'wb'))