In [1]:
import pandas as pd
# Load the two input tables from CSV files located under the relative 'source/' folder.
# Both frames are module-level state that the cells below read.
movies = pd.read_csv('source/movies.csv')
ratings= pd.read_csv('source/ratings.csv')

In [2]:
# Turn the pipe-delimited genre string of every movie into a list of genre names.
# Only plain strings are split, so re-running this cell on the already converted
# column leaves the lists untouched (the .str accessor would turn them into NaN).
movies['genres'] = movies['genres'].apply(
    lambda cell: cell.split('|') if isinstance(cell, str) else cell
)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [3]:
def getAllGenres():
    """Return every distinct genre found in ``movies['genres']``.

    Reads the module-level ``movies`` frame; each entry of its ``genres``
    column is iterated as a collection of genre names. The result is a list
    ordered by first appearance.
    """
    # Dict keys are unique and keep insertion order, which gives the same
    # "first time seen" ordering as appending to a list after a membership test.
    firstSeen = {}
    for movieGenres in movies['genres']:
        for name in movieGenres:
            firstSeen.setdefault(name, None)
    return list(firstSeen)

In [4]:
# Build the genre vocabulary once; the functions defined below read this
# module-level list instead of recomputing it.
allGenres=getAllGenres()
allGenres

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Sci-Fi',
 'Documentary',
 'IMAX',
 'War',
 'Musical',
 'Western',
 'Film-Noir',
 '(no genres listed)']

In [5]:
def getMovieGenres(movieId):
    """Return the ``genres`` entry of the movie whose ``movieId`` matches.

    Parameters
    ----------
    movieId
        Identifier to look up in the ``movieId`` column of the module-level
        ``movies`` frame.

    Returns
    -------
    The value stored in ``movies['genres']`` for the first matching row.

    Raises
    ------
    IndexError
        If no row of ``movies`` carries the requested ``movieId``.
    """
    # Look the row up by value. Using ``movieId - 1`` as a row position would
    # only be correct while ids are contiguous and start at 1.
    matches = movies.loc[movies['movieId'] == movieId, 'genres']
    if matches.empty:
        raise IndexError('movieId {!r} not found in movies'.format(movieId))
    return matches.iloc[0]

In [6]:
def createUserRatingsDict(ratings):
    """Accumulate, per user and per genre, the sum and the number of ratings.

    Parameters
    ----------
    ratings
        DataFrame with at least the columns ``'userId'``, ``'movieId'`` and
        ``'rating'``; one row per rating.

    Returns
    -------
    dict
        ``{userId: {genre: {'value': <sum of ratings>, 'count': <n ratings>}}}``.
        Every genre of the module-level ``allGenres`` list is present for every
        user; genres the user never rated keep ``value == 0`` and ``count == 0``.

    Notes
    -----
    Keys are taken from ``iterrows`` rows, so they carry whatever dtype pandas
    gives the row Series; this is kept on purpose so the resulting index is
    unchanged for downstream cells.
    """
    userRatings = {}
    # Genres of each movie are resolved once and reused: getMovieGenres scans
    # the whole movies frame, which is wasteful to repeat for every rating row.
    genresByMovie = {}

    for _, row in ratings.iterrows():
        userId, movieId, rating = row['userId'], row['movieId'], row['rating']

        if movieId not in genresByMovie:
            genresByMovie[movieId] = getMovieGenres(movieId)

        if userId not in userRatings:
            # First rating seen for this user: start every genre at zero.
            userRatings[userId] = {
                genre: {'value': 0, 'count': 0} for genre in allGenres
            }

        userGenres = userRatings[userId]
        for genre in genresByMovie[movieId]:
            userGenres[genre]['value'] += rating
            userGenres[genre]['count'] += 1

    return userRatings
        

In [7]:
def setAverageRating(df):
    """Replace each rated ``{'value', 'count'}`` cell by its mean rating.

    Parameters
    ----------
    df
        Frame whose columns include every genre in the module-level
        ``allGenres`` list and whose cells are dicts with ``'value'`` (sum of
        ratings) and ``'count'`` (number of ratings).

    Returns
    -------
    A copy of ``df`` in which every cell with ``count != 0`` holds
    ``value / count``. Cells with ``count == 0`` are left as the original dict
    so that a later step can recognise them as missing. ``df`` itself is not
    modified.
    """
    userGenreRatings = df.copy(deep=True)  # work on a copy; the argument stays untouched
    for index, row in df.iterrows():
        for genre in allGenres:
            cell = row[genre]
            if cell['count'] != 0:
                # Single-step .at assignment: the chained form frame[col][idx] = v
                # may write to a temporary and is a no-op under Copy-on-Write.
                userGenreRatings.at[index, genre] = cell['value'] / cell['count']

    return userGenreRatings

In [8]:
def fillMissingDataPreProcessing(df,missing_data_completion='neutral'):
    """Fill the cells that still hold a dict placeholder (i.e. "not rated").

    Parameters
    ----------
    df
        Frame whose columns include every genre in the module-level
        ``allGenres`` list. A cell that is a ``dict`` is treated as missing;
        any other cell is treated as an existing rating.
    missing_data_completion
        Fill strategy:
        ``'neutral'`` -> 2.5, ``'out_of_range'`` -> -1, ``'zero'`` -> 0,
        ``'column_mean'`` -> mean of the rated cells of the same column.

    Returns
    -------
    A copy of ``df`` with every missing cell replaced; ``df`` is not modified.

    Raises
    ------
    KeyError
        If a cell needs filling and ``missing_data_completion`` is not one of
        the strategies above.
    ZeroDivisionError
        If ``'column_mean'`` is requested for a column without any rated cell.
    """
    constantFills = {'neutral': 2.5, 'out_of_range': -1, 'zero': 0}
    userGenreRatings = df.copy(deep=True)  # work on a copy; the argument stays untouched

    for genre in allGenres:
        column = userGenreRatings[genre]
        isMissing = column.map(lambda cell: isinstance(cell, dict)).astype(bool)
        if not isMissing.any():
            continue

        # The fill value is worked out once per column, and the mean only when
        # it is actually requested, so constant strategies never divide by zero.
        if missing_data_completion == 'column_mean':
            rated = column[~isMissing]
            fillValue = sum(rated) / len(rated)
        else:
            fillValue = constantFills[missing_data_completion]

        # Single-step .loc assignment instead of chained frame[col][idx] = v.
        userGenreRatings.loc[isMissing, genre] = fillValue

    return userGenreRatings
    
    

In [9]:
# Aggregate the raw ratings into per-user, per-genre {'value', 'count'} totals.
userRatings=createUserRatingsDict(ratings)
# The outer dict keys (user ids) become columns, so transpose to get
# one row per user and one column per genre.
userGenreRatingsDict=pd.DataFrame(userRatings).T

In [10]:
# Convert every rated {'value', 'count'} cell into its mean rating; cells with
# count == 0 stay as dicts and mark "no rating for this genre".
userGenreRatingsWithMissing=setAverageRating(userGenreRatingsDict)
userGenreRatingsWithMissing.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1.0,"{'value': 0, 'count': 0}",2.8,2.16667,2.0,2.5,2.0,2.5,"{'value': 0, 'count': 0}",2.57143,2.125,"{'value': 0, 'count': 0}",3.0,"{'value': 0, 'count': 0}",3.0,"{'value': 0, 'count': 0}",3.5,2.4,2.83333,2.0,3
2.0,"{'value': 0, 'count': 0}",3.31579,3.75,3.66667,3.5,3.09375,3.66667,"{'value': 0, 'count': 0}",3.61538,3.4,"{'value': 0, 'count': 0}",3.5,4,3.75,4,3.59091,3.8,3.33333,3.8,5
3.0,"{'value': 0, 'count': 0}",3.46875,3.3,2.875,2.7,3.61111,3.7,3.66667,3.92,3.0,"{'value': 0, 'count': 0}",3.375,2.875,2.5,3.75,3.65,3.14286,3.54545,4.16667,4
4.0,"{'value': 0, 'count': 0}",4.27419,4.33871,4.70833,4.63415,4.32955,4.15789,4,4.46429,4.56757,5,3.94444,5,4.83333,4.09091,4.5,4.25581,4.02273,4.75,3
5.0,"{'value': 0, 'count': 0}",4.0,3.825,3.9,3.89286,3.98182,3.45,3.66667,3.80769,3.75,"{'value': 0, 'count': 0}",3.875,4,4.25,3.33333,4.08108,4.0,3.625,4.0,"{'value': 0, 'count': 0}"


In [11]:
# Replace the remaining dict placeholders using the 'column_mean' strategy,
# i.e. the mean of the rated cells in the same genre column.
userGenreRatingsMissingFilled=fillMissingDataPreProcessing(userGenreRatingsWithMissing,missing_data_completion='column_mean')
userGenreRatingsMissingFilled.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1.0,3.82143,2.8,2.16667,2.0,2.5,2.0,2.5,3.83904,2.57143,2.125,3.90912,3.0,3.76406,3.0,3.74746,3.5,2.4,2.83333,2.0,3.0
2.0,3.82143,3.31579,3.75,3.66667,3.5,3.09375,3.66667,3.83904,3.61538,3.4,3.90912,3.5,4.0,3.75,4.0,3.59091,3.8,3.33333,3.8,5.0
3.0,3.82143,3.46875,3.3,2.875,2.7,3.61111,3.7,3.66667,3.92,3.0,3.90912,3.375,2.875,2.5,3.75,3.65,3.14286,3.54545,4.16667,4.0
4.0,3.82143,4.27419,4.33871,4.70833,4.63415,4.32955,4.15789,4.0,4.46429,4.56757,5.0,3.94444,5.0,4.83333,4.09091,4.5,4.25581,4.02273,4.75,3.0
5.0,3.82143,4.0,3.825,3.9,3.89286,3.98182,3.45,3.66667,3.80769,3.75,3.90912,3.875,4.0,4.25,3.33333,4.08108,4.0,3.625,4.0,3.61254


In [12]:
# Remove the '(no genres listed)' placeholder column before clustering.
# drop(..., errors='ignore') returns a new frame and does nothing when the column
# is already gone, so this cell can be re-run without raising KeyError.
userGenreRatingsMissingFilled = userGenreRatingsMissingFilled.drop(
    columns=['(no genres listed)'], errors='ignore'
)
userGenreRatingsMissingFilled.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1.0,2.8,2.16667,2.0,2.5,2.0,2.5,3.83904,2.57143,2.125,3.90912,3.0,3.76406,3.0,3.74746,3.5,2.4,2.83333,2.0,3.0
2.0,3.31579,3.75,3.66667,3.5,3.09375,3.66667,3.83904,3.61538,3.4,3.90912,3.5,4.0,3.75,4.0,3.59091,3.8,3.33333,3.8,5.0
3.0,3.46875,3.3,2.875,2.7,3.61111,3.7,3.66667,3.92,3.0,3.90912,3.375,2.875,2.5,3.75,3.65,3.14286,3.54545,4.16667,4.0
4.0,4.27419,4.33871,4.70833,4.63415,4.32955,4.15789,4.0,4.46429,4.56757,5.0,3.94444,5.0,4.83333,4.09091,4.5,4.25581,4.02273,4.75,3.0
5.0,4.0,3.825,3.9,3.89286,3.98182,3.45,3.66667,3.80769,3.75,3.90912,3.875,4.0,4.25,3.33333,4.08108,4.0,3.625,4.0,3.61254


In [13]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold,StratifiedShuffleSplit
import random
import numpy as np

In [16]:
  # Cluster users by their per-genre mean ratings.
  # random_state pins the centroid initialisation so that the labels (and the
  # exported CSV) are the same after a kernel restart.
  kmeans = KMeans(n_clusters=15, random_state=42).fit(userGenreRatingsMissingFilled)

In [44]:
# Pair every user id (the index of the clustered frame) with its cluster label.
cluster_map = pd.DataFrame({
    'userId': userGenreRatingsMissingFilled.index.values,
    'cluster': kmeans.labels_,
})


In [45]:
# Preview the first user -> cluster assignments.
cluster_map.head()

Unnamed: 0,userId,cluster
0,1.0,2
1,2.0,1
2,3.0,10
3,4.0,7
4,5.0,3


In [48]:
# clusters[k] is the list of user ids assigned to cluster label k.
# The number of clusters is read from the fitted model instead of repeating the
# literal used when it was created, so the two cannot drift apart.
clusters = [
    list(cluster_map.loc[cluster_map['cluster'] == label, 'userId'])
    for label in range(kmeans.n_clusters)
]

In [47]:
# Persist the user -> cluster table to the working directory, without the row index.
cluster_map.to_csv('UsersClusters.csv',index=False)