In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three source CSV files from the ./ml-20m directory.
# The reads are independent of each other, so their order does not matter.
ratings_data = pd.read_csv('./ml-20m/ratings.csv')
movies_data = pd.read_csv('./ml-20m/movies.csv')
genome_scores_data = pd.read_csv('./ml-20m/genome-scores.csv')

In [3]:
# One row per distinct user that appears in the ratings table,
# in order of first appearance.
unique_user_ids = ratings_data['userId'].unique()
users_df = pd.DataFrame({'userId': unique_user_ids})
users_df.head()

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5


In [4]:
# Build the per-movie table: every column of movies_data except 'genres',
# plus the mean of all ratings given to that movie.
movies_df = movies_data.drop('genres', axis=1)

# Mean rating per movie. The string alias 'mean' selects pandas' built-in
# groupby mean; passing the NumPy callable np.mean to agg() is deprecated in
# recent pandas releases and emits a FutureWarning.
agg_rating_avg = (
    ratings_data
    .groupby('movieId')
    .agg({'rating': 'mean'})
    .reset_index()
)
agg_rating_avg.columns = ['movieId', 'rating_mean']

# Left join so that movies without any rating are kept (rating_mean is NaN).
movies_df = movies_df.merge(agg_rating_avg, on='movieId', how='left')
movies_df.head()

Unnamed: 0,movieId,title,rating_mean
0,1,Toy Story (1995),3.92124
1,2,Jumanji (1995),3.211977
2,3,Grumpier Old Men (1995),3.15104
3,4,Waiting to Exhale (1995),2.861393
4,5,Father of the Bride Part II (1995),3.064592


In [5]:
# Fixed list of genre labels exported as the genres lookup table.
genres = [
    "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western",
    "(no genres listed)",
]
genres_df = pd.DataFrame({'genres': genres})
genres_df.head()

Unnamed: 0,genres
0,Action
1,Adventure
2,Animation
3,Children
4,Comedy


In [6]:
# User-to-movie rating table: the ratings data without the timestamp column.
users_movies_df = ratings_data.drop(columns=['timestamp'])
users_movies_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [7]:
# Movie-to-genre source table: the movies data without the title column.
# The 'genres' column still holds pipe-separated strings at this point.
movies_genres_df = movies_data.drop(columns=['title'])

In [8]:
def get_movie_genres(movieId):
    """Return one row per genre for the given movie.

    Looks up ``movieId`` in the module-level ``movies_genres_df``, splits each
    matching pipe-separated ``genres`` string, and returns a DataFrame with the
    columns ``movieId`` and ``genres`` (one genre per row).
    """
    is_requested_movie = movies_genres_df['movieId'] == movieId
    genre_strings = movies_genres_df.loc[is_requested_movie, 'genres'].tolist()

    # Flatten "A|B|C" strings into a single list of individual genre labels.
    split_genres = []
    for genre_string in genre_strings:
        split_genres.extend(genre_string.split('|'))

    result = pd.DataFrame(split_genres, columns=['genres'])
    # Put the movie id first so the column order is (movieId, genres).
    result.insert(loc=0, column='movieId', value=movieId)
    return result

In [9]:
# Build the (movieId, genre) table in one pass and construct the DataFrame
# once. The previous version grew the frame with DataFrame.append inside a
# loop; that method was removed in pandas 2.0, and growing a frame piecewise
# copies all accumulated rows on every iteration.
# Note: the resulting index is a plain 0..N-1 range rather than restarting at 0
# for each movie; the CSV export below is written with index=False, so the
# exported file is unaffected.
movie_genre_rows = [
    (movie_id, genre)
    for movie_id, genre_string in zip(movies_genres_df['movieId'].tolist(),
                                      movies_genres_df['genres'].tolist())
    for genre in genre_string.split('|')
]
movies_genres = pd.DataFrame(movie_genre_rows, columns=['movieId', 'genres'])
movies_genres.head()

Unnamed: 0,movieId,genres
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy


In [10]:
# Attach each rated movie's genre string to the rating row, then keep only the
# user id and the genre string (every other column is dropped).
unused_columns = ['movieId', 'rating', 'timestamp', 'title']
user_genres_df = (
    ratings_data
    .merge(movies_data, on='movieId', how='left')
    .drop(columns=unused_columns)
)
user_genres_df.head()

Unnamed: 0,userId,genres
0,1,Adventure|Children|Fantasy
1,1,Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,Mystery|Sci-Fi|Thriller
3,1,Mystery|Thriller
4,1,Crime|Mystery|Thriller


In [11]:
def get_favorite_genre(userId):
    """Return the genre that occurs most often among a user's rated movies.

    Reads the module-level ``user_genres_df``, splits every pipe-separated
    ``genres`` string belonging to ``userId`` and counts the individual genre
    labels. When several genres share the top count, the one encountered first
    is returned. Raises ``IndexError`` if the user has no rows.
    """
    is_requested_user = user_genres_df['userId'] == userId
    genre_strings = user_genres_df.loc[is_requested_user, 'genres'].tolist()

    genre_counts = Counter()
    for genre_string in genre_strings:
        genre_counts.update(genre_string.split('|'))

    top_genre, _ = genre_counts.most_common(1)[0]
    return top_genre

In [13]:
# Build the (userId, favorite genre) table with a single pass over the ratings
# and a single DataFrame construction. The previous version grew the frame with
# DataFrame.append inside a loop (removed in pandas 2.0) and re-filtered the
# whole user_genres_df once per user.
# Genres are counted per user in row order, so ties are broken the same way as
# Counter.most_common(1) on that user's rows: the genre seen first wins.
genre_counts_by_user = {}
for user_id, genre_string in zip(user_genres_df['userId'].tolist(),
                                 user_genres_df['genres'].tolist()):
    genre_counts_by_user.setdefault(user_id, Counter()).update(genre_string.split('|'))

# Note: the resulting index is a plain 0..N-1 range rather than 0 on every row;
# the CSV export below is written with index=False, so the file is unaffected.
users_genres = pd.DataFrame(
    [
        (user_id, genre_counts_by_user[user_id].most_common(1)[0][0])
        for user_id in users_df['userId'].tolist()
    ],
    columns=['userId', 'genre'],
)
users_genres.head()

Unnamed: 0,userId,genre
0,1,Adventure
0,2,Sci-Fi
0,3,Sci-Fi
0,4,Action
0,5,Drama


In [14]:
# Write every derived table to a pipe-separated CSV file in the working
# directory, with a header row and without the DataFrame index.
csv_exports = [
    ('users.csv', users_df),
    ('movies.csv', movies_df),
    ('genres.csv', genres_df),
    ('users_movies.csv', users_movies_df),
    ('movies_genres.csv', movies_genres),
    ('users_genres.csv', users_genres),
]
for csv_name, table in csv_exports:
    table.to_csv(csv_name, sep='|', header=True, index=False)