In [1]:
import pandas as pd
import numpy as np

In [2]:
df_ratings = pd.read_table('./app/datasets/ml-1m/ratings.dat', delimiter='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')

In [9]:
df_movies = pd.read_csv('./app/datasets/movies_imdb.csv')
print(df_movies.columns)
df_movies = df_movies.dropna(subset=['genres', 'cast','directors', 'runtimes', 'title', 'year'])
df_movies = df_movies.loc[:, ['movieId', 'genres','cast', 'directors', 'runtimes', 'title', 'year']]

Index(['cast', 'color', 'directors', 'genres', 'movieId', 'plots', 'producers',
       'rating', 'runtimes', 'synopsis', 'title', 'writers', 'year'],
      dtype='object')


In [10]:
df_ratings = df_ratings[df_ratings['movieId'].isin(df_movies['movieId'])]

In [11]:
df_ratings = df_ratings[df_ratings['userId'] == 6040]

In [19]:
df_movies['genres'].str.split('|')

TypeError: unhashable type: 'list'

In [42]:
def get_weights(df_movies):
    def add_in_dict(d, l):
        for e in l:
            n = d.get(e, 0)
            d[e] = n+1

    genres = {}
    cast = {}
    directors = {}
    for index, movie in df_movies.iterrows():
        movie_genres = movie['genres'].split('|')
        add_in_dict(genres, movie_genres)
        movie_cast = movie['cast'].split('|')
        add_in_dict(cast, movie_cast)
        movie_directors = movie['directors'].split('|')
        add_in_dict(directors, movie_directors)
    genres = pd.Series(genres, name='genres')
    cast = pd.Series(cast, name='cast')
    directors = pd.Series(directors, name='directors')
    years = df_movies['year'].value_counts()
    return genres, cast, directors, years

In [53]:
import json
def split(df_ratings, user):
        df_movies = df_ratings[df_ratings['userId'] == user]
        df_movies = df_movies.set_index('movieId')
        data = {}
        with open('./app/datasets/index.txt') as json_file:
            data = json.load(json_file)
        user_index = data[str(user)]
        train = df_movies.loc[user_index['train'], :]
        test = df_movies.loc[user_index['test'], :]
        return train, test
train, test = split(df_ratings, 6040)

In [54]:
train

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2391,6040,4,956705112
246,6040,5,956704613
2303,6040,5,960971857
3334,6040,5,960971875
535,6040,4,964828734
...,...,...,...
913,6040,5,956716845
1300,6040,5,956715421
2076,6040,5,956715676
2395,6040,4,960972782


In [43]:
genres, cast, directors, years = get_weights(df_movies)

In [55]:
def get_gcdy_preference(train, df_movies):
    def add_in_list(d, l):
        for e in l:
            if e not in d:
                d.append(e)
            
    df = df_movies.loc[train[train.rating>=3].index, :]
    genres = []
    cast = []
    directors = []
    for index, movie in df.iterrows():
        movie_genres = movie['genres'].split('|')
        add_in_list(genres, movie_genres)
        movie_cast = movie['cast'].split('|')
        add_in_list(cast, movie_cast)
        movie_directors = movie['directors'].split('|')
        add_in_list(directors, movie_directors)  
    return genres, cast, directors, df.year.unique().tolist()

g, c, d, y = get_gcdy_preference(train, df_movies)

g    

['Drama',
 'War',
 'Biography',
 'Music',
 'Romance',
 'Comedy',
 'Musical',
 'Action',
 'Crime',
 'Thriller',
 'Mystery',
 'Animation',
 'Family',
 'Fantasy',
 'Horror',
 'Sci-Fi',
 'History',
 'Adventure',
 'Documentary',
 'Sport',
 'Western',
 'Short',
 'Film-Noir']

In [57]:
def get_wi(lista, preference, all_movies, total, inc):
    wi = 0
    for valor in lista:
        wi += all_movies[valor]/total
        if valor in preference:
            wi += inc
    return wi
def get_wa(lista, c, cast, total):
    return get_wi(lista, c, cast, total, 0.4)

def get_wd(lista, d, directors, total):
    return get_wi(lista, d, directors, total, 0.3)

def get_wy(lista, y, years, total):
    return get_wi(lista, y, years, total, 0.1)
        
            
        

973

In [None]:
def get_user_weights(train, test, df_movies):
    genres, cast, directors, years = get_weights(df_movies)
    g, c, d, y = get_gcdy_preference(train, df_movies)
    df_test = df_movies.loc[test.index, :]
    weights = {}
    for index, movie in df_test.iterrows():
        