In [1]:
import pandas as pd
import numpy as np

In [60]:
df_ratings = pd.read_table('./app/datasets/ml-1m/ratings.dat', delimiter='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')

In [9]:
df_movies = pd.read_csv('./app/datasets/movies_imdb.csv')
print(df_movies.columns)
df_movies = df_movies.dropna(subset=['genres', 'cast','directors', 'runtimes', 'title', 'year'])
df_movies = df_movies.loc[:, ['movieId', 'genres','cast', 'directors', 'runtimes', 'title', 'year']]

Index(['cast', 'color', 'directors', 'genres', 'movieId', 'plots', 'producers',
       'rating', 'runtimes', 'synopsis', 'title', 'writers', 'year'],
      dtype='object')


In [10]:
df_ratings = df_ratings[df_ratings['movieId'].isin(df_movies['movieId'])]

In [11]:
df_ratings = df_ratings[df_ratings['userId'] == 6040]

In [42]:
def get_weights(df_movies):
    def add_in_dict(d, l):
        for e in l:
            n = d.get(e, 0)
            d[e] = n+1

    genres = {}
    cast = {}
    directors = {}
    for index, movie in df_movies.iterrows():
        movie_genres = movie['genres'].split('|')
        add_in_dict(genres, movie_genres)
        movie_cast = movie['cast'].split('|')
        add_in_dict(cast, movie_cast)
        movie_directors = movie['directors'].split('|')
        add_in_dict(directors, movie_directors)
    genres = pd.Series(genres, name='genres')
    cast = pd.Series(cast, name='cast')
    directors = pd.Series(directors, name='directors')
    years = df_movies['year'].value_counts()
    return genres, cast, directors, years

In [53]:
import json
def split(df_ratings, user):
        df_movies = df_ratings[df_ratings['userId'] == user]
        df_movies = df_movies.set_index('movieId')
        data = {}
        with open('./app/datasets/index.txt') as json_file:
            data = json.load(json_file)
        user_index = data[str(user)]
        train = df_movies.loc[user_index['train'], :]
        test = df_movies.loc[user_index['test'], :]
        return train, test
train, test = split(df_ratings, 6040)

In [54]:
train

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2391,6040,4,956705112
246,6040,5,956704613
2303,6040,5,960971857
3334,6040,5,960971875
535,6040,4,964828734
...,...,...,...
913,6040,5,956716845
1300,6040,5,956715421
2076,6040,5,956715676
2395,6040,4,960972782


In [43]:
genres, cast, directors, years = get_weights(df_movies)

In [55]:
def get_gcdy_preference(train, df_movies):
    def add_in_list(d, l):
        for e in l:
            if e not in d:
                d.append(e)
            
    df = df_movies.loc[train[train.rating>=3].index, :]
    genres = []
    cast = []
    directors = []
    for index, movie in df.iterrows():
        movie_genres = movie['genres'].split('|')
        add_in_list(genres, movie_genres)
        movie_cast = movie['cast'].split('|')
        add_in_list(cast, movie_cast)
        movie_directors = movie['directors'].split('|')
        add_in_list(directors, movie_directors)  
    return genres, cast, directors, df.year.unique().tolist()

g, c, d, y = get_gcdy_preference(train, df_movies)

g    

['Drama',
 'War',
 'Biography',
 'Music',
 'Romance',
 'Comedy',
 'Musical',
 'Action',
 'Crime',
 'Thriller',
 'Mystery',
 'Animation',
 'Family',
 'Fantasy',
 'Horror',
 'Sci-Fi',
 'History',
 'Adventure',
 'Documentary',
 'Sport',
 'Western',
 'Short',
 'Film-Noir']

In [92]:
def get_wi(lista, preference, all_movies, total, inc):
    wi = 0.0
    for valor in lista:
        wi += all_movies[valor]/total
        if valor in preference:
            wi += inc
    return wi
def get_wa(lista, c, cast, total):
    return get_wi(lista, c, cast, total, 0.4)

def get_wd(lista, d, directors, total):
    return get_wi(lista, d, directors, total, 0.3)

def get_wy(lista, y, years, total):
    return get_wi(lista, y, years, total, 0.1)
        
def get_wr(movieId, df_ratings):
    ratings = df_ratings[df_ratings['movieId'] == movieId].rating.value_counts()
    wr = 0.0
    
    for index, count in ratings.iteritems():
        if index >=3:
            if count <= 100:
                wr += index
            elif count <= 1000:
                wr += index*2
            else:
                wr += index*3
        else:
            if count <= 100:
                wr += 1
            elif count <= 1000:
                wr += 2
            else:
                wr += 3
    return wr
        

In [93]:
def get_user_weights(user, train, test, df_movies):
    total = df_movies.shape[0]
    genres, cast, directors, years = get_weights(df_movies)
    g, c, d, y = get_gcdy_preference(train, df_movies)
    ratings_test = df_ratings[df_ratings.userId == user][df_ratings.movieId.isin(test.index)]
    df_ratings_w_test = df_ratings[~df_ratings.index.isin(ratings_test.index)]
    weights = []
    for index, movie in df_movies.iterrows():
        wa = get_wa(movie['cast'].split('|'), c, cast, total)
        wd = get_wd(movie['directors'].split('|'), d, directors, total)
        wy = get_wy([movie['year']], y, years, total)
        wr = get_wr(movie['movieId'], df_ratings_w_test)
        weights.append([wa, wd, wy, wr])
    weights = pd.DataFrame(weights, index=df_movies.movieId, columns=['Wa', 'Wd', 'Wy', 'Wr'])
    return weights
    #weights = pd.DataFrame(weights, columns=['Wa', 'Wd', 'Wy', 'Wr'])
    
        

In [94]:
weights = get_user_weights(6040, train, test, df_movies)

  """


In [95]:
weights

Unnamed: 0_level_0,Wa,Wd,Wy,Wr
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.833201,0.000225,0.117952,26.0
2,3.223286,0.000376,0.117952,22.0
3,2.018178,0.000413,0.117952,21.0
4,2.821145,0.000150,0.117952,14.0
5,1.218891,0.000263,0.117952,17.0
...,...,...,...,...
131254,0.003793,0.000075,0.032900,0.0
131256,0.403868,0.000075,0.025163,0.0
131258,0.002291,0.000038,0.026816,0.0
131260,0.009126,0.000263,0.023436,0.0


In [70]:
df_movies

Unnamed: 0,movieId,genres,cast,directors,runtimes,title,year
0,1,Animation|Adventure|Comedy|Family|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,Toy Story,1995.0
1,2,Adventure|Comedy|Family|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Joe Johnston,104.0,Jumanji,1995.0
2,3,Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Howard Deutch,101.0,Grumpier Old Men,1995.0
3,4,Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Forest Whitaker,124.0,Waiting to Exhale,1995.0
4,5,Comedy|Family|Romance,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Charles Shyer,106.0,Father of the Bride Part II,1995.0
...,...,...,...,...,...,...,...
27273,131254,Comedy,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Granz Henman,85.0,Military Academy,2007.0
27274,131256,Comedy,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,"Feuer, Eis & Dosenbier",2002.0
27275,131258,Action|Adventure|Comedy|Fantasy,Nam-gil Kim|Ye-jin Son|Hae-jin Yoo|Kyeong-yeon...,Seok-hoon Lee,130.0,Pirates,2014.0
27276,131260,Biography|Drama|Music,Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,The Rose of the Rascal,2001.0
