In [1]:
import numpy as np
import pandas as pd
from collections import Counter



In [2]:
# Load the two raw MovieLens CSV files. Both frames are handed as-is to the
# table-building functions defined in the cells below.
item_list = pd.read_csv('movies.csv')
transaction_list = pd.read_csv('ratings.csv')

## Item Feature Table

In [3]:
def create_item_feature_table(item_list, encode_genre=False):
    """
    Creates the item feature table from the MovieLens movies.csv frame.

    The input frame is left untouched; a new frame is returned whose index is
    named 'item_id' and carries the row labels of item_list.

    Input:

    item_list (pd.DataFrame)      : MovieLens movies.csv DataFrame containing movieId, title, and
                                    genres (a '|'-separated string per movie)
    encode_genre (bool)           : Append a 0/1 indicator column for every genre (default: False)

    Output
    item_feature_table : DataFrame indexed by item_id containing movieId, title,
                         genres (list of str), year (str; NaN when the title does not end
                         with four digits and a closing parenthesis) and, when encode_genre
                         is True, one integer 0/1 column per genre in alphabetical order
    """
    item_feature_table = item_list.copy()
    item_feature_table.index.name = 'item_id'

    # Anchor on the closing parenthesis instead of fixed character positions:
    # a title with trailing whitespace ('Foo (2008) ') would otherwise yield
    # '008)', and a title without a year would yield arbitrary characters.
    item_feature_table['year'] = item_feature_table['title'].str.extract(
        r'(\d{4})\)\s*$', expand=False
    )

    # Keep the raw '|'-joined strings around: they feed get_dummies below.
    genre_strings = item_feature_table['genres']
    item_feature_table['genres'] = genre_strings.str.split('|')

    if encode_genre:
        # Vectorized one-hot encoding (replaces a row-by-row apply).
        genre_flags = genre_strings.str.get_dummies(sep='|')
        genre_flags = genre_flags[sorted(genre_flags.columns)]
        item_feature_table = pd.concat([item_feature_table, genre_flags], axis=1)
        # concat may drop the index name when the two index names disagree.
        item_feature_table.index.name = 'item_id'

    return item_feature_table
    
    

In [4]:
# Build the item table including the optional one-hot genre columns, then
# preview its first rows.
item_feature = create_item_feature_table(item_list, encode_genre=True)
item_feature.head()

Unnamed: 0_level_0,movieId,title,genres,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],1995,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Transaction Table

In [5]:
def create_transaction_table(transaction_list, item_list, include_timestamp=False):
    """
    Creates the transaction (user-item-rating) table from the ratings.csv frame.

    Users are re-labelled with dense zero-based ids, movieId is replaced by the
    item_id of the matching movie, and ratings are min-max scaled per user to
    the range [0, 5]. The input frames are not modified.

    NOTE: a user whose ratings are all identical has a zero rating range, so
    the min-max division yields NaN for every rating of that user.

    Input:

    transaction_list (pd.DataFrame)     : MovieLens ratings.csv DataFrame containing userId, movieId,
                                          rating, and timestamp
    item_list (pd.DataFrame)            : MovieLens movies.csv DataFrame containing movieId, title, and genres
    include_timestamp (bool)            : Keep the timestamp column in the output (default: False)

    Output
    transaction_table : DataFrame indexed by user_id containing rating, item_id
                        (and timestamp when include_timestamp is True)

    Raises
    IndexError : when a rated movieId does not appear in item_list
    """
    df_table = transaction_list.rename(columns={'userId': 'user_id'})

    # Re-label users with dense zero-based ids, ordered by the original userId.
    df_table['user_id'] = df_table.groupby('user_id').ngroup()
    df_table = df_table.set_index('user_id')

    # movieId -> item_id lookup. item_id is the row label of item_list, i.e.
    # the label that ends up in the 'item_id' index of the item feature table.
    # When a movieId occurs more than once, the first occurrence wins.
    movie_ids = item_list['movieId']
    first_seen = ~movie_ids.duplicated(keep='first').to_numpy()
    movie_to_item = pd.Series(
        item_list.index.to_numpy()[first_seen],
        index=movie_ids.to_numpy()[first_seen],
    )

    # One vectorized lookup instead of scanning the whole item table per rating.
    item_ids = df_table['movieId'].map(movie_to_item)
    unknown = item_ids.isna().to_numpy()
    if unknown.any():
        missing = sorted(set(df_table['movieId'].to_numpy()[unknown].tolist()))
        # IndexError is kept as the exception type for backward compatibility.
        raise IndexError('movieId(s) not present in item_list: {}'.format(missing))
    df_table['item_id'] = item_ids.astype(int)
    del df_table['movieId']

    # Min-max scale the ratings of each user, then stretch to [0, 5].
    per_user = df_table.groupby('user_id')['rating']
    lowest = per_user.transform('min')
    highest = per_user.transform('max')
    df_table['rating'] = (df_table['rating'] - lowest) / (highest - lowest) * 5.0

    # option to include/remove timestamp column
    if not include_timestamp:
        del df_table['timestamp']

    return df_table
    
    

In [6]:
# Build the transaction table (timestamp omitted by default) and preview the
# first 20 rows.
transaction_table = create_transaction_table(transaction_list, item_list)
transaction_table.head(20)

Unnamed: 0_level_0,rating,item_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3.75,0
0,3.75,2
0,3.75,5
0,5.0,43
0,5.0,46
0,2.5,62
0,5.0,89
0,3.75,97
0,5.0,124
0,5.0,130


## User Feature Table

In [22]:
def create_user_feature_table(transaction_list, item_list, rating=0, by_genre=True):
    """
    Create a user-feature table from the transaction_list and item_list.
    Returns either per-genre counts of the items associated with each user or a
    1-hot encoding of those items.

    Possible genres include: 'IMAX', 'Adventure', 'Mystery', 'Animation', 'Documentary', 'Comedy',
       'Western', 'War', 'Film-Noir', 'Crime', 'Drama', 'Thriller', 'Fantasy',
       'Action', 'Sci-Fi', 'Children', 'Romance', 'Horror', 'Musical',
       '(no genres listed)'

    Input:

    transaction_list (pd.DataFrame)  : MovieLens ratings.csv DataFrame containing userId, movieId,
                                       rating, and timestamp
    item_list (pd.DataFrame)         : MovieLens movies.csv DataFrame containing movieId, title, and genres
    rating (float)                   : Keep only transactions whose rating is greater than or equal to
                                       this value; values above 5.0 are treated as 5.0 (default = 0,
                                       no filtering). The comparison uses the ratings returned by
                                       create_transaction_table, not the raw ratings.csv values.
    by_genre (bool)                  : use genre/category as user_feature, else use 1-hot encoding of items (default = True)

    Output
    user_feature : DataFrame indexed by user_id (ascending). With by_genre=True there is one
                   column per genre (alphabetical) holding how many of the user's kept
                   transactions carry that genre; otherwise one column per item_id holding 1
                   when the user has a kept transaction for that item and 0 otherwise.
                   Users with no kept transaction are absent. Values are integers.
    """
    transaction_table = create_transaction_table(transaction_list, item_list)

    # for user-defined rating, filter dataframe based on input-rating
    if rating > 0:
        rating = min(rating, 5.0)
        transaction_table = transaction_table[transaction_table['rating'] >= rating]

    item_feature = create_item_feature_table(item_list)

    # Row position of every remaining transaction in the output table.
    user_index = pd.Index(sorted(set(transaction_table.index)), name='user_id')
    user_pos = user_index.get_indexer(transaction_table.index)
    rated_items = transaction_table['item_id']

    if by_genre:
        category_list = sorted(set().union(*item_feature['genres']))
        # Label-based lookup of each transaction's genre list (item_id is an
        # index label of item_feature, not a position).
        genres_per_rating = rated_items.map(item_feature['genres']).tolist()
        genre_counts = np.array([len(genres) for genres in genres_per_rating], dtype=np.intp)
        rows = np.repeat(user_pos, genre_counts)
        labels = [genre for genres in genres_per_rating for genre in genres]
    else:
        # item 1-hot encoding
        category_list = sorted(item_feature.index)
        rows = user_pos
        labels = rated_items.tolist()

    cols = pd.Index(category_list).get_indexer(labels)
    features = np.zeros((len(user_index), len(category_list)), dtype=int)
    if by_genre:
        # Unbuffered add so that repeated (user, genre) pairs accumulate.
        np.add.at(features, (rows, cols), 1)
    else:
        features[rows, cols] = 1

    return pd.DataFrame(features, index=user_index, columns=category_list)
    
    

In [24]:
# Genre-count features per user. rating=4 keeps only transactions whose rating,
# as returned by create_transaction_table (not the raw ratings.csv value),
# is at least 4.
user_feature = create_user_feature_table(transaction_list, item_list, rating = 4)
user_feature.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,0.0,44.0,45.0,22.0,28.0,38.0,24.0,0.0,42.0,22.0,1.0,3.0,0.0,17.0,11.0,10.0,17.0,25.0,13.0,3.0
1,0.0,3.0,1.0,0.0,0.0,2.0,3.0,2.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0
2,0.0,8.0,4.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,7.0,0.0,0.0,1.0,0.0,11.0,5.0,0.0,0.0
3,0.0,6.0,10.0,2.0,3.0,26.0,10.0,1.0,31.0,7.0,2.0,1.0,0.0,6.0,6.0,17.0,3.0,13.0,3.0,3.0
4,0.0,0.0,1.0,3.0,3.0,2.0,3.0,0.0,8.0,3.0,0.0,0.0,1.0,3.0,0.0,2.0,0.0,1.0,1.0,1.0


In [26]:
# Same builder with by_genre=False: one column per item_id instead of one per
# genre, with no rating filter (rating defaults to 0).
user_feature2 = create_user_feature_table(transaction_list, item_list, by_genre=False)
user_feature2.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Unit Testing

In [None]:
import unittest

class TestGetRec(unittest.TestCase):
    """Checks the three table builders against a tiny in-memory MovieLens-style sample."""

    def setUp(self):
        # Three movies and two users; user 10 rated every movie, user 20 rated two.
        self.movies = pd.DataFrame({
            'movieId': [1, 3, 5],
            'title': ['Toy Story (1995)', 'Heat (1995)', 'Alien (1979)'],
            'genres': ['Adventure|Comedy', 'Action|Crime', 'Horror|Sci-Fi'],
        })
        self.ratings = pd.DataFrame({
            'userId': [10, 10, 10, 20, 20],
            'movieId': [1, 3, 5, 3, 5],
            'rating': [1.0, 3.0, 5.0, 2.0, 4.0],
            'timestamp': [1, 2, 3, 4, 5],
        })
        self.genres = ['Action', 'Adventure', 'Comedy', 'Crime', 'Horror', 'Sci-Fi']

    def test_create_item_feature_table(self):
        table = create_item_feature_table(self.movies, encode_genre=True)
        self.assertEqual(table.index.name, 'item_id')
        self.assertEqual(list(table.columns),
                         ['movieId', 'title', 'genres', 'year'] + self.genres)
        self.assertEqual(list(table['year']), ['1995', '1995', '1979'])
        self.assertEqual(list(table['genres'].iloc[0]), ['Adventure', 'Comedy'])
        self.assertEqual(list(table['Comedy']), [1, 0, 0])
        self.assertEqual(list(table['Horror']), [0, 0, 1])
        # The caller's frame must not be modified.
        self.assertEqual(list(self.movies.columns), ['movieId', 'title', 'genres'])

    def test_create_transaction_table(self):
        table = create_transaction_table(self.ratings, self.movies)
        self.assertEqual(list(table.columns), ['rating', 'item_id'])
        self.assertEqual(table.index.name, 'user_id')
        self.assertEqual(list(table.index), [0, 0, 0, 1, 1])
        self.assertEqual(list(table['item_id']), [0, 1, 2, 1, 2])
        # Ratings are min-max scaled per user and stretched to [0, 5].
        self.assertEqual(list(table['rating']), [0.0, 2.5, 5.0, 0.0, 5.0])
        with_timestamp = create_transaction_table(
            self.ratings, self.movies, include_timestamp=True)
        self.assertIn('timestamp', with_timestamp.columns)

    def test_create_user_feature_table(self):
        by_genre = create_user_feature_table(self.ratings, self.movies)
        self.assertEqual(list(by_genre.columns), self.genres)
        self.assertEqual(list(by_genre.loc[0]), [1, 1, 1, 1, 1, 1])
        self.assertEqual(list(by_genre.loc[1]), [1, 0, 0, 1, 1, 1])

        # Only the top-rated movie of each user (item 2) survives rating=4.
        filtered = create_user_feature_table(self.ratings, self.movies, rating=4)
        self.assertEqual(list(filtered.loc[0]), [0, 0, 0, 0, 1, 1])
        self.assertEqual(list(filtered.loc[1]), [0, 0, 0, 0, 1, 1])

        by_item = create_user_feature_table(self.ratings, self.movies, by_genre=False)
        self.assertEqual(list(by_item.columns), [0, 1, 2])
        self.assertEqual(list(by_item.loc[0]), [1, 1, 1])
        self.assertEqual(list(by_item.loc[1]), [0, 1, 1])
    
    
# unittest.main(argv=[''], verbosity=2, exit=False)