In [1]:
import numpy as np
import pandas as pd
from collections import Counter



In [2]:
# Load the MovieLens movie catalogue and the per-user ratings; both paths are
# relative, so the CSV files must sit in the notebook's working directory.
item_list = pd.read_csv('movies.csv')
transaction_list = pd.read_csv('ratings.csv')

## Item Feature Table

In [6]:
def create_item_feature_table(item_list, encode_genre=False):
    """
    Creates the item feature table from the MovieLens movies.csv data.

    Parameters:
        item_list (pd.DataFrame)      : MovieLens movies.csv DataFrame containing movieId, title, and genres
        encode_genre (bool)           : Include 1-hot encoding of each movie's genres (default: False)

    Returns:
        item_feature_table (pd.DataFrame) : Table indexed by item_id with the columns title, genres
                                            (list of genre names) and year (4-character string; missing
                                            when the title does not end in a 4-digit year followed by a
                                            closing parenthesis). When encode_genre is True, one 0/1
                                            column per genre is appended in sorted order.

    The input DataFrame is not modified.
    """

    # set_index returns a new frame, so the assignments below never touch item_list.
    item_table = item_list.set_index('movieId').rename_axis('item_id')
    raw_genres = item_table['genres']

    # Anchor on the closing parenthesis at the end of the title instead of fixed
    # character offsets: trailing whitespace no longer shifts the slice, and
    # titles without a year yield a missing value rather than arbitrary text.
    item_table['year'] = item_table['title'].str.extract(r'(\d{4})\)\s*$', expand=False)
    item_table['genres'] = raw_genres.str.split('|')

    if encode_genre:
        # Vectorised 1-hot encoding straight from the '|'-separated strings;
        # avoids a Python-level pass over every row.
        genre_flags = raw_genres.str.get_dummies(sep='|')
        genre_flags = genre_flags[sorted(genre_flags.columns)]
        item_table = pd.concat([item_table, genre_flags], axis=1)

    return item_table
    
    

In [7]:
# Build the item feature table with the 1-hot genre columns enabled and
# preview its first rows.
item_feature = create_item_feature_table(item_list, encode_genre=True)
item_feature.head()

Unnamed: 0_level_0,title,genres,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),"[Comedy, Romance]",1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),[Comedy],1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Transaction Table

In [5]:
def create_transaction_table(transaction_list, include_timestamp=False):
    """
    Creates the transaction table from the ratings.csv data.

    Parameters:

        transaction_list (pd.DataFrame)     : DataFrame containing list of items (movies) per user and given rating
                                              (columns userId, movieId, rating and optionally timestamp)
        include_timestamp (bool)            : Keep the timestamp column in the output (default: False)

    Output
        transaction_table (pd.DataFrame)    : One row per rating, indexed by a zero-based user_id, with the
                                              columns item_id and rating (plus timestamp when requested)

    The input DataFrame is not modified.
    """

    # rename returns a new frame, so the caller's DataFrame stays untouched.
    df_table = transaction_list.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})

    # Re-number users as consecutive integers starting at zero
    # (groups are numbered in sorted order of the original ids).
    df_table['user_id'] = df_table.groupby('user_id').ngroup()
    df_table = df_table.set_index('user_id')

    # option to include/remove timestamp column
    if not include_timestamp:
        # errors='ignore': a ratings frame that never had a timestamp column
        # already satisfies the request, so it must not raise KeyError.
        df_table = df_table.drop(columns='timestamp', errors='ignore')

    return df_table
    
    

In [6]:
# Build the transaction table with the default options (timestamp column not
# included) and preview the first 20 rows.
transaction_table = create_transaction_table(transaction_list)
transaction_table.head(20)

Unnamed: 0_level_0,item_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,4.0
0,3,4.0
0,6,4.0
0,47,5.0
0,50,5.0
0,70,3.0
0,101,5.0
0,110,4.0
0,151,5.0
0,157,5.0


## User Feature Table

In [7]:
def create_user_feature_table(transaction_list):
    """
    Create a user-feature table from the transaction_list.
    Returns the ratings for the items from each user

    Input:

    transaction_list (pd.DataFrame)  : DataFrame containing list of items (movies) per user and given rating

    Output
    user_feature (pd.DataFrame)      : Table containing the ratings given by each user (0 for items/movies not rated);
                                       one row per zero-based user_id, one column per item_id in ascending order
    """

    transaction_table = create_transaction_table(transaction_list)

    # Long format keyed by (user_id, item_id): the index already holds user_id,
    # so item_id is appended as a second index level.
    ratings = transaction_table.set_index('item_id', append=True)['rating']

    # unstack refuses duplicate index entries; keep the last rating recorded
    # for a repeated (user, item) pair.
    ratings = ratings[~ratings.index.duplicated(keep='last')]

    # Pivot to the dense user x item matrix in one step. Filling during the
    # unstack keeps the ratings' own dtype, instead of creating integer columns
    # first and relying on an implicit upcast when float ratings are written.
    user_feature = ratings.unstack(fill_value=0)

    # Drop the 'item_id' label unstack leaves on the column axis so the columns
    # are plain item ids.
    user_feature = user_feature.rename_axis(columns=None)

    return user_feature
    
    

In [8]:
# Build the user-by-item rating table (0 where a user has not rated an item)
# and preview its first rows.
user_feature = create_user_feature_table(transaction_list)
user_feature.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
