In [1]:
import numpy as np
import pandas as pd
from collections import Counter



In [2]:
# Load the ratings data; the path is relative to the notebook's working directory.
# Later cells read the userId, movieId, rating and timestamp columns from this frame.
transaction_list = pd.read_csv('ratings.csv')

In [15]:
def create_transaction_table(transaction_list, include_timestamp=False):
    """
    Creates the transaction table (one row per rating) from the ratings data
    
    The input is not modified. The 'userId' and 'movieId' columns are renamed to
    'user_id' and 'item_id', user ids are replaced by consecutive zero-based
    group numbers, and 'user_id' becomes the index of the result.
    
    Parameters:
    
        transaction_list (pd.DataFrame)     : DataFrame containing list of items (movies) per user and given rating
        include_timestamp (bool)            : Keep the 'timestamp' column in the output (dropped by default)
    
    Output
        df_table (pd.DataFrame)             : Rows of transaction_list indexed by zero-based user_id, with the
                                              item_id column and the remaining columns (e.g. rating)
        
    """
    
    # rename returns a new frame, so the caller's DataFrame is left untouched
    df_table = transaction_list.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
    
    # create new index based on zero: ngroup numbers the user groups 0..n-1
    df_table['user_id'] = df_table.groupby('user_id').ngroup()
    df_table = df_table.set_index('user_id')
    
    # option to include/remove timestamp column; errors='ignore' tolerates
    # input that has no timestamp column instead of raising KeyError
    if not include_timestamp:
        df_table = df_table.drop(columns='timestamp', errors='ignore')
    
    return df_table
    
    

## User Feature Table

In [21]:
def create_user_feature_table(transaction_list):
    """
    Create a user-feature table from the transaction_list
    Returns the ratings for the items from each user
       
    Input:
    
    transaction_list (pd.DataFrame)  : DataFrame containing list of items (movies) per user and given rating
    
    Output
    user_feature (pd.DataFrame)      : Table containing the ratings given by each user (0 for items/movies not rated).
                                       Rows are the zero-based user_id values, columns are the item ids in
                                       ascending order.
    """
    
    transaction_table = create_transaction_table(transaction_list)
    
    # One value per (user, item) pair. 'user_id' is the index level and 'item_id'
    # a column of the transaction table; groupby accepts both by name.
    # If a pair occurs more than once, the last row in input order is kept.
    rating_per_pair = transaction_table.groupby(['user_id', 'item_id'])['rating'].last()
    
    # Spread items across the columns in a single vectorised step; pairs that do
    # not occur (items the user did not rate) are filled with 0. This avoids
    # writing float ratings cell-by-cell into integer-zero columns, which relies
    # on implicit dtype upcasting that newer pandas versions deprecate.
    user_feature = rating_per_pair.unstack(fill_value=0)
    
    # unstack labels the column axis 'item_id'; drop that label so the columns
    # are plain item ids
    user_feature = user_feature.rename_axis(columns=None)

    return user_feature
    
    

In [22]:
# Build the user x item rating table from the loaded ratings and preview its first rows.
user_feature = create_user_feature_table(transaction_list)
user_feature.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Preview user_feature again.
# NOTE(review): the saved output of this cell has columns 0..9741, while the
# preview saved for the cell that builds user_feature shows movie ids up to 193609,
# so this output appears to come from a different kernel state - re-run to confirm.
user_feature.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# NOTE(review): this call passes `item_list` and `by_genre=True`, but the
# create_user_feature_table defined in the cells visible here takes only
# `transaction_list`, and `item_list` is not assigned in any visible cell.
# Presumably this is left over from an earlier version of the function - confirm
# and update before relying on Restart & Run All.
user_feature2 = create_user_feature_table(transaction_list, item_list, by_genre=True)
user_feature2.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Unit Testing

In [None]:
import unittest

class TestGetRec(unittest.TestCase):
    """Smoke tests for the table-building helpers, run against the CSV files on disk."""
    
    def test_create_item_feature_table(self):
        """The item feature table built from movies.csv has at least one row and column."""
        item_list = pd.read_csv('movies.csv')
        
        # NOTE(review): create_item_feature_table is not defined in the cells
        # visible here - confirm it is defined before this class is run.
        item_feature = create_item_feature_table(item_list, encode_genre=True)
        # DataFrame.shape is (rows, columns)
        num_rows, num_cols = item_feature.shape
        
        # test that data has been read properly
        self.assertGreater(num_rows, 0)
        self.assertGreater(num_cols, 0)
        
    def test_create_item_feature_table_2(self):
        """Placeholder kept so the test name stays reserved."""
        # An empty test body always passes and hides the missing coverage,
        # so report it as skipped instead.
        self.skipTest('placeholder: no assertions implemented yet')
    
    def test_create_transaction_table(self):
        """The transaction table is non-empty and drops the timestamp by default."""
        transaction_list = pd.read_csv('ratings.csv')
        
        # The second parameter of create_transaction_table is include_timestamp
        # (a bool), so only the ratings frame is passed here.
        transaction_table = create_transaction_table(transaction_list)
        num_rows, num_cols = transaction_table.shape
        
        # test that data has been read properly
        self.assertGreater(num_rows, 0)
        self.assertGreater(num_cols, 0)
        self.assertNotIn('timestamp', transaction_table.columns)
    
    def test_create_user_feature_table(self):
        """The user feature table built from ratings.csv has at least one row and column."""
        transaction_list = pd.read_csv('ratings.csv')
        
        # create_user_feature_table takes the ratings frame as its only argument
        user_feature = create_user_feature_table(transaction_list)
        num_rows, num_cols = user_feature.shape
        
        # test that data has been read properly
        self.assertGreater(num_rows, 0)
        self.assertGreater(num_cols, 0)
        
        
        
    
    
# unittest.main(argv=[''], verbosity=2, exit=False)