In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Directory holding the dataset's CSV files; change it here rather than in
# every read_csv call below.
DATA_DIR = './data'

# One DataFrame per CSV file; the variable suffix mirrors the file name.
df_gs = pd.read_csv(DATA_DIR + '/genome-scores.csv')
df_gt = pd.read_csv(DATA_DIR + '/genome-tags.csv')
df_links = pd.read_csv(DATA_DIR + '/links.csv')
df_movies = pd.read_csv(DATA_DIR + '/movies.csv')
# ratings.csv is the frame the train/test split in this notebook is built from.
df_ratings = pd.read_csv(DATA_DIR + '/ratings.csv')
df_tags = pd.read_csv(DATA_DIR + '/tags.csv')

In [7]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 610.4 MB


## Randomly select 10000 unique users

In [21]:
# Seed NumPy's global RNG first so the sampled user set is repeatable.
np.random.seed(42)

# Draw 10000 distinct userIds (no replacement) from everyone who has rated.
all_users = df_ratings['userId'].unique()
random_users = np.random.choice(all_users, size=10000, replace=False)

# Keep every rating row that belongs to one of the sampled users.
sampled_rows = df_ratings['userId'].isin(random_users)
df_ratings_random = df_ratings.loc[sampled_rows]

## Pivot table - index on userId, with movieId as columns

In [22]:
# Wide user x movie matrix: one row per sampled userId, one column per
# movieId, cells hold the rating and are NaN where no rating exists.
# pivot_table's default aggregation (mean) applies if a pair ever repeats.
df_ratings_pivot = df_ratings_random.pivot_table(
    values='rating',
    index=['userId'],
    columns=['movieId'],
)

In [23]:
df_ratings_pivot.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,130500,130502,130506,130508,130510,130682,130964,131019,131164,131166
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,4.0,,,,2.0,4.0,3.0,,,3.0,...,,,,,,,,,,
36,,,,,,,,,,,...,,,,,,,,,,
88,,1.0,,,,,1.0,,,,...,,,,,,,,,,
90,3.5,,,,,,,,,3.5,...,,,,,,,,,,
111,,,,,,,,,,,...,,,,,,,,,,


## Check how many movies the users have rated

In [69]:
# Non-missing cells per row = number of movies each user has rated.
ratings_per_user = df_ratings_pivot.count(axis=1)

# Summary statistics (count/mean/std/min/quartiles/max) of that per-user count.
summary = ratings_per_user.describe()
summary

count    10000.000000
mean       144.804800
std        233.007248
min         20.000000
25%         34.000000
50%         67.000000
75%        152.000000
max       4236.000000
dtype: float64

In [70]:
summary['25%']

34.0

## Split the data frame into two categories - inactive users and active users (cut at the 25% quantile of ratings per user)

In [50]:
# Number of movies each user has rated (non-missing cells per row), computed
# once and reused for both masks.
ratings_per_user = df_ratings_pivot.count(axis=1)

# Derive the cut from the data instead of hard-coding it: the 25% quantile of
# ratings-per-user (34 for the current sample). This keeps the cell correct if
# the sample or seed changes, and does not rely on another cell having run.
activity_cut = ratings_per_user.quantile(0.25)

# Users at or below the cut are "inactive"; everyone else is "active".
is_inactive = ratings_per_user <= activity_cut
df_inactive = df_ratings_pivot.loc[is_inactive]
df_active = df_ratings_pivot.loc[~is_inactive]

In [51]:
print(df_inactive.shape, df_active.shape)

(2579, 16581) (7421, 16581)


In [60]:
df_inactive.count(axis=1).describe()

count    2579.000000
mean       25.795657
std         4.385484
min        20.000000
25%        22.000000
50%        25.000000
75%        29.000000
max        34.000000
dtype: float64

In [61]:
df_active.count(axis=1).describe()

count    7421.000000
mean      186.163725
std       257.920097
min        35.000000
25%        56.000000
50%       100.000000
75%       201.000000
max      4236.000000
dtype: float64

## Stratified random split: split inactive and active users separately to get the train and test sets

In [63]:
def split(data, train_size=0.8):
    """Randomly partition the rows of ``data`` into a train and a test part.

    Each row gets an independent uniform draw from NumPy's global RNG and goes
    to the train part when the draw is below ``train_size``. The train share is
    therefore only approximately ``train_size``, not exact.

    Parameters
    ----------
    data : pandas.DataFrame (or anything supporting boolean-mask indexing)
        Rows to partition.
    train_size : float, default 0.8
        Probability that a given row is assigned to the train part.

    Returns
    -------
    (train, test)
        The rows selected by the mask and the rows selected by its complement.
    """
    draws = np.random.rand(len(data))
    goes_to_train = draws < train_size
    return data[goes_to_train], data[~goes_to_train]

In [64]:
# Split each activity stratum on its own so both end up in train and test.
# Order matters: split() draws from NumPy's global RNG, so swapping these two
# lines (or running other random code in between) changes the resulting split.
train_active, test_active = split(df_active)
train_inactive, test_inactive = split(df_inactive)

In [65]:
# Stack the per-stratum pieces row-wise: active users first, then inactive.
train = pd.concat([train_active, train_inactive])
test = pd.concat([test_active, test_inactive])

## Check the distribution of the train and test sets - as you can see, they are pretty much consistent

In [66]:
train.count(axis=1).describe()

count    7978.000000
mean      145.160065
std       234.448364
min        20.000000
25%        34.000000
50%        67.000000
75%       153.750000
max      4236.000000
dtype: float64

In [67]:
test.count(axis=1).describe()

count    2022.000000
mean      143.403066
std       227.283555
min        20.000000
25%        33.000000
50%        63.500000
75%       149.000000
max      3488.000000
dtype: float64

## Wrap the code in a function

In [84]:
def train_test_split(data,
                     random_size=10000,
                     train_size=0.8,
                     threshold='25%',
                     index='userId',
                     values='rating',
                     columns='movieId',
                     seed=42):
    """Sample users, pivot their ratings, and split users into train/test.

    The split is stratified by activity: users are divided into "inactive"
    (rating count <= cut) and "active" (rating count > cut), each group is
    split independently, and the pieces are concatenated.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table containing the ``index``, ``columns`` and ``values``
        columns.
    random_size : int, default 10000
        Number of distinct ``index`` values (users) to sample without
        replacement. ``np.random.choice`` raises ``ValueError`` if this
        exceeds the number of distinct users.
    train_size : float, default 0.8
        Per-row probability of landing in the train set; the realised share
        is approximate, not exact.
    threshold : str, default '25%'
        Key into ``Series.describe()`` of the per-user rating count (for
        example '25%', '50%', '75%', 'mean') used as the activity cut.
    index, values, columns : str
        Column names used as pivot rows, cell values and pivot columns.
    seed : int or None, default 42
        Seed applied to NumPy's global RNG before sampling. Pass ``None`` to
        leave the global RNG state untouched.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Pivoted frames with one row per user. Because ``values`` is passed to
        ``pivot_table`` as a list, the columns carry the value name as an
        extra top level.
    """
    # Seeding the global RNG (rather than a private generator) keeps the
    # default results identical to the step-by-step cells that seed with 42.
    if seed is not None:
        np.random.seed(seed)

    # Randomly select `random_size` distinct users. The `index` column is used
    # for both the lookup and the filter so non-default column names work.
    all_users = data[index].unique()
    random_users = np.random.choice(all_users,
                                    size=random_size,
                                    replace=False)
    df_random = data.loc[data[index].isin(random_users)]

    # Pivot table - one row per user, one column per movie.
    df_pivot = pd.pivot_table(df_random,
                              values=[values],
                              index=[index],
                              columns=[columns])

    # Ratings per user, computed once and reused for the cut and both masks.
    ratings_per_user = df_pivot.count(axis=1)

    # Default threshold is the 25% quantile but another describe() key works.
    cut = ratings_per_user.describe()[threshold]

    # Select inactive (<= cut) and active (> cut) users.
    is_inactive = ratings_per_user <= cut
    df_inactive = df_pivot.loc[is_inactive]
    df_active = df_pivot.loc[~is_inactive]

    # Helper to split one stratum; draws come from the global RNG.
    def _split(frame):
        msk = np.random.rand(len(frame)) < train_size
        return frame[msk], frame[~msk]

    # Split active and inactive users separately. Active is split first, so
    # the order of RNG draws matches the step-by-step cells.
    train_active, test_active = _split(df_active)
    train_inactive, test_inactive = _split(df_inactive)

    # Combine the train and test pieces of both strata.
    train = pd.concat([train_active, train_inactive])
    test = pd.concat([test_active, test_inactive])

    return train, test

In [81]:
train, test = train_test_split(df_ratings)

In [82]:
train.count(axis=1).describe()

count    7978.000000
mean      145.160065
std       234.448364
min        20.000000
25%        34.000000
50%        67.000000
75%       153.750000
max      4236.000000
dtype: float64

In [83]:
test.count(axis=1).describe()

count    2022.000000
mean      143.403066
std       227.283555
min        20.000000
25%        33.000000
50%        63.500000
75%       149.000000
max      3488.000000
dtype: float64