In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
cd ..

C:\Projects\python\recommender


In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

from utils import build_logger

In [4]:
logger = build_logger()

In [5]:
user_min = 5
item_min = 5

df = pd.read_csv("inputs/ml-100k/u.data",
                 header=None,
                 sep="\t",
                 names=["user_id", "item_id", "rating", "time"],
                 dtype={
                     'user_id': 'int32',
                     'item_id': 'int32',
                     'rating': 'int32',
                     'time': 'int32'
                 })
df.head()

Unnamed: 0,user_id,item_id,rating,time
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
user_counts = df.user_id.value_counts()
item_counts = df.item_id.value_counts()

logger.info("Original user size: {}".format(user_counts.size))
logger.info("Original item size: {}".format(item_counts.size))

# get user and item category info
user_counts = user_counts[user_counts >= user_min]
item_counts = item_counts[item_counts >= item_min]

logger.info("Filter user size: {}".format(user_counts.size))
logger.info("Filter item size: {}".format(item_counts.size))

user_one_hot = sp.identity(user_counts.size).tocsr()
item_one_hot = sp.identity(item_counts.size).tocsr()

# remove sparse item
df = df[df.user_id.isin(user_counts.index)]
df = df[df.item_id.isin(item_counts.index)]

# Add previous item
df['prev_item_id'] = df.item_id
df['prev_item_id'] = df['prev_item_id'].shift(
    periods=1).fillna(0)

# Add negtive item
df['neg_item_id'] = df.item_id.sample(df.shape[0]).values

# split train and test ddataframe
df = df.sort_values(by=['time'])
duplicate_mask = df.duplicated(subset=['user_id'], keep='last')
remain_df = df[duplicate_mask]
test_df = df[~duplicate_mask]
duplicate_mask = remain_df.duplicated(subset=['user_id'], keep='last')
train_df = remain_df[duplicate_mask]
valid_df = remain_df[~duplicate_mask]

# Set first item non for each user
train_df.sort_values(by=['user_id'])
first_mask = ~train_df.duplicated(subset=['user_id'], keep='first')
train_df['prev_item_id'][first_mask] = -1

cat_names = ['user_id', 'item_id', 'prev_item_id', 'neg_item_id']

2019-09-07 16:34:43,396 - C:\Projects\python\recommender\utils.py - INFO - Original user size: 943
2019-09-07 16:34:43,397 - C:\Projects\python\recommender\utils.py - INFO - Original item size: 1682
2019-09-07 16:34:43,400 - C:\Projects\python\recommender\utils.py - INFO - Filter user size: 943
2019-09-07 16:34:43,401 - C:\Projects\python\recommender\utils.py - INFO - Filter item size: 1349
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
user_encoder = OneHotEncoder(categories='auto')

In [9]:
user_encoder.fit(train_df[['user_id']])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [10]:
data = user_encoder.transform(train_df[['user_id']].head())
data.shape

(5, 943)

In [11]:
data = user_encoder.transform(train_df[['user_id']].sample(n=10))
data

<10x943 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [12]:
print(data.indices)
print(data.nonzero())

[787 206 129 847 378 594  95 658 192 540]
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([787, 206, 129, 847, 378, 594,  95, 658, 192, 540]))


In [13]:
item_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

In [14]:
item_encoder.fit(train_df[['item_id']])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [15]:
item_encoder.transform(train_df[['prev_item_id']].head()).shape

(5, 1349)

## Handle Movie Item Dataframe

In [16]:
base_cols = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url'
]
movie_cat_cols = [
    'unkown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

In [17]:
item_df = pd.read_csv('./inputs/ml-100k/u.item',
                      header=None,
                      index_col=False,
                      sep='|',
                      names=base_cols + movie_cat_cols)
item_df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unkown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [18]:
item_df.columns = base_cols + movie_cat_cols
item_df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unkown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
item_df = item_df[['movie_id'] + movie_cat_cols]
item_df.head()

Unnamed: 0,movie_id,unkown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [20]:
item_df = item_df.set_index('movie_id')
item_df.head()

Unnamed: 0_level_0,unkown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [28]:
# append not exist row
col = item_df.shape[1]
item_df.loc[-1] = np.zeros(col, dtype=np.float64)
item_df.head()

Unnamed: 0_level_0,unkown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
item_df.dtypes

unkown         float64
Action         float64
Adventure      float64
Animation      float64
Children       float64
Comedy         float64
Crime          float64
Documentary    float64
Drama          float64
Fantasy        float64
Film-Noir      float64
Horror         float64
Musical        float64
Mystery        float64
Romance        float64
Sci-Fi         float64
Thriller       float64
War            float64
Western        float64
dtype: object

In [21]:
item_sample = train_df['item_id'].sample(n=10)
item_sample

46965     905
16522    1039
46058    1131
48373     316
10582       9
15882     409
12206     739
87001     541
33375     250
94378      83
Name: item_id, dtype: int32

In [22]:
item_sample.values

array([ 905, 1039, 1131,  316,    9,  409,  739,  541,  250,   83])

In [23]:
item_df.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            1673, 1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682],
           dtype='int64', name='movie_id', length=1682)

In [27]:
print(item_df.loc[item_sample] == item_df.loc[item_sample.values])

          unkown  Action  Adventure  Animation  Children  Comedy  Crime  \
movie_id                                                                  
905         True    True       True       True      True    True   True   
1039        True    True       True       True      True    True   True   
1131        True    True       True       True      True    True   True   
316         True    True       True       True      True    True   True   
9           True    True       True       True      True    True   True   
409         True    True       True       True      True    True   True   
739         True    True       True       True      True    True   True   
541         True    True       True       True      True    True   True   
250         True    True       True       True      True    True   True   
83          True    True       True       True      True    True   True   

          Documentary  Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  \
movie_id             

In [5]:
import torch as T
from torch.utils.data import Dataset, DataLoader

In [7]:
class MovielenDs(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df
        
    def __getitem__(self, idx):
        return self.df.iloc[idx]
        
    def __len__(self):
        return self.df.shape[0]

In [9]:
ds = MovielenDs(df)
ds

<__main__.MovielenDs at 0x2ae6f03e7f0>

In [20]:
from typing import List

def my_collate(batch: List[pd.DataFrame]):
    df_batch = pd.DataFrame(batch)
    return df_batch

In [21]:
dl = DataLoader(ds, batch_size=2, collate_fn=my_collate)
dl

<torch.utils.data.dataloader.DataLoader at 0x2ae7076c898>

In [22]:
ds_it = iter(dl)
ds_it

<torch.utils.data.dataloader._DataLoaderIter at 0x2ae7076c7b8>

In [23]:
print(ds_it.next())

   user_id  item_id  rating       time
0      196      242       3  881250949
1      186      302       3  891717742


In [24]:
user_counts = df['user_id'].value_counts()
item_counts = df['item_id'].value_counts()

In [26]:
user_one_hot = sp.identity(user_counts.size).tocsr()
item_one_hot = sp.identity(item_counts.size).tocsr()

In [27]:
df_batch = ds_it.next()
df_batch

Unnamed: 0,user_id,item_id,rating,time
2,22,377,1,878887116
3,244,51,2,880606923


In [28]:
user_vector = user_one_hot[df_batch['user_id']]
item_vector = user_one_hot[df_batch['item_id']]

In [30]:
feat_matrix = sp.hstack([user_vector, item_vector])
feat_matrix

<2x1886 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in COOrdinate format>

In [31]:
feat_matrix.data

array([1., 1., 1., 1.])

In [34]:
feat_matrix.row

array([0, 1, 0, 1], dtype=int32)

In [35]:
feat_matrix.col

array([  22,  244, 1320,  994], dtype=int32)

In [37]:
feat_index = np.vstack(feat_matrix.nonzero())
feat_index

array([[   0,    1,    0,    1],
       [  22,  244, 1320,  994]], dtype=int32)

In [39]:
index_tensor = T.LongTensor(feat_index.tolist())

In [40]:
value_tensor = T.FloatTensor(feat_matrix.data)
value_tensor

tensor([1., 1., 1., 1.])

In [41]:
feat_tensor = T.sparse.FloatTensor(index_tensor, value_tensor)
feat_tensor

tensor(indices=tensor([[   0,    1,    0,    1],
                       [  22,  244, 1320,  994]]),
       values=tensor([1., 1., 1., 1.]),
       size=(2, 1321), nnz=4, layout=torch.sparse_coo)