In [69]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [70]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [71]:
user_min = 5
item_min = 5

df = pd.read_csv("../inputs/ml-100k/u.data",
                 header=None,
                 sep="\t",
                 names=["user_id", "item_id", "rating", "time"])
df.head()

Unnamed: 0,user_id,item_id,rating,time
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [72]:
print('First pass')
print('num_users = {}'.format(df["user_id"].unique().size))
print('num_items = {}'.format(df["item_id"].unique().size))
print('df_shape  = {}'.format(df.shape))

First pass
num_users = 943
num_items = 1682
df_shape  = (100000, 4)


In [73]:
user_counts = df["user_id"].value_counts()
user_counts.head()

405    737
655    685
13     636
450    540
276    518
Name: user_id, dtype: int64

In [74]:
item_counts = df["item_id"].value_counts()
item_counts.head()

50     583
258    509
100    508
181    507
294    485
Name: item_id, dtype: int64

In [75]:
remain_item = item_counts[item_counts >= item_min]
remain_item.head()

50     583
258    509
100    508
181    507
294    485
Name: item_id, dtype: int64

In [76]:
print('previous item shape: {}'.format(item_counts.size))
print('next item size: {}'.format(remain_item.size))

previous item shape: 1682
next item size: 1349


In [77]:
remain_item.index[0:4]

Int64Index([50, 258, 100, 181], dtype='int64')

In [78]:
remain_user = user_counts[user_counts >= user_min]
print('previous user size: {}'.format(user_counts.size))
print('next user size: {}'.format(remain_user.size))

previous user size: 943
next user size: 943


In [79]:
df = df[df.user_id.isin(remain_user.index)]
df = df[df.item_id.isin(remain_item.index)]

In [80]:
print('Second pass')
print('num_users = {}'.format(df["user_id"].unique().size))
print('num_items = {}'.format(df["item_id"].unique().size))
print('df_shape  = {}'.format(df.shape))

Second pass
num_users = 943
num_items = 1349
df_shape  = (99287, 4)


In [81]:
df = df.sort_values(by=['user_id', 'time'])
df.head()

Unnamed: 0,user_id,item_id,rating,time
59972,1,168,5,874965478
92487,1,172,5,874965478
74577,1,165,5,874965518
48214,1,156,4,874965556
15764,1,196,5,874965677


In [82]:
df['prev_item'] = df['item_id']
df.head()    

Unnamed: 0,user_id,item_id,rating,time,prev_item
59972,1,168,5,874965478,168
92487,1,172,5,874965478,172
74577,1,165,5,874965518,165
48214,1,156,4,874965556,156
15764,1,196,5,874965677,196


In [83]:
user_counts = df.user_id.value_counts()
user_counts.head()

405    648
655    621
13     614
450    535
276    516
Name: user_id, dtype: int64

In [84]:
user_counts = user_counts.sort_index()
user_counts.head()

1    271
2     62
3     54
4     24
5    174
Name: user_id, dtype: int64

In [86]:
data = user_counts.values
data = data.cumsum()
data[:4]

array([271, 333, 387, 411], dtype=int64)

In [87]:
data = np.roll(data, 1)
data[0] = 0
data[:4]

array([  0, 271, 333, 387], dtype=int64)

In [88]:
df.head()

Unnamed: 0,user_id,item_id,rating,time,prev_item
59972,1,168,5,874965478,168
92487,1,172,5,874965478,172
74577,1,165,5,874965518,165
48214,1,156,4,874965556,156
15764,1,196,5,874965677,196


In [89]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,user_id,item_id,rating,time,prev_item
0,1,168,5,874965478,168
1,1,172,5,874965478,172
2,1,165,5,874965518,165
3,1,156,4,874965556,156
4,1,196,5,874965677,196


In [90]:
df['prev_item'][data] = -1
df.head()

Unnamed: 0,user_id,item_id,rating,time,prev_item
0,1,168,5,874965478,-1
1,1,172,5,874965478,172
2,1,165,5,874965518,165
3,1,156,4,874965556,156
4,1,196,5,874965677,196


In [91]:
df.iloc[270: 275]

Unnamed: 0,user_id,item_id,rating,time,prev_item
270,1,102,2,889751736,102
271,2,286,4,888549960,-1
272,2,258,3,888549961,258
273,2,305,3,888550065,305
274,2,307,3,888550066,307


In [93]:
df = df.sort_values(by=['time'])
last_mask = df.duplicated(subset=['user_id'], keep='last')

train_df = df[last_mask]
test_df = df[~last_mask]

In [96]:
train_df = train_df.sort_values(by=['time'])
last_mask = train_df.duplicated(subset=['user_id'], keep='last')

valid_df = train_df[~last_mask]
train_df = train_df[last_mask]

In [97]:
train_df.shape

(97401, 5)

In [None]:
train_df['user_id'].value_counts()