In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the processed interactions table from disk into a DataFrame.
interactions_path = "../data/processed/interactions.csv"
data = pd.read_csv(interactions_path)

In [4]:
# Preview the first five rows to sanity-check the load.
data.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,A1N070NS9CJQ2I,60009810,5.0,1026864000
1,A3P0KRKOBQK1KN,60009810,5.0,1025913600
2,A192HO2ICJ75VU,60009810,5.0,1025654400
3,A2T278FKFL3BLT,60009810,4.0,1025395200
4,A2ZUXVTW8RXBXW,60009810,5.0,1025222400


In [7]:
# Count missing values in each column.
data.isnull().sum()

user_id      0
item_id      0
rating       0
timestamp    0
dtype: int64

In [10]:
# Count rows that exactly repeat an earlier row (all columns equal).
data.duplicated(keep="first").sum()

np.int64(122)

In [11]:
# Keep the first row for each (user_id, item_id, timestamp) key.
# NOTE(review): this key ignores `rating`, so it may drop more rows than a
# full-row duplicate check would count — confirm that is intended.
dedup_key = ["user_id", "item_id", "timestamp"]
data = data.drop_duplicates(subset=dedup_key, keep="first")

In [12]:
# Re-count exact duplicate rows after deduplication.
data.duplicated(keep="first").sum()

np.int64(0)

In [13]:
# Number of rows per user and per item.
user_counts = data["user_id"].value_counts()
item_counts = data["item_id"].value_counts()

In [15]:
# Keep the ids of users and items that appear in at least `min_rows` rows.
min_rows = 5
active_users = user_counts.loc[user_counts.ge(min_rows)].index
popular_items = item_counts.loc[item_counts.ge(min_rows)].index

In [18]:
# Report how many users, then how many items, passed the threshold.
for kept_ids in (active_users, popular_items):
    print(len(kept_ids))

71
1012


In [19]:
# Retain only rows whose user is in `active_users` AND whose item is in
# `popular_items`.
# NOTE: the filter is applied once, so per-user / per-item counts inside
# `interactions` can end up below the thresholds used to build those id sets.
is_active_user = data["user_id"].isin(active_users)
is_popular_item = data["item_id"].isin(popular_items)
interactions = data[is_active_user & is_popular_item]

In [20]:
# Add a constant `interaction` column set to 1 for every retained row.
# `interactions` comes from boolean-filtering `data`, so assigning a column in
# place can raise SettingWithCopyWarning; `assign` returns a new frame instead,
# leaves `data` untouched, and gives the same result when the cell is re-run.
interactions = interactions.assign(interaction=1)

In [21]:
# Keep only the columns written out later, in this order (drops `rating`).
output_columns = ["user_id", "item_id", "interaction", "timestamp"]
interactions = interactions[output_columns]

In [22]:
# Preview the first five rows of the filtered interactions.
interactions.head(5)

Unnamed: 0,user_id,item_id,interaction,timestamp
222,A3MV1KKHX51FYT,380709473,1,1384992000
447,A1TIQNQJZ2LDNW,545105668,1,1437177600
448,A1TIQNQJZ2LDNW,545105668,1,1436832000
449,A1TIQNQJZ2LDNW,545105668,1,1435968000
450,A1TIQNQJZ2LDNW,545105668,1,1434758400


### Split interaction data into train and test sets by time

In [23]:
# Order rows by timestamp, then use the 0.8 quantile of the timestamps as the
# train/test cut-off.
interactions = interactions.sort_values(by="timestamp")
split_time = interactions["timestamp"].quantile(q=0.8)

In [24]:
# Rows at or before the cut-off go to train; strictly later rows go to test.
train = interactions.loc[interactions["timestamp"].le(split_time)]
test = interactions.loc[interactions["timestamp"].gt(split_time)]

In [25]:
# Report the number of rows in each split.
for label, frame in (("Train interactions:", train), ("Test interactions:", test)):
    print(label, len(frame))

Train interactions: 418
Test interactions: 103


In [26]:
# Report the number of distinct users in each split.
for label, frame in (("Users in train:", train), ("Users in test:", test)):
    print(label, frame["user_id"].nunique())

Users in train: 58
Users in test: 25


In [28]:
# Write both splits to CSV without the DataFrame index.
for frame, out_path in (
    (train, "../data/processed/train.csv"),
    (test, "../data/processed/test.csv"),
):
    frame.to_csv(out_path, index=False)