In [1]:
import numpy as np
import pandas as pd

# Single seed shared by every random step in this notebook (also passed as
# `random_state` when the testable sessions are shuffled).
SEED = 12345
# Seed NumPy's global random number generator.
np.random.seed(SEED)

# Load original dataset
- dataset can be obtained from https://grouplens.org/datasets/movielens/25m/
- place `ratings.csv` in `./data/` folder

In [2]:
# Folder that holds the input `ratings.csv` and receives the processed CSV.
# The trailing slash matters: file names are appended by plain string concatenation.
DATA_FOLDER = "./data/"

In [3]:
# Read the raw MovieLens ratings table from the data folder.
ratings_path = DATA_FOLDER + "ratings.csv"
ratings = pd.read_csv(ratings_path)

In [4]:
# Preview the first rows of the raw ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
# Add a `datetime` column by interpreting `timestamp` as a count of seconds.
epoch_seconds = ratings["timestamp"]
ratings["datetime"] = pd.to_datetime(epoch_seconds, unit="s")

- rename `movieId` to `item_id`

In [6]:
# Use the generic column name `item_id` in place of `movieId`.
column_renames = {"movieId": "item_id"}
ratings = ratings.rename(column_renames, axis="columns")

In [7]:
# Order the table by user and, within each user, by rating time.
# Reassigning (instead of `inplace=True`) avoids mutating an object that earlier
# cells already displayed and gives the same result when the cell is re-run.
ratings = ratings.sort_values(["userId", "timestamp"])
ratings

Unnamed: 0,userId,item_id,rating,timestamp,datetime
36,1,5952,4.0,1147868053,2006-05-17 12:14:13
13,1,2012,2.5,1147868068,2006-05-17 12:14:28
12,1,2011,2.5,1147868079,2006-05-17 12:14:39
11,1,1653,4.0,1147868097,2006-05-17 12:14:57
9,1,1250,4.0,1147868414,2006-05-17 12:20:14
...,...,...,...,...,...
24999971,162541,1259,4.5,1240953609,2009-04-28 21:20:09
24999972,162541,1266,5.0,1240953613,2009-04-28 21:20:13
24999982,162541,1556,1.0,1240953650,2009-04-28 21:20:50
24999925,162541,293,4.0,1240953789,2009-04-28 21:23:09


In [None]:
# Every rating gets the constant session id 1.
# NOTE(review): saved outputs further down list ids such as `100001_2`, which a
# constant session id cannot produce — presumably they come from an earlier run; confirm.
ratings["sessionId"] = 1

In [None]:
import seaborn as sns

# Histogram of rating times across the whole dataset.
sns.histplot(data=ratings, x="datetime")

### Limit to only recent data (2015+)

In [10]:
# Keep only ratings made from the start of 2015 onwards.
cutoff = np.datetime64('2015', 'Y')
is_recent = ratings["datetime"] >= cutoff
ratings_recent = ratings.loc[is_recent]
ratings_recent.shape

(7563741, 6)

In [11]:
# Number of distinct items and distinct users in the recent data.
n_items = len(ratings_recent["item_id"].unique())
n_users = len(ratings_recent["userId"].unique())
n_items, n_users

(58133, 45450)

## Data binarization
- ratings < 2.5 are negative (-1), ratings > 3.5 are positive (+1); the middle ones (2.5 to 3.5 inclusive) are removed
- for testing, identify user-session pairs with at least three positive and at least three negative ratings

In [16]:
# Drop "neutral" ratings (2.5 to 3.5 inclusive) so only clear negatives and
# positives remain.
# `.copy()` makes `df` an independent frame, so later column assignments on `df`
# never act on a slice of `ratings_recent` (avoids SettingWithCopyWarning).
recent_rating = ratings_recent["rating"]
is_neutral = (recent_rating >= 2.5) & (recent_rating <= 3.5)
df = ratings_recent.loc[~is_neutral].copy()
df.shape

(4644545, 6)

In [17]:
# Binarize ratings: below 2.5 -> -1, above 3.5 -> +1.
# The labels are derived from the untouched ratings in `ratings_recent` (aligned
# on the row index) rather than from `df.rating` itself. Overwriting `df.rating`
# in place is not idempotent: on a second run the already-binarized +1 values
# satisfy `< 2.5` and would all flip to -1.
raw_rating = ratings_recent.loc[df.index, "rating"]
binary_rating = raw_rating.mask(raw_rating < 2.5, -1.0).mask(raw_rating > 3.5, 1.0)
# `assign` returns a new frame, so nothing is written into a slice of `ratings_recent`.
df = df.assign(rating=binary_rating)
df

Unnamed: 0,userId,item_id,rating,timestamp,datetime,sessionId
266,3,356,1.0,1439472199,2015-08-13 13:23:19,1
272,3,593,1.0,1439472203,2015-08-13 13:23:23,1
254,3,1,1.0,1439472215,2015-08-13 13:23:35,1
268,3,480,-1.0,1439472219,2015-08-13 13:23:39,1
317,3,2571,1.0,1439472221,2015-08-13 13:23:41,1
...,...,...,...,...,...,...
24999746,162538,69757,1.0,1438785456,2015-08-05 14:37:36,1
24999729,162538,49286,1.0,1438785469,2015-08-05 14:37:49,1
24999715,162538,8533,1.0,1438785506,2015-08-05 14:38:26,1
24999762,162538,93988,1.0,1438785545,2015-08-05 14:39:05,1


In [18]:
# Build the "<userId>_<sessionId>" key and order rows by that key, then by time.
# The key is a string, so the ordering on it is lexicographic, not numeric.
user_part = df['userId'].astype(str)
session_part = df['sessionId'].astype(str)
df = (
    df.assign(user_session_id=user_part + '_' + session_part)
      .sort_values(by=['user_session_id', 'timestamp'])
)

### Define sessions suitable for testing

In [19]:
# One row per user session: owning user and total number of ratings.
testable_sessions = (
    df.groupby('user_session_id')
      .agg(userId=("userId", "first"), ratingsCount=("rating", "count"))
      .reset_index()
)

# Number of positive (+1) ratings per session; sessions without any positive
# rating are absent from this Series.
positive_counts = df[df.rating == 1.].groupby('user_session_id').rating.count()

# Vectorized lookup: sessions missing from `positive_counts` map to NaN and are
# filled with 0 (replaces a per-row Python lambda behind a duplicated `isin` mask).
testable_sessions["positiveCount"] = (
    testable_sessions["user_session_id"]
    .map(positive_counts)
    .fillna(0)
    .astype("int64")
)
# Everything that is not positive is counted as negative.
testable_sessions["negativeCount"] = testable_sessions["ratingsCount"] - testable_sessions["positiveCount"]

testable_sessions

Unnamed: 0,user_session_id,userId,ratingsCount,positiveCount,negativeCount
0,100001_1,100001,71,69,2
1,100001_2,100001,106,103,3
2,100007_1,100007,8,8,0
3,10000_1,10000,76,34,42
4,100014_1,100014,133,133,0
...,...,...,...,...,...
283448,9998_8,9998,2,2,0
283449,9998_9,9998,1,1,0
283450,99991_1,99991,112,75,37
283451,99999_1,99999,505,456,49


In [20]:
print(len(testable_sessions))
# a session is testable only with >= 3 positive and >= 3 negative interactions
has_enough_positive = testable_sessions.positiveCount >= 3
has_enough_negative = testable_sessions.negativeCount >= 3
testable_sessions = testable_sessions[has_enough_positive & has_enough_negative]
print(len(testable_sessions))

283453
34112


### Distinguish between validation and test sessions

In [21]:
# Shuffle the rows reproducibly, then renumber them 0..n-1.
testable_sessions = testable_sessions.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Bucket number -> split name. With four buckets, one goes to validation (25%)
# and three go to test (75%).
split_number_to_string = {
    0: "val",  # 25% of sessions are validation
    1: "test",
    2: "test",
    3: "test",
}

# Because the rows were just shuffled, "row position modulo number of buckets"
# is a random but evenly sized assignment of sessions to buckets.
bucket = testable_sessions.index.to_series() % len(split_number_to_string)
testable_sessions["split"] = bucket.map(split_number_to_string)

In [22]:
# Show the shuffled sessions together with their assigned split.
testable_sessions

Unnamed: 0,user_session_id,userId,ratingsCount,positiveCount,negativeCount,split
0,41935_1,41935,118,44,74,val
1,108975_1,108975,217,156,61,test
2,152928_22,152928,9,3,6,test
3,12510_1,12510,37,28,9,test
4,2610_6,2610,26,19,7,val
...,...,...,...,...,...,...
34107,11750_14,11750,12,6,6,test
34108,38435_2,38435,48,17,31,val
34109,109208_1,109208,22,18,4,test
34110,149900_1,149900,53,48,5,test


In [23]:
# sanity check: each user session appears in exactly one row
n_rows = len(testable_sessions)
n_distinct_sessions = testable_sessions.user_session_id.nunique()
assert n_rows == n_distinct_sessions

In [24]:
# Report how many testable sessions ended up in each split.
for split in ["train", "val", "test"]:
    sessions_in_split = testable_sessions[testable_sessions["split"] == split]
    n_sessions = len(sessions_in_split)
    print(f"number of testable sessions in split {split}: {n_sessions}")

number of testable sessions in split train: 0
number of testable sessions in split val: 8528
number of testable sessions in split test: 25584


In [25]:
# lookup table: user_session_id -> split name
session_ids = testable_sessions.user_session_id.values
split_labels = testable_sessions.split.values
user_session_id_to_split = dict(zip(session_ids, split_labels))

In [26]:
# assign dataframe entries to split
# Vectorized dictionary lookup (replaces a per-row Python lambda): sessions that
# are missing from the mapping come back as NaN and are labelled "train".
df["split"] = df["user_session_id"].map(user_session_id_to_split).fillna("train")

In [27]:
# Training rows keep only positive (+1) interactions; validation and test rows
# keep both polarities.
# NOTE(review): the saved output of this cell quotes a
# `train_df.groupby('item_id', ...)` step that is not in the code here —
# presumably from an earlier version of the cell; confirm before relying on the
# saved counts.
is_train = df["split"] == "train"
is_positive = df["rating"] == 1.

train_df = df[is_train & is_positive]
eval_df = df[~is_train]

# Training rows first, followed by the evaluation rows.
df = pd.concat([train_df, eval_df])

count    36130.000000
mean        49.365652
std        330.643594
min          1.000000
25%          1.000000
50%          2.000000
75%          8.000000
max      11858.000000
Name: count, dtype: float64
count    36130.000000
mean         6.269278
std         14.097070
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        100.000000
Name: count, dtype: float64


  train_df = train_df.groupby('item_id', group_keys=False).apply(


In [28]:
# (rows, columns) of the combined train + evaluation frame.
df.shape

(3049349, 8)

In [29]:
# Number of (non-null) ratings in each split.
df.groupby("split")["rating"].count()

split
test     2139835
train     188385
val       721129
Name: rating, dtype: int64

## Save processed dataset

In [30]:
# Write the processed interactions to the data folder, without the index column.
output_path = DATA_FOLDER + "ratings_processed_MovieLens_dense.csv"
df.to_csv(output_path, index=False)