In [1]:
import numpy as np
import pandas as pd

SEED = 12345
np.random.seed(SEED)

# Load original dataset
- dataset can be obtained from https://grouplens.org/datasets/movielens/25m/
- place `ratings.csv` in `./data/yelp2018/` folder

In [2]:
DATA_FOLDER = "./data/yelp2018/"

In [3]:
users = []
items = []
# we combine the train and test splits (weak generalization)
# and will construct strong generalization splits the same way we do it in other experiments
for split in ["train.txt", "test.txt"]:
    with open(DATA_FOLDER + "train.txt") as f:
        for line in f.readlines():
            entries = line.strip().split()
            users += [entries[0]] * (len(entries) - 1)
            items += entries[1:]

ratings = pd.DataFrame({"userId": users, "item_id": items})
ratings["rating"] = 1.

In [4]:
ratings.head()

Unnamed: 0,userId,item_id,rating
0,0,0,1.0
1,0,1,1.0
2,0,2,1.0
3,0,3,1.0
4,0,4,1.0


In [5]:
ratings.sort_values(["userId"], inplace=True)
ratings

Unnamed: 0,userId,item_id,rating
0,0,0,1.0
1237275,0,16,1.0
1237274,0,15,1.0
1237273,0,14,1.0
1237272,0,13,1.0
...,...,...,...
476870,9999,11262,1.0
476871,9999,25380,1.0
476872,9999,14000,1.0
476866,9999,3122,1.0


In [6]:
ratings["sessionId"] = 1  # just for consistency with other notebooks

In [7]:
len(ratings.item_id.unique()),len(ratings.userId.unique())

(38048, 31668)

## Data binarization
- data is already binary
- (unlike in MovieLens and BeerAdvocate) for testing, identify user-session pairs where there is at least 50 interactions (there is no negative feedback in this dataset, and individual users have many interactions)

In [8]:
df = ratings
df.shape

(2474518, 4)

In [9]:
# create a user_session identifier
df = df.copy()
df['user_session_id'] = (df['userId'].astype(str) + '_' + df['sessionId'].astype(str))
df = df.sort_values(by=['user_session_id'])

### Define sessions suitable for testing

In [10]:
testable_sessions = df.groupby('user_session_id').agg({"userId": "first", "rating": "count"}).rename(columns={"rating": "ratingsCount"}).reset_index()
testable_sessions["positiveCount"] = 0
positive_counts = df[df.rating == 1.].groupby('user_session_id').rating.count()
testable_sessions.loc[testable_sessions["user_session_id"].isin(positive_counts.index) ,"positiveCount"] = testable_sessions.loc[testable_sessions["user_session_id"].isin(positive_counts.index) ,"user_session_id"].apply(lambda x: positive_counts[x])
testable_sessions["negativeCount"] = testable_sessions["ratingsCount"] - testable_sessions["positiveCount"]

testable_sessions

Unnamed: 0,user_session_id,userId,ratingsCount,positiveCount,negativeCount
0,0_1,0,36,36,0
1,10000_1,10000,154,154,0
2,10001_1,10001,192,192,0
3,10002_1,10002,40,40,0
4,10003_1,10003,36,36,0
...,...,...,...,...,...
31663,9998_1,9998,46,46,0
31664,9999_1,9999,144,144,0
31665,999_1,999,136,136,0
31666,99_1,99,36,36,0


In [11]:
print(len(testable_sessions))
# we need at least 50 interactions
testable_sessions = testable_sessions[testable_sessions.positiveCount >= 50]
print(len(testable_sessions))

31668
16451


### Distinguish between validation and test sessions

In [12]:
# shuffle rows
testable_sessions = testable_sessions.sample(frac=1, random_state=SEED).reset_index(drop=True)
# add sessions randomly to validation set (0) or one of five CV splits (1-5)
testable_sessions["split"] = testable_sessions.index.values

split_number_to_string = {
    0: "val",  # 25% of sessions are validation
    1: "test",
    2: "test",
    3: "test",
}

testable_sessions["split"] = testable_sessions["split"].apply(lambda x: x % 4).apply(lambda x: split_number_to_string[x])

In [13]:
testable_sessions

Unnamed: 0,user_session_id,userId,ratingsCount,positiveCount,negativeCount,split
0,7018_1,7018,320,320,0,val
1,18474_1,18474,88,88,0,test
2,23539_1,23539,68,68,0,test
3,851_1,851,162,162,0,test
4,27164_1,27164,116,116,0,val
...,...,...,...,...,...,...
16446,2352_1,2352,98,98,0,test
16447,16942_1,16942,70,70,0,test
16448,30842_1,30842,56,56,0,val
16449,1361_1,1361,136,136,0,test


In [14]:
# check if every row is a unique user session
assert len(testable_sessions) == testable_sessions.user_session_id.nunique()

In [15]:
for split in ["train", "val", "test"]:
    print(f"number of testable sessions in split {split}: {len(testable_sessions[testable_sessions.split == split])}")

number of testable sessions in split train: 0
number of testable sessions in split val: 4113
number of testable sessions in split test: 12338


In [16]:
# map user_session_id to split
user_session_id_to_split = {usid: split for usid, split in zip(testable_sessions.user_session_id.values, testable_sessions.split.values)}

In [17]:
# assign dataframe entries to split
df["split"] = df["user_session_id"].apply(lambda x: user_session_id_to_split.get(x, "train"))

In [18]:
df.shape

(2474518, 6)

In [19]:
df.groupby("split")["rating"].count()

split
test     1424302
train     581810
val       468406
Name: rating, dtype: int64

## Save processed dataset

In [20]:
df.to_csv(DATA_FOLDER + "ratings_processed_Yelp.csv", index=False)