## Data source

Data are from the famous [MovieLens 10M dataset](https://grouplens.org/datasets/movielens/10m/); 'MovieLens 10M movie ratings. Stable benchmark dataset. 10 million ratings and 100,000 tag applications applied to 10,000 movies by 72,000 users. Released 1/2009'

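Data are from the famous [MovieLens 1M dataset](https://grouplens.org/datasets/movielens/1m/); 'MovieLens 1M movie ratings. Stable benchmark dataset. 1 million ratings from 6000 users on 4000 movies. Released 2/2003.'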

In [1]:
# set project root
import sys
sys.path.append("../")

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import sys
import pandas as pd
import itertools

from datetime import datetime, timedelta

from reco_utils.dataset import movielens
from reco_utils.dataset.download_utils import maybe_download
from reco_utils.dataset.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)

In [2]:
# Notebook configuration: the dataset size passed to the loader and the
# random state handed to the stratified splitter.
MOVIELENS_DATA_SIZE, SEED = "100k", 1066

# Column names used to address the ratings frame.
COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP = (
    "userID",
    "itemID",
    "rating",
    "timestamp",
)

In [3]:
# Load the ratings for the configured dataset size and discard duplicate rows.
full = (
    movielens
    .load_pandas_df(size=MOVIELENS_DATA_SIZE)
    .drop_duplicates()
)

# Peek at the first few rows.
full.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 22.5kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [4]:
# Summary statistics (count, mean, std, quartiles, ...) of the ratings frame.
full.describe()

Unnamed: 0,userID,itemID,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [5]:
# Per-item popularity: for each item, the number of non-null entries in every
# other column. Group on COL_ITEM instead of a hard-coded 'itemID' so this
# cell stays in sync with the column-name constants used everywhere else.
popularity = full.groupby([COL_ITEM]).count()
# Saved relative to the current working directory.
popularity.to_csv('popularity.csv')

In [6]:
def moviedata_describe(df, part="full"):
    print(
        "==={}===".format(part),
        "Total number of ratings are\t{}".format(df.shape[0]),
        "Total number of users are\t{}".format(df[COL_USER].nunique()),
        "Total number of items are\t{}".format(df[COL_ITEM].nunique()),
        "Minimum # ratings / user is\t{}" \
            .format(full[COL_USER].value_counts().iloc[-1]),
        sep="\n"
    )

# Summary of the complete ratings frame, before any splitting.
moviedata_describe(full)

===full===
Total number of ratings are	100000
Total number of users are	943
Total number of items are	1682
Minimum # ratings / user is	20


We want to split the data into 70% TRAIN / 15% VALIDATE / 15% TEST, stratified such that each partition has at least one rating from every user and at least one rating for every movie. `reco_utils` only lets us stratify by one or the other; fortunately, [Taylor G. Smith has a gist](https://gist.github.com/tgsmith61591/ce7d614d7a0442f94cd5ae5d1e51d3c2) that might help us.

In [7]:
from upstream.collab_split import get_stratified_tr_mask

def get_mask(df, train_size):
    """Return the train-selection mask for ``df`` from the stratified splitter.

    The user and item columns are handed to ``get_stratified_tr_mask`` as
    NumPy arrays, with ``SEED`` as the random state.
    """
    user_ids = df[COL_USER].to_numpy()
    item_ids = df[COL_ITEM].to_numpy()
    return get_stratified_tr_mask(
        user_ids, item_ids, train_size=train_size, random_state=SEED
    )

# First cut: rows selected by the 0.7 mask become train, the rest is held out.
train_mask = get_mask(full, 0.7)
train, rest = full[train_mask], full[~train_mask]

# Second cut: split the held-out rows in half into validation and test.
val_mask = get_mask(rest, 0.5)
val, test = rest[val_mask], rest[~val_mask]



In [8]:
# Summarise each partition produced by the stratified split.
for part_name, part_df in (("train", train), ("val", val), ("test", test)):
    moviedata_describe(part_df, part=part_name)

===train===
Total number of ratings are	69998
Total number of users are	943
Total number of items are	1682
Minimum # ratings / user is	20
===val===
Total number of ratings are	15219
Total number of users are	943
Total number of items are	1443
Minimum # ratings / user is	20
===test===
Total number of ratings are	14783
Total number of users are	940
Total number of items are	1243
Minimum # ratings / user is	20


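Well, UNFORTUNATELY, that didn't do quite what was wanted (the validation set covers only 1,443 of the 1,682 movies, and the test set only 940 of the 943 users and 1,243 of the movies), but it looks like it got us close.  Let's just keep the users and movies that appear in all three partitions (the set intersection) and see what we get.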

In [9]:
# Users and movies that occur in every one of the three partitions.
all_users = set(train[COL_USER]) & set(val[COL_USER]) & set(test[COL_USER])
all_movies = set(train[COL_ITEM]) & set(val[COL_ITEM]) & set(test[COL_ITEM])

In [10]:
def intersect_ratings(df):
    """Keep only the rows of ``df`` whose item is in ``all_movies`` and whose
    user is in ``all_users``."""
    known_item = df[COL_ITEM].isin(all_movies)
    known_user = df[COL_USER].isin(all_users)
    return df[known_item & known_user]

# Apply the shared-user / shared-movie filter to each partition.
train_final, val_final, test_final = (
    intersect_ratings(part) for part in (train, val, test)
)

In [11]:
# Summarise the partitions again after filtering to the shared users / movies.
for part_name, part_df in (
    ("train", train_final),
    ("val", val_final),
    ("test", test_final),
):
    moviedata_describe(part_df, part=part_name)

===train===
Total number of ratings are	68582
Total number of users are	940
Total number of items are	1243
Minimum # ratings / user is	20
===val===
Total number of ratings are	14936
Total number of users are	940
Total number of items are	1243
Minimum # ratings / user is	20
===test===
Total number of ratings are	14783
Total number of users are	940
Total number of items are	1243
Minimum # ratings / user is	20


In [12]:
# Every line printed should be True; any False means the partitions do not
# share the same users / items and needs to be checked.
train_users, val_users, test_users = (
    set(part[COL_USER]) for part in (train_final, val_final, test_final)
)
train_items, val_items, test_items = (
    set(part[COL_ITEM]) for part in (train_final, val_final, test_final)
)

print(
    train_users == test_users,
    val_users == train_users,
    train_items == test_items,
    val_items == train_items,
    sep="\n",
)


True
True
True
True


In [13]:
# to_feather
# reset_index() moves each frame's current row labels into a regular column
# before the frame is written.
for split_name, split_df in (
    ("train", train_final),
    ("val", val_final),
    ("test", test_final),
):
    split_df.reset_index().to_feather(
        '../data/' + split_name + MOVIELENS_DATA_SIZE + '.feather'
    )


In [14]:
# to_parquet
# Same three partitions as the feather export, written as parquet files.
for split_name, split_df in (
    ("train", train_final),
    ("val", val_final),
    ("test", test_final),
):
    split_df.reset_index().to_parquet(
        '../data/' + split_name + MOVIELENS_DATA_SIZE + '.parquet'
    )

In [15]:
# make full user / item table
#
# Cross join of every shared user with every shared movie, built by merging
# the two single-column frames on a constant helper column.
#
# NOTE: this rebinds `full`, which until now held the ratings frame. Cells
# above that read `full` must not be re-run after this one without reloading
# the ratings first.
df_users = pd.DataFrame({COL_USER: list(all_users)})
df_users['key'] = 1

df_movies = pd.DataFrame({COL_ITEM: list(all_movies)})
df_movies['key'] = 1

# `columns=` replaces the positional axis argument of `drop('key', 1)`, which
# newer pandas releases no longer accept. The user / item columns are already
# named via COL_USER / COL_ITEM, so no rename is needed after the merge.
full = pd.merge(df_users, df_movies, on='key').drop(columns='key')

full.reset_index().to_feather('../data/full_indices' + MOVIELENS_DATA_SIZE + '.feather')