In [None]:
import numpy as np
import pandas as pd

In [ ]:
# Folder (relative to the working directory) that the input CSV is read from
# and that the generated CSV files are written to.
DATA_FOLDER = "data/"

In [None]:
# Load the raw ratings table from the data folder.
ratings_csv_path = DATA_FOLDER + "ratings.csv"
ratings = pd.read_csv(ratings_csv_path)

In [None]:
# Peek at the first five rows to check the available columns.
ratings.head(n=5)

In [None]:
# "timestamp" is interpreted as seconds since the Unix epoch and converted
# into a proper datetime column.
epoch_seconds = ratings["timestamp"]
ratings["datetime"] = pd.to_datetime(epoch_seconds, unit="s")

In [None]:
# Order the ratings by user and, within each user, chronologically.
# The per-user session computation further below relies on this ordering.
ratings = ratings.sort_values(by=["userId", "timestamp"])
ratings

In [None]:
# Placeholder session id (1 for every row); the real per-user session numbers
# are filled in further below.
ratings = ratings.assign(sessionId=1)

In [None]:
import seaborn as sns
# Histogram of when the ratings were given.
rating_times = ratings["datetime"]
sns.histplot(rating_times)

### Limit to only recent data (2015+)

In [None]:
# Keep only the ratings given in 2015 or later.
# The explicit .copy() makes the result an independent frame: the "sessionId"
# column of `ratings_2015plus` is overwritten later on, and writing into an
# un-copied slice of `ratings` triggers pandas' SettingWithCopyWarning.
recent_cutoff = np.datetime64('2015', 'Y')
ratings_2015plus = ratings.loc[ratings["datetime"] >= recent_cutoff].copy()
ratings_2015plus.shape

In [None]:
# (number of distinct movies, number of distinct users) in the 2015+ subset.
# dropna=False keeps the count identical to len(Series.unique()).
(
    ratings_2015plus["movieId"].nunique(dropna=False),
    ratings_2015plus["userId"].nunique(dropna=False),
)

### Make data session-based
- a new session starts whenever more than 24 hours passed between two consecutive ratings of the same user

In [None]:
# Assign session ids per user: a user's first rating belongs to session 1 and
# every gap of more than `sessionBound` seconds to that user's previous rating
# starts the next session.
#
# This is computed with vectorised groupby operations; filtering the complete
# frame once per user (as a Python loop would) is what made this step take
# minutes. The result relies on the rows being ordered by (userId, timestamp),
# which the earlier sort_values call on `ratings` guarantees.
sessionBound = 86400 # one day

# Seconds elapsed since the previous rating of the same user (NaN for the first one).
secondsSincePrev = ratings_2015plus.groupby("userId")["timestamp"].diff()

# NaN > sessionBound evaluates to False, so a user's first rating never opens
# an additional session.
sessIncrease = (secondsSincePrev > sessionBound).astype("int64")

# Running number of session starts within each user, shifted to start at 1.
sessNo = sessIncrease.groupby(ratings_2015plus["userId"]).cumsum() + 1

# assign() returns a new frame, so no chained-assignment warning is raised and
# re-running the cell gives the same result.
ratings_2015plus = ratings_2015plus.assign(sessionId=sessNo)
ratings_2015plus

In [None]:
# Number of (non-null) ratings in every (userId, sessionId) session.
ratings_by_session = ratings_2015plus.groupby(["userId", "sessionId"])
countData = ratings_by_session[["rating"]].count()
countData

In [None]:
# How many sessions exist for each session size, ordered by session size.
val_counts = countData["rating"].value_counts().sort_index()
val_counts

In [None]:
# Most sessions are small, containing at most a couple of items.
from matplotlib import pyplot as plt
plt.plot(val_counts)
# Zoom the x-axis in on session sizes up to 100.
size_axes = plt.gca()
size_axes.set_xlim(0, 100)

In [None]:
# ... but a few sessions are very large.
plt.plot(val_counts)
# Clip the y-axis at 50 so the rarely occurring large sizes become visible.
tail_axes = plt.gca()
tail_axes.set_ylim(0, 50)

In [None]:
# Mean number of items in a session with the given id
# (i.e. fewer items in later sessions).
countData.groupby(level="sessionId")[["rating"]].mean()

In [None]:
# Average number of sessions per user.
sessions_per_user = countData.reset_index().groupby("userId")["sessionId"].count()
sessions_per_user.mean()

### Prepare binary training data
- ratings < 2.5 are negative, ratings > 3.5 are positive; remove middle ones
- for testing, identify user-session pairs that contain at least one positive and one negative rating (and at least three non-neutral ratings in total)

In [None]:
# Drop the "neutral" ratings (2.5 to 3.5, both ends included) so that only
# clearly negative and clearly positive ratings remain.
# .copy() detaches the result from `ratings_2015plus`; the "rating" column of
# the restricted frame is overwritten afterwards, and doing that on an
# un-copied slice triggers pandas' SettingWithCopyWarning.
is_neutral = ratings_2015plus["rating"].between(2.5, 3.5)
ratings_2015plusRestr = ratings_2015plus.loc[~is_neutral].copy()
ratings_2015plusRestr.shape

In [None]:
# Binarise the remaining ratings: below 2.5 becomes -1, above 3.5 becomes 1.
# Both masks are computed from the unmodified `ratings_2015plus` values; pandas
# aligns them to the restricted frame by index label.
is_negative = ratings_2015plus.rating < 2.5
is_positive = ratings_2015plus.rating > 3.5
ratings_2015plusRestr.loc[is_negative, "rating"] = -1
ratings_2015plusRestr.loc[is_positive, "rating"] = 1
ratings_2015plusRestr

In [None]:
# Per (userId, sessionId): lowest label, highest label and number of ratings.
binary_by_session = ratings_2015plusRestr.groupby(["userId", "sessionId"])
rating_aggregates = {"rating": ["min", "max", "count"]}
sessionData = binary_by_session.agg(rating_aggregates)
sessionData

In [None]:
# A session is testable when it holds a negative (-1) and a positive (1)
# rating and has three or more ratings overall.
rating_stats = sessionData["rating"]
has_negative = rating_stats["min"] == -1.0
has_positive = rating_stats["max"] == 1.0
has_enough_ratings = rating_stats["count"] >= 3
testableSessions = sessionData.loc[has_negative & has_positive & has_enough_ratings]
# Flatten the two-level column header produced by the aggregation.
testableSessions.columns = ["minValue","maxValue", "ratingsCount"]
testableSessions

In [None]:
# Persist the binarised ratings; the frame's index is written as the first CSV column.
processed_ratings_path = DATA_FOLDER + "processedRatingsMovielens.csv"
ratings_2015plusRestr.to_csv(processed_ratings_path)

In [None]:
# Persist the testable sessions; the (userId, sessionId) index is written as the leading columns.
testable_sessions_path = DATA_FOLDER + "testableSessionsMovielens.csv"
testableSessions.to_csv(testable_sessions_path)

In [None]:
# Number of positive (== 1) ratings per (userId, sessionId).
positive_ratings = ratings_2015plusRestr.loc[ratings_2015plusRestr["rating"] == 1]
positiveCount = (
    positive_ratings
    .groupby(["userId", "sessionId"])[["rating"]]
    .count()
    .rename(columns={"rating": "positiveCount"})
)
# Show the sessions that contain more than one positive rating.
positiveCount.loc[positiveCount["positiveCount"] > 1]

In [None]:
# Attach the number of positive ratings to every testable session.
# Both frames are keyed by (userId, sessionId). When `testableSessions` comes
# straight from the groupby-based cells above, these keys already form its
# index and set_index(["userId", "sessionId"]) raises a KeyError; they are
# ordinary columns only when the frame was re-read from CSV. Handle both.
session_keys = ["userId", "sessionId"]
if set(session_keys).issubset(testableSessions.columns):
    testableSessions = testableSessions.set_index(session_keys)
# Drop a "positiveCount" column left over from an earlier run of this cell (or
# from a reloaded CSV) so the join does not fail on overlapping column names.
testableSessions = (
    testableSessions
    .drop(columns="positiveCount", errors="ignore")
    .join(positiveCount)
)
testableSessions

In [None]:
# Keep only the sessions that contain two or more positive ratings.
has_two_positives = testableSessions["positiveCount"] >= 2
testableSessions = testableSessions.loc[has_two_positives]

In [None]:
# Display the testable sessions that remain after filtering.
testableSessions

In [None]:
# Write the filtered testable sessions, replacing the file saved earlier under the same name.
filtered_sessions_path = DATA_FOLDER + "testableSessionsMovielens.csv"
testableSessions.to_csv(filtered_sessions_path)

In [None]:
# Reload the two intermediate CSV files written by the earlier cells.
# processedRatingsMovielens.csv was saved together with its index, which ends
# up as the first, unnamed CSV column; index_col=0 restores it as the index
# instead of adding a spurious "Unnamed: 0" data column.
ratings_2015plusRestr = pd.read_csv(DATA_FOLDER + "processedRatingsMovielens.csv", index_col=0)
# userId / sessionId are deliberately left as ordinary columns here.
testableSessions = pd.read_csv(DATA_FOLDER + "testableSessionsMovielens.csv")

In [None]:
# Non-null value count of every column for each (userId, sessionId) group.
reloaded_by_session = ratings_2015plusRestr.groupby(["userId", "sessionId"])
reloaded_by_session.count()

In [None]:
def numpy_combinations(x):
    """Return all unordered pairs of elements of ``x``.

    Pairs follow the order of ``np.triu_indices`` over the strict upper
    triangle: ``[x[0], x[1]], [x[0], x[2]], ..., [x[n-2], x[n-1]]``.

    Parameters
    ----------
    x : array-like, 1-D
        Items to pair up. Anything ``np.asarray`` accepts (ndarray, list,
        tuple) works; an ndarray is used as-is.

    Returns
    -------
    list of list
        One 2-element list per pair, ``n * (n - 1) / 2`` pairs in total;
        an empty list when ``x`` has fewer than two elements.
    """
    # Fancy indexing below needs an ndarray; plain lists would raise a TypeError.
    items = np.asarray(x)

    # Index pairs (i, j) with i < j, stacked into shape (n_pairs, 2).
    idx = np.stack(np.triu_indices(len(items), k=1), axis=-1)

    return items[idx].tolist()

In [None]:
# Collect every unordered pair of movies that occur together in at least one
# session. Keys are (smaller movieId, larger movieId) tuples; the value is
# always 1, so the dict effectively acts as a set of co-occurring pairs.
#
# groupby hands out the rows of each user / session directly instead of
# re-filtering the whole frame with a boolean mask for every user and every
# session. sort=False keeps users and sessions in order of first appearance,
# matching iteration over `unique()`.
records = {}
for i, (uid, u_dt) in enumerate(ratings_2015plusRestr.groupby("userId", sort=False), start=1):
    for sid, u_s_dt in u_dt.groupby("sessionId", sort=False):
        # Sorting makes every pair come out as (smaller id, larger id).
        items = np.sort(u_s_dt.movieId.unique())
        # numpy_combinations returns 2-element lists; convert them to tuples
        # because dict keys must be hashable.
        pairs = map(tuple, numpy_combinations(items))
        records.update(dict.fromkeys(pairs, 1))
    # Progress indicator: print the number of users processed so far.
    if i % 50 == 0:
        print(i)
len(records)
        