In [0]:
%pip install recommenders
%pip install datetime

In [0]:
# Imports: interpreter info, Spark, pandas/NumPy, and the Recommenders helpers used below
import sys

import pyspark
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.datasets.download_utils import maybe_download
from recommenders.datasets.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)
from recommenders.datasets.spark_splitters import spark_random_split

# Report the interpreter and PySpark versions this notebook is running with.
print(
    "System version: {}".format(sys.version),
    "Pyspark version: {}".format(pyspark.__version__),
    sep="\n",
)

In [0]:
%scala
// <USER INPUT FILEPATH PARQUET OR CSV>
// Path of the ratings CSV to load.
val filepath1 = "abfss://.../mldata/MoviesDataRecommendation/ratings.csv"
// Read it as a comma-delimited CSV whose first row is the header (no schema inference is requested).
var df1 = spark.read
  .options(Map("header" -> "true", "delimiter" -> ","))
  .csv(filepath1)
// Register the result as the temp view "ratings" so it can be queried with spark.sql.
df1.createOrReplaceTempView("ratings")


In [0]:
%scala
// Path of the movies CSV to load.
val filepath2 = "abfss://.../mldata/MoviesDataRecommendation/movies.csv"
// Read it as a comma-delimited CSV whose first row is the header (no schema inference is requested).
var df2 = spark.read
  .options(Map("header" -> "true", "delimiter" -> ","))
  .csv(filepath2)
// Register the result as the temp view "movies" so it can be queried with spark.sql.
df2.createOrReplaceTempView("movies")

In [0]:
# Pull the `ratings` temp view into a PySpark DataFrame, fix column types,
# convert it to pandas and persist it as CSV for the later cells.
from pyspark.sql.functions import col

df1 = spark.sql("""select * from ratings""")

# Target Spark SQL type per column; columns not listed keep the type they were loaded with.
# NOTE(review): an integer `rating` column cannot hold half-star values (e.g. 3.5) —
# confirm the source file only contains whole-number ratings.
ratings_casts = {"rating": "int"}
for col_name, spark_type in ratings_casts.items():
    df1 = df1.withColumn(col_name, col(col_name).cast(spark_type))

# toPandas() collects every row onto the driver, so the data must fit in driver memory.
df_ratings = df1.toPandas()

# Persist for handy use: later cells reload this file with pandas.read_csv.
outdir = '/dbfs/FileStore/df_ratings.csv'
df_ratings.to_csv(outdir, index=False)

In [0]:
# Pull the `movies` temp view into a PySpark DataFrame, convert it to pandas
# and persist it as CSV.
df1 = spark.sql("""select * from movies""")

# movieId, title and genres are all kept as loaded, so no casts are applied here.
# The result gets its own name so the ratings frame (`df_ratings`) built earlier
# is not overwritten with movie data.
df_movies = df1.toPandas()

# Persist for handy use.
outdir = '/dbfs/FileStore/df_movies.csv'
df_movies.to_csv(outdir, index=False)

## DATA PREPARATION

In [0]:
import pandas as pd

# Column names shared by the remaining cells of the notebook.
COL_USER, COL_ITEM, COL_RATING = "userId", "movieId", "rating"
# NOTE(review): the prediction column reuses the rating column name and is not
# referenced by the cells visible here — confirm this is intended.
COL_PREDICTION = "rating"
COL_TIMESTAMP = "timestamp"

# Reload the ratings that were persisted to DBFS as CSV.
data = pd.read_csv("/dbfs/FileStore/df_ratings.csv", header='infer')

In [0]:
# Headline statistics: number of rating rows, distinct users and distinct items.
summary_lines = [
    "Total number of ratings are\t{}".format(len(data)),
    "Total number of users are\t{}".format(data[COL_USER].nunique()),
    "Total number of items are\t{}".format(data[COL_ITEM].nunique()),
]
print("\n".join(summary_lines))

In [0]:
# Original epoch-second timestamps are converted to "YYYY-MM-DD HH:MM:SS" strings.
print("*****************Data Transformation********************\n")
# Vectorised conversion (no per-row Python call). The dtype guard makes the cell safe to
# re-run: once the column already holds formatted strings the conversion is skipped.
if pd.api.types.is_numeric_dtype(data[COL_TIMESTAMP]):
    data[COL_TIMESTAMP] = pd.to_datetime(data[COL_TIMESTAMP], unit="s").dt.strftime("%Y-%m-%d %H:%M:%S")

# Deduplicating data
data = data.drop_duplicates()

display(data)


# Data aggregation / Affinity scores
# Data is aggregated per (user, item) pair to generate scores that represent preferences
# (called affinity scores). The affinity score measures how confident we are that a
# particular item fits the wants and needs of an individual user.


# 1. Count - the simplest technique: count the interactions between each user and item.
print("*****************1.Count********************\n")
data_count = data.groupby([COL_USER, COL_ITEM]).agg({COL_TIMESTAMP: 'count'}).reset_index()
data_count.columns = [COL_USER, COL_ITEM, 'Affinity']
display(data_count.head())


# 2. Weighted count
# Each distinct rating is assigned a weight and the weights are summed per (user, item) pair.
# Ratings 0, 1, 2, 3, 4 and 5 are mapped to weights 1, 2, 3, 4, 5 and 6 respectively;
# any other rating value gets weight 0.
print("*****************2.Weighted Count********************\n")
data_w = data.copy()
rating_values = [0, 1, 2, 3, 4, 5]
# Choices or weights for each rating, in the same order as rating_values.
choices = [1, 2, 3, 4, 5, 6]
# numpy.select() returns an array drawn from `choices` according to the first matching condition.
conditions = [data_w[COL_RATING] == rating_value for rating_value in rating_values]
data_w['Weight'] = np.select(conditions, choices, default=0)
# Do count with weight.
data_w_count = data_w.groupby([COL_USER, COL_ITEM])['Weight'].sum().reset_index()
data_w_count.columns = [COL_USER, COL_ITEM, 'Affinity']
display(data_w_count.head())


# 3. Time dependent count
# In many scenarios time dependency plays a critical role in preparing a dataset for a
# collaborative filtering model that captures how user interests drift over time. A common
# technique is to add a time decay factor to the count (this technique is used in SAR).
# The affinity score of each user-item pair is
#     a_{ij} = \sum_k w_k * (1/2)^{(t_0 - t_k) / T}
# where a_{ij} is the affinity score, w_k is the interaction weight, t_0 is a reference time,
# t_k is the timestamp of the k-th interaction, and T is a hyperparameter that controls the
# speed of decay (the half-life).
# Here T is 5 days and the latest time in the dataset is used as the reference time.
print("*****************3.Time Decay Count********************\n")
T = 5  # half-life, in days
interaction_time = pd.to_datetime(data_w[COL_TIMESTAMP])
t_ref = interaction_time.max()
# Whole days elapsed between each interaction and the reference time.
age_in_days = (t_ref - interaction_time).dt.days
# Calculate the weighted count with time decay (vectorised over all rows).
data_w['Timedecay'] = data_w['Weight'] * np.power(0.5, age_in_days / T)
data_timedecay_count = data_w.groupby([COL_USER, COL_ITEM])['Timedecay'].sum().reset_index()
data_timedecay_count.columns = [COL_USER, COL_ITEM, 'Affinity']
display(data_timedecay_count.head())

userId,movieId,rating,timestamp
1,296,5,2006-05-17 15:34:04
1,306,3,2006-05-17 12:26:57
1,307,5,2006-05-17 12:27:08
1,665,5,2006-05-17 15:13:40
1,899,3,2006-05-17 12:21:50
1,1088,4,2006-05-17 12:21:35
1,1175,3,2006-05-17 12:27:06
1,1217,3,2006-05-17 15:05:26
1,1237,5,2006-05-17 12:27:19
1,1250,4,2006-05-17 12:20:14


In [0]:
# Negative Sampling
# A dataset of implicit interaction records can be binarized into one that holds only 1 or 0,
# indicating whether or not a user has interacted with an item. "Negative sampling" is a
# technique that samples negative feedback. As with the aggregation techniques, negative
# feedback can be defined differently in different scenarios. Here, the items a user has not
# interacted with are regarded as items the user does not like. This may be a strong
# assumption in many use cases, but it is reasonable when users have few interactions.


# A) Existing interactions: one row per (user, item) pair that appears in the data, Feedback=1.
print("Data with existing Interractions")
data_A = data[[COL_USER, COL_ITEM]].copy()
data_A['Feedback'] = 1
data_A = data_A.drop_duplicates()

# B) Negative sampling: build every (user, item) combination, join the existing interactions
# onto it, and fill the pairs that have no interaction with 0.
# NOTE: the full cross product has (number of users) x (number of items) rows, so this is only
# practical for small datasets.
users = data[COL_USER].unique()
items = data[COL_ITEM].unique()
# Cartesian product built in one vectorised step instead of a nested Python loop.
data_B_all_dummy = pd.MultiIndex.from_product(
    [users, items], names=[COL_USER, COL_ITEM]
).to_frame(index=False)
data_B_all_dummy["FeedbackAll"] = 0

data_B_ns = (
    pd.merge(data_B_all_dummy, data_A, on=[COL_USER, COL_ITEM], how='outer')
    .fillna(0)
    .drop('FeedbackAll', axis=1)
)
display(data_B_ns.head())

1. Random Split: this is the simplest way to split the data; it randomly assigns entries to either the train set or the test set based on the desired allocation ratio.
2. Chronological Split: in many cases, accounting for temporal variations when evaluating your model can provide more realistic measures of performance. This approach splits the data into train and test sets based on timestamps, by user or by item.
3. Stratified Split: it may be preferable to ensure that the same set of users or items is in both the train and test sets; this method of splitting ensures that is the case.

In [0]:
# 1. Random split
# Random split simply takes in a data set and outputs the splits of the data, given the split ratios.
print("******************************Random split******************************\n")
data_train_random, data_validate_random, data_test_random = python_random_split(data, ratio=[0.6, 0.2, 0.2])
print("Size of Train: ", data_train_random.shape[0])
print("Size of Validate: ", data_validate_random.shape[0])
print("Size of Test: ", data_test_random.shape[0])

# 2. Chronological split
# * The chronological splitting method takes in a dataset and splits it on timestamp.
# * Suppose the recommender is to recommend fashion products to customers. It makes sense for
#   the evaluation to consider the time-dependency of customer purchases, as customers' tastes
#   in fashion items may drift over time. In this case a chronological split should be used.
# * Chronological splitting can be either by "user" or by "item". For example, if it is by
#   "user" and the splitting ratio is 0.7, the first 70% of ratings for each user are put into
#   one split while the other 30% go into another. A chronological split is not "random"
#   because splitting is timestamp-dependent.
print("******************************Chronological split******************************\n")
data_train_chrono, data_test_chrono = python_chrono_split(
    data, ratio=0.7, filter_by="user",
    col_user=COL_USER, col_item=COL_ITEM, col_timestamp=COL_TIMESTAMP
)
print("The last 10 rows of the train data:")
display(data_train_chrono[data_train_chrono[COL_USER] == 1].tail(10))
print('The first 10 rows of the test data:Timestamps of train data are all precedent to those in test data')
display(data_test_chrono[data_test_chrono[COL_USER] == 1].head(10))

# A min-rating filter can be applied to the data before it is split by the chronological
# splitter. The reason is that, for multi-split, there should be a sufficient number of
# ratings per user/item in the data.
# For example, the following means splitting only applies to users that have at least 10 ratings.
# The number of rows in the yielded splits may not sum to the original count, as users with
# fewer than 10 ratings are filtered out in the splitting.

#data_train_chrono, data_test_chrono = python_chrono_split(
#    data, filter_by="user", min_rating=10, ratio=0.7,
#    col_user=COL_USER, col_item=COL_ITEM, col_timestamp=COL_TIMESTAMP
#)
#print(data_train_chrono.shape[0] + data_test_chrono.shape[0], data.shape[0])


# 3. Stratified split
# * This split is stratified so that the same set of users or items appears in both the
#   training and testing data sets.
# * As with the chronological splitter, filter_by and min_rating also apply to the stratified splitter.
# * The following example splits the sample data with a ratio of 0.7, keeping only users that
#   have at least 10 ratings.
# The number of rows in the yielded splits may not sum to the original count, as users with
# fewer than 10 ratings are filtered out in the splitting.
print("******************************Stratified split******************************\n")
data_train_stratify, data_test_stratify = python_stratified_split(
    data, filter_by="user", min_rating=10, ratio=0.7,
    col_user=COL_USER, col_item=COL_ITEM
)
# The call above is unpacked into a train and a test frame only, so there is no
# validation split to report here.
print("Size of Train: ", data_train_stratify.shape[0])
print("Size of Test: ", data_test_stratify.shape[0])

userId,movieId,rating,timestamp
1,4703,4,2006-05-17 12:33:43
1,31956,3,2006-05-17 14:53:30
1,5147,4,2006-05-17 14:54:14
1,8786,4,2006-05-17 14:57:33
1,1260,3,2006-05-17 14:57:37
1,2351,4,2006-05-17 14:59:17
1,7940,4,2006-05-17 14:59:27
1,7209,4,2006-05-17 14:59:46
1,8685,1,2006-05-17 15:00:23
1,7820,2,2006-05-17 15:00:50


userId,movieId,rating,timestamp
1,7937,3,2006-05-17 15:00:55
1,7938,2,2006-05-17 15:01:03
1,8405,3,2006-05-17 15:01:35
1,4325,5,2006-05-17 15:02:02
1,2632,5,2006-05-17 15:04:08
1,1217,3,2006-05-17 15:05:26
1,8729,3,2006-05-17 15:07:32
1,5912,3,2006-05-17 15:11:38
1,5767,5,2006-05-17 15:12:09
1,665,5,2006-05-17 15:13:40
