<a href="https://colab.research.google.com/github/mayapatward/-k-anonymized-ratings/blob/main/collaborativefiltering_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Environment setup for a fresh Colab VM. The %%capture magic above silences
# the installer output of every command in this cell.
!pip install pyspark
!pip install -U -q PyDrive
# Refresh the apt package index before installing the JDK below.
!apt-get update
# Download Java JDK Version 8
# (JAVA_HOME is pointed at this JDK in a later cell.)
!apt install openjdk-8-jdk-headless -qq
# scikit-surprise provides the `surprise` package imported in the next cell.
!pip install scikit-surprise

In [None]:
%%capture
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
%matplotlib inline
from surprise import Reader, Dataset, NMF, SVD, SVDpp
from surprise.model_selection import cross_validate
from ast import literal_eval
from sklearn.model_selection import train_test_split

ModuleNotFoundError: ignored

In [None]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark import SparkContext, SparkConf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

import os
# Point JAVA_HOME at the JDK 8 installed by the setup cell. The path is
# presumably where apt places openjdk-8-jdk-headless on Colab — verify if the
# base image changes. It must be set before the SparkContext is created.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one already running in this kernel.
# Constructing SparkContext(...) directly raises on a second execution of this
# cell because only one context may be active; getOrCreate makes the cell
# safe to re-run.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or reuse) the SQL session bound to that context.
spark = SparkSession.builder.getOrCreate()

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset paths used below live
# under this mount point. force_remount=True re-mounts even if it is already
# mounted.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
## Load the three rating splits as RDDs of raw text rows (one row per CSV line).
READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/'
training_raw, validation_raw, test_raw = (
    spark.read.text(READ_PATH + split_file).rdd
    for split_file in ("ratings_train.csv", "ratings_validation.csv", "ratings_test.csv")
)

In [None]:
def process(raw_data):
  """Parse an RDD of raw CSV text rows into a ratings DataFrame.

  The first row of ``raw_data`` is treated as the header; every row whose
  comma-split fields equal it is discarded. The first three fields of each
  remaining row are converted to ``userId`` (int), ``movieId`` (int) and
  ``rating`` (float).

  Args:
    raw_data: RDD whose rows expose the line text as ``.value`` (the shape
      produced by ``spark.read.text(...).rdd``).

  Returns:
    A Spark DataFrame with columns ``userId``, ``movieId`` and ``rating``.
  """
  fields = raw_data.map(lambda line: line.value.split(","))
  header = fields.take(1)[0]

  def to_rating(cols):
    # Only the first three columns are used; any extra columns are ignored.
    return Row(userId=int(cols[0]), movieId=int(cols[1]), rating=float(cols[2]))

  rating_rows = fields.filter(lambda cols: cols != header).map(to_rating)
  return spark.createDataFrame(rating_rows)

In [None]:
# Parse each raw split into a typed (userId, movieId, rating) DataFrame.
training_df, validation_df, test_df = (
    process(raw_split) for raw_split in (training_raw, validation_raw, test_raw)
)

In [None]:
# Sanity check: show the first five parsed training rows.
training_df.take(5)

[Row(userId=187, movieId=47997, rating=0.5),
 Row(userId=19, movieId=2011, rating=2.0),
 Row(userId=463, movieId=8622, rating=3.5),
 Row(userId=274, movieId=4167, rating=3.5),
 Row(userId=590, movieId=858, rating=5.0)]

In [None]:
## Sanity check: every test user should also appear in the training split.
## If so, the first printed count (users in both splits) equals the second
## (distinct test users).
training_distinct_users = training_df.select("userId").distinct()
test_distinct_users = test_df.select("userId").distinct()
joined_users = training_distinct_users.join(test_distinct_users, "userId", "inner")
print(joined_users.count(), test_distinct_users.count(), sep="\n")

610
610


In [None]:
## Check how many test movies also appear in the training split.
## Prints, in order: movies present in both splits, distinct test movies,
## distinct training movies. Test movies are a subset of training movies only
## if the first two numbers match.
training_distinct_movies = training_df.select("movieId").distinct()
test_distinct_movies = test_df.select("movieId").distinct()
# Join the *movie* frames: the distinct-user frames have no "movieId" column,
# so joining them on it cannot work.
joined_movies = training_distinct_movies.join(test_distinct_movies, on="movieId", how="inner")
print(joined_movies.count())
print(test_distinct_movies.count())
print(training_distinct_movies.count())

3926
5113
6892


First, we compare standard ALS against ALS with a nonnegativity constraint (NMF).

In [None]:
start = time.time()

# Baseline: explicit-feedback ALS without any constraint on the factors.
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)
model = als.fit(training_df)

# Score the validation split and measure RMSE between the true and the
# predicted rating.
predictions = model.transform(validation_df)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
end = time.time()

print("Test: explicit preferences, no nonnegative constraint:")
print("Root-mean-square error ="  + str(rmse))
print("Time run:" + str(end-start))

Test: explicit preferences, no nonnegative constraint:
Root-mean-square error =0.9840421193316466
Time run:25.905667066574097


In [None]:
start = time.time()

# Same setup as the baseline run, but with nonnegative factors (NMF-style ALS).
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    implicitPrefs=False,
    nonnegative=True,
)
model = als.fit(training_df)

# Score the validation split and measure RMSE between the true and the
# predicted rating.
predictions = model.transform(validation_df)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
end = time.time()
# The label now matches the configuration above: this run *does* apply the
# nonnegative constraint (the old text was copied from the baseline cell).
print("Test: explicit preferences, nonnegative constraint:")
print("Root-mean-square error ="  + str(rmse))
print("Time run:" + str(end-start))

Test: explicit preferences, no nonnegative constraint:
Root-mean-square error =0.9578090736145548
Time run:13.392131090164185


We found that the nonnegative constraint improved the root-mean-square error by around 0.026 (from about 0.984 to about 0.958 on the validation set). Thus, we will use the NMF ALS algorithm moving forward. Next, we will fine-tune the hyperparameters for rank and regularization.

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
start = time.time()
# regParam is set here only as a base value; the grid below supplies the
# values that are actually evaluated, together with the candidate ranks.
als = ALS(maxIter=5, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop",nonnegative = True)

# 3 ranks x 3 regularization strengths = 9 candidate configurations.
paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 40]) \
    .addGrid(als.regParam, [.1, .15, .2]) \
    .build()
evaluator_ = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
# 5-fold cross-validation over the training split, scored by RMSE.
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator_,
                          numFolds=5) 
cvModel = crossval.fit(training_df)
# Stop the clock after the fit. Without this, `end` still holds the value left
# over from an earlier cell and the reported duration is wrong (even negative).
end = time.time()

print("Time run:" + str(end-start))

Time run:-197.36660385131836


In [None]:
# Pair every grid point (as a plain {param name: value} dict) with its
# cross-validated average metric, then report the winning configuration.
params = [{p.name: v for p, v in m.items()} for m in cvModel.getEstimatorParamMaps()]
print("Results:")
for result in zip(params, cvModel.avgMetrics):
  print(result)

# The Python model wrapper is bypassed here: the estimator that produced the
# best model is fetched from the underlying Java object to read its settings.
best_estimator = cvModel.bestModel._java_obj.parent()
print("The best rank param is:")
print(best_estimator.getRank())
print("The best reg param is:")
print(best_estimator.getRegParam())

Results:
({'rank': 10, 'regParam': 0.1}, 0.9788543354472139)
({'rank': 10, 'regParam': 0.15}, 0.9525797939930682)
({'rank': 10, 'regParam': 0.2}, 0.9470288123627228)
({'rank': 20, 'regParam': 0.1}, 0.978994369380515)
({'rank': 20, 'regParam': 0.15}, 0.9497018933418631)
({'rank': 20, 'regParam': 0.2}, 0.9461476775579445)
({'rank': 40, 'regParam': 0.1}, 0.9796769944076669)
({'rank': 40, 'regParam': 0.15}, 0.9497876847924209)
({'rank': 40, 'regParam': 0.2}, 0.9463458981020896)
The best rang param is:
20
The best reg param is:
0.2


In [None]:
import pandas as pd

# Input/output locations on the mounted Drive.
MOVIE_TO_IDX_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/movie_to_idx.json'
PROCESSED_SAVE_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/postprocessed_anonymized/'

# Column names of the flattened (long-format) ratings table.
COL_NAME_USER_ID = "user_id"
COL_NAME_MOVIE_ID = "movie_id"
COL_NAME_RATING = "rating"

# k values for which a k-anonymized ratings matrix is processed.
klist = [2, 3, 5, 8, 12, 15, 20, 25, 30]


# Load the movie -> index mapping and invert it to index -> movie id.
# The keys are converted with int(), so they are presumably movie ids stored
# as strings — verify against the file.
with open(MOVIE_TO_IDX_PATH, 'r') as mapping_file:
    movie_to_idx = literal_eval(mapping_file.read())
idx_to_movie = {idx: int(movie_id) for movie_id, idx in movie_to_idx.items()}


def flatten_matrix_into_dataframe(matrix):
  """Melt a wide user-by-movie matrix into long-format rating rows.

  Args:
    matrix: pandas DataFrame with one ``COL_NAME_USER_ID`` column; every other
      column is treated as a movie column holding that movie's ratings.

  Returns:
    A DataFrame with columns ``COL_NAME_USER_ID``, ``COL_NAME_MOVIE_ID`` and
    ``COL_NAME_RATING``, one row per (user, movie) cell, with every column
    passed through ``pd.to_numeric``.
  """
  movie_columns = [col for col in matrix.columns.values if col != COL_NAME_USER_ID]
  long_df = matrix.melt(id_vars=[COL_NAME_USER_ID], value_vars=movie_columns)
  # "variable" and "value" are the column names melt() produces by default.
  long_df = long_df.rename(columns={"variable": COL_NAME_MOVIE_ID, "value": COL_NAME_RATING})
  return long_df.apply(pd.to_numeric)

def save_k_anonymized(path, k):
  """Download one k-anonymized ratings matrix, flatten it, and save it as CSV.

  Reads ``<k>_anonymized.csv`` (header-less, comma-delimited) from the
  project's GitHub repository, uses the row position as the user id, melts the
  matrix into (user, movie, rating) rows, translates the column indices to
  movie ids through ``idx_to_movie``, and writes the result to
  ``path + "<k>_anonymized_processed.csv"``.

  Args:
    path: Output directory prefix; it is concatenated directly with the file
      name, so it must end with a path separator.
    k: The k of the k-anonymized matrix to process.
  """
  filename = "https://raw.githubusercontent.com/mayapatward/-k-anonymized-ratings/main/ml-100k/anonymized/"+str(k)+"_anonymized.csv"
  mldf = pd.read_csv(filename, header=None, delimiter=",")
  # The row index becomes the user id column.
  mldf = mldf.reset_index().rename(columns={'index': COL_NAME_USER_ID})
  mldf = flatten_matrix_into_dataframe(mldf)
  # Use the shared column-name constant (not a repeated literal) so this stays
  # in sync with flatten_matrix_into_dataframe.
  # NOTE(review): indices missing from idx_to_movie map to NaN — confirm that
  # every matrix column has an entry.
  mldf[COL_NAME_MOVIE_ID] = mldf[COL_NAME_MOVIE_ID].map(idx_to_movie)
  mldf.to_csv(path + str(k)+"_anonymized_processed.csv", index = False)

# Process and save the anonymized matrix for every k in klist.
for k in klist:
  save_k_anonymized(PROCESSED_SAVE_PATH, k)


In [None]:
def crossjoin_to_dot(user_id, item_id, user_features, item_features):
  """Build one predicted-rating row from a (user, item) factor pair.

  Args:
    user_id: User identifier; converted to int.
    item_id: Item (movie) identifier; converted to int.
    user_features: Sequence of the user's latent factors.
    item_features: Sequence of the item's latent factors.

  Returns:
    A Row with ``userId``, ``movieId`` and ``rating``, where ``rating`` is the
    dot product of the item and user factor vectors as a Python float.
  """
  predicted = np.dot(np.array(item_features), np.array(user_features))
  return Row(userId=int(user_id), movieId=int(item_id), rating=float(predicted))



In [None]:
from operator import add

# Defaults for the ALS model trained below.
RANK_ = 10
REG_PARAM_ = 0.15
NONNEG_ = True
READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/'
klist = [2, 3, 5, 8, 12, 15, 20, 25, 30]

##need to see if there is a faster way to convert from pandas -> spark rdd
def train(training_df, test_df, rank_, reg_param=REG_PARAM_, nonnegative=NONNEG_):
  """Fit ALS and return the predicted rating for every (user, item) pair.

  Args:
    training_df: Spark DataFrame with ``userId``, ``movieId`` and ``rating``
      columns to fit on.
    test_df: Currently unused; kept so existing callers keep working.
    rank_: Number of latent factors.
    reg_param: ALS regularization strength. Defaults to ``REG_PARAM_``, the
      value that used to be hard-coded here.
    nonnegative: Whether to constrain the factors to be nonnegative. Defaults
      to ``NONNEG_``, the value that used to be hard-coded here.

  Returns:
    A Spark DataFrame with columns ``userId``, ``movieId`` and ``rating``,
    holding the dot product of the learned factors for each user/item pair
    that has factors in the fitted model.
  """
  #Fit Model
  als = ALS(maxIter=5, regParam=reg_param, rank=rank_, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", implicitPrefs = False, nonnegative = nonnegative)
  model = als.fit(training_df)

  df_itemfactors = model.itemFactors.toDF("item_id", "item_features")
  df_userfactors = model.userFactors.toDF("user_id", "user_features")

  # Pair every item with every user, then reduce each pair to its dot product.
  df_crossjoin = df_itemfactors.crossJoin(df_userfactors)
  df_dot = df_crossjoin.rdd.map(lambda p: crossjoin_to_dot(p["user_id"], p["item_id"], p["user_features"], p["item_features"]))
  res = spark.createDataFrame(df_dot)
  print("Done")

  return res


In [None]:
# Predicted rating matrix from the original (un-anonymized) training data,
# using 15 latent factors.
unanon_mf = train(training_df, test_df, 15)


Done


KeyboardInterrupt: ignored

In [None]:
klist = [2, 3, 5, 8, 12, 15, 20, 25, 30]
# NOTE(review): rmselist is never filled in this cell — presumably a later step
# is meant to append one RMSE per k; confirm.
rmselist = []
READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/'
for k in klist:
  print("Running k =", k)
  # READ_PATH already ends with "/", so no leading slash on the sub-directory.
  trainpath = READ_PATH + "postprocessed_anonymized/" + str(k) + "_anonymized_processed.csv"
  testpath =  READ_PATH + "ratings_test.csv"
  ktrain_rdd = spark.read.text(trainpath).rdd
  # ALS needs a typed (userId, movieId, rating) DataFrame, not raw text rows,
  # so parse the file the same way as the other splits.
  # NOTE(review): process() parses the ids with int(); confirm the processed
  # CSVs contain no NaN or float-formatted movie ids.
  ktrain_df = process(ktrain_rdd)
  # Pass the already-parsed test split; `test_rdd` was never defined.
  ktrain_matrix = train(ktrain_df, test_df, rank_ = 15)

Running k = 2
Done
Running k = 3
Done
Running k = 5
Done
Running k = 8
Done
Running k = 12
Done
Running k = 15
Done
Running k = 20
Done
Running k = 25
Done
Running k = 30
Done
