<a href="https://colab.research.google.com/github/mayapatward/-k-anonymized-ratings/blob/hkulkar-patch-1/collaborativefiltering/collaborativefiltering_ratingsplit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Environment setup for a fresh Colab runtime; %%capture hides the installer output.
# Python packages: PySpark (imported in the cells below) and PyDrive.
!pip install pyspark
!pip install -U -q PyDrive
# Refresh the apt package index before installing Java.
!apt-get update
# Download Java JDK Version 8
!apt install openjdk-8-jdk-headless -qq
# scikit-surprise provides the `surprise` package imported in the next cell.
!pip install scikit-surprise

In [2]:
%%capture
import numpy as np
import pandas as pd
import os
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline
from surprise import Reader, Dataset, NMF, SVD, SVDpp
from surprise.model_selection import cross_validate
from ast import literal_eval
from sklearn.model_selection import train_test_split
import operator



In [3]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark import SparkContext, SparkConf
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics, MultilabelMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row

import os
# Point JAVA_HOME at a JDK 8 location (presumably the one installed by the setup cell — confirm the path).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [4]:
# Spark configuration: pin the Spark UI to port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one already running in this kernel.
# Calling the SparkContext constructor a second time raises, which made this
# cell fail whenever it was re-run without restarting the kernel.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or reuse) the SQL session on top of that context.
spark = SparkSession.builder.getOrCreate()

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data path below lives under this mount point.
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [6]:
## Collect data: raw text lines of the iteration-1 train / validation / test splits
READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/ratings_split/iter_1_fix/'

training_raw, validation_raw, test_raw = (
    spark.read.text(READ_PATH + split_file).rdd
    for split_file in ("ratings_train.csv", "ratings_validation.csv", "ratings_test.csv")
)

In [7]:
def process(raw_data):
  """Turn an RDD of raw CSV text rows into a (userId, movieId, rating) DataFrame.

  Each row's ``value`` is split on commas. The first record is treated as the
  header and every record equal to it is dropped; the first three fields of the
  remaining records are cast to int, int and float.
  """
  split_rows = raw_data.map(lambda line: line.value.split(","))
  header = split_rows.take(1)[0]
  ratings = (
      split_rows
      .filter(lambda fields: fields != header)
      .map(lambda fields: Row(userId = int(fields[0]), movieId = int(fields[1]), rating = float(fields[2])))
  )
  return spark.createDataFrame(ratings)

In [8]:
def process_usersplit(raw_data):
  """Variant of ``process`` for CSVs whose rating fields start one column later.

  Each row's ``value`` is split on commas and records equal to the first
  (header) record are dropped. Fields 1, 2 and 3 are then read as userId,
  movieId and rating; field 0 is ignored.
  """
  split_rows = raw_data.map(lambda line: line.value.split(","))
  header = split_rows.take(1)[0]
  ratings = (
      split_rows
      .filter(lambda fields: fields != header)
      .map(lambda fields: Row(userId = int(fields[1]), movieId = int(fields[2]), rating = float(fields[3])))
  )
  return spark.createDataFrame(ratings)

In [9]:
# Parse the three raw splits into (userId, movieId, rating) DataFrames.
training_df, validation_df, test_df = map(
    process, (training_raw, validation_raw, test_raw)
)

In [10]:
from operator import add

# Number of latent factors passed to ALS.train as `rank`.
RANK_ = 10
# NOTE(review): REG_PARAM_ and NONNEG_ are not referenced by any ALS.train call
# visible in this notebook (the calls pass rank=RANK_ and a literal
# nonnegative=True) — confirm whether they were meant to be used.
REG_PARAM_ = 0.15
NONNEG_ = True
# Same iteration-1 directory as the data-loading cell above.
READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/ratings_split/iter_1_fix/'
# k values iterated over by save_model_anonymized() and predict().
klist = [5, 8, 12, 15, 20, 25]

In [14]:
# Train the baseline ALS model on the unanonymized iteration-1 training split and persist it to Drive.
# NOTE(review): unlike the save_model_* helpers below, this save is not wrapped in a try/except,
# so it presumably fails on re-run if the model directory already exists — confirm.
model = ALS.train(training_df.rdd, rank = RANK_, nonnegative=True)
model.save(sc, READ_PATH+"rating_unanonymized.model")


In [15]:
# Load the baseline model from the path the previous cell saves to.
model = MatrixFactorizationModel.load(sc, READ_PATH+"rating_unanonymized.model")

In [24]:
def evaluate(model, test_df):
  """Score a matrix-factorization model on a ratings DataFrame and print the results.

  Rows of ``test_df`` are read positionally as (user, item, rating).

  Args:
    model: object exposing ``predictAll`` and ``recommendProductsForUsers``
      (the notebook passes a pyspark.mllib MatrixFactorizationModel).
    test_df: Spark DataFrame of ratings to score against.

  Returns:
    Tuple ``(rmse, recall, precision, hitrate)``:
      rmse      -- RegressionMetrics RMSE over the (rating, prediction) pairs
                   produced by joining ``test_df`` with the model's predictions.
      recall,
      precision -- MultilabelMetrics over each user's top-10 recommended items
                   versus that user's positively rated items in ``test_df``.
      hitrate   -- fraction of users (among those that have both recommendations
                   and rated items) whose top-10 contains at least one rated item.
  """
  # Key observed ratings and predictions by (user, item) so they can be joined.
  observed = test_df.rdd.map(lambda r: ((r[0], r[1]), r[2]))
  user_item_pairs = test_df.rdd.map(lambda p: (p[0], p[1]))
  predictions = model.predictAll(user_item_pairs).map(lambda r: ((r[0], r[1]), r[2]))
  ratesAndPreds = observed.join(predictions)

  metrics_reg = RegressionMetrics(ratesAndPreds.map(lambda x: x[1]))
  rmse = metrics_reg.rootMeanSquaredError

  # Items counted as relevant for a user: every item rated above 0 in test_df.
  # NOTE(review): with strictly positive ratings this keeps every rated item,
  # regardless of how high the rating is — confirm the threshold is intended.
  userRankings = (
      test_df.rdd
      .filter(lambda p: p[2] > 0)
      .map(lambda p: (p[0], p[1]))
      .groupByKey()
      .mapValues(list)
  )

  # Top-10 item ids per user, paired with that user's relevant items. The joined
  # RDD feeds both the hit rate and MultilabelMetrics, so it is built once and
  # cached instead of being recomputed for each consumer.
  recommendedRankings = model.recommendProductsForUsers(10).mapValues(lambda x: [r[1] for r in x])
  predictionAndLabels = (
      recommendedRankings
      .join(userRankings)
      .map(lambda x: ([float(y) for y in x[1][0]], [float(y) for y in x[1][1]]))
      .cache()
  )
  try:
    hitrate_list = predictionAndLabels.map(lambda x: 1 if len(set(x[0]).intersection(x[1]))!=0 else 0).collect()

    metrics_multi = MultilabelMetrics(predictionAndLabels)
    recall = metrics_multi.recall()
    precision = metrics_multi.precision()
  finally:
    predictionAndLabels.unpersist()

  hitrate = np.array(hitrate_list).mean()

  print("Root Mean Squared Error = " + str(rmse))
  print("Recall = " + str(recall))
  print("Precision = " + str(precision))
  print("Hitrate = "+str(hitrate))

  print("")
  return rmse, recall, precision, hitrate


In [19]:
# Score the baseline model on the training split.
evaluate(model, training_df)

Root Mean Squared Error = 0.5536188032279901
Recall = 0.0005224309375338915
Precision = 0.01311475409836066
Hitrate = 0.040983606557377046



(0.5536188032279901,
 0.0005224309375338915,
 0.01311475409836066,
 0.040983606557377046)

In [20]:
# Score the same baseline model on the test split.
evaluate(model, test_df)


Root Mean Squared Error = 0.9927334349765192
Recall = 0.0011072615242915258
Precision = 0.0016583747927031514
Hitrate = 0.01658374792703151



(0.9927334349765192,
 0.0011072615242915258,
 0.0016583747927031514,
 0.01658374792703151)

In [27]:
def map_k(id_to_idx, idx_to_kanon,user_to_cluster, x):
  """Replace the user id of one rating with its k-anonymized cluster id.

  Args:
    id_to_idx: dict mapping a user id to its index.
    idx_to_kanon: dict mapping that index to an anonymized index.
    user_to_cluster: fallback dict mapping a user id directly to a cluster id,
      used when the user's index has no entry in ``idx_to_kanon``.
    x: rating record read positionally as (user, item, rating).

  Returns:
    ``(mapped_user, item, rating, flag)``. ``flag`` is 0 when the user was mapped.
    When neither mapping knows the user, the record is returned with its
    original user id and ``flag`` set to 1 so callers can count unmapped points.

  Raises:
    KeyError: if the user id is missing from ``id_to_idx``.
  """
  idx = id_to_idx[x[0]]
  if idx in idx_to_kanon:
    return (idx_to_kanon[idx], x[1], x[2], 0)
  try:
    return (user_to_cluster[x[0]], x[1], x[2], 0)
  except LookupError:
    # The original only printed this tuple and fell through, returning None,
    # which crashes the callers that index into the result.
    return (x[0], x[1], x[2], 1)

In [50]:
def save_model_anonymized(metric_):
  """Train and save one ALS model per (iteration, k) on the anonymized training data.

  For iterations 1..5 and every k in the notebook-level ``klist``, reads
  ``postprocessed_anonymized/<k>_anonymized_processed.csv`` from that
  iteration's directory, trains ALS with ``rank=RANK_`` and non-negative
  factors, and saves the model as ``rating_<k>_<metric_>.model`` alongside it.

  Args:
    metric_: label embedded in the saved model's file name.
      NOTE(review): the training CSV path does not depend on ``metric_``, so
      different metrics train on the same file — confirm that is intended.
  """
  for i in range (1,6):
    READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/ratings_split/iter_{}_fix/'.format(i)
    for k in klist:
        trainpath = READ_PATH+"postprocessed_anonymized/"+ str(k)+"_anonymized_processed.csv"
        ktrain_df = process(spark.read.text(trainpath).rdd)
        ktrain_model = ALS.train(ktrain_df, rank = RANK_, nonnegative = True)
        model_path = READ_PATH+"rating_{}_{}.model".format(k, metric_)
        try:
          ktrain_model.save(sc, model_path)
        except Exception as err:
          # Saving is best-effort (the usual cause is a model left over from a
          # previous run), but report what actually failed instead of assuming,
          # and let KeyboardInterrupt propagate.
          print("Could not save model to {} ({}); it may already exist".format(model_path, type(err).__name__))


In [55]:
def predict(READ_PATH, metric_):
  rmselist = []
  recalllist = []
  precisionlist = []
  hitratelist = []
  for k in klist:
    print("Running k =", k)
    mappingpath_idtoidx = READ_PATH+"user_to_idx.json"
    mappingpath_idxtokanon = READ_PATH+ str(k)+"_anonymized_idx_to_kanon_idx.json"
    
    u = Utility(split_type = "ratings")
    user_to_cluster = u.get_closest_k_cluster_to_user_id(k, metric_)
    user_to_cluster = {int(i): int(j) for i, j in user_to_cluster.items()}  

    
    with open(mappingpath_idxtokanon, 'r') as f:
        idx_to_kanon = literal_eval(f.read())
    idx_to_kanon = {int(i): int(j) for i, j in idx_to_kanon.items()}

    with open(mappingpath_idtoidx, 'r') as f:
        id_to_idx = literal_eval(f.read())
    id_to_idx = {int(i): int(j) for i, j in id_to_idx.items()}
    ktrain_model =  MatrixFactorizationModel.load(sc, READ_PATH+"rating_{}_{}.model".format(k, metric_))
    mapped = test_df.rdd.map(lambda x: map_k(id_to_idx, idx_to_kanon,user_to_cluster, x))
    mapped_row = mapped.map(lambda x: Row(userId = int(x[0]), movieId = int(x[1]), rating = float(x[2])))
    thrownout = mapped.map(lambda x: x[3]).reduce(add)
    actual = spark.createDataFrame(mapped_row)  
    print("Number points not mapped to cluster: "+str(thrownout))
    rmse, recall, precision, hitrate = evaluate(ktrain_model, actual)
    rmselist += [rmse]
    recalllist += [recall]
    precisionlist += [precision]
    hitratelist += [hitrate]
  return rmselist, recalllist, precisionlist, hitratelist



In [45]:
def driver(metric_):
  rmsetotals = np.zeros(len(klist))
  precisiontotals = np.zeros(len(klist))
  recalltotals = np.zeros(len(klist))
  hitratetotals = np.zeros(len(klist))
 
  for i in range (1,6):
    READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/ratings_split/iter_{}_fix/'.format(i)

    training_raw = spark.read.text(READ_PATH + "ratings_train.csv").rdd
    test_raw = spark.read.text(READ_PATH + "ratings_test.csv").rdd

    training_df = process(training_raw)
    test_df = process(test_raw)

    r, re, p, h = predict(READ_PATH, metric_)

    rmsetotals = np.vstack((rmsetotals, r))
    recalltotals = np.vstack((recalltotals, re))
    precisiontotals = np.vstack((precisiontotals, p))
    hitratetotals = np.vstack((hitratetotals, h))
    
  return rmsetotals[1:],  recalltotals[1:],precisiontotals[1:], hitratetotals[1:]


In [46]:
def save_model_unanonymized(metric_):
    """Train and save one baseline ALS model per iteration on the unanonymized training split.

    For iterations 1..5, reads ``ratings_train.csv`` from that iteration's
    directory, trains ALS with ``rank=RANK_`` and non-negative factors, and
    saves the model as ``rating_unanonymized_<metric_>.model`` alongside it.

    Args:
      metric_: label embedded in the saved model's file name.
        NOTE(review): the training data does not depend on ``metric_`` — confirm
        that a separate baseline per metric is intended.
    """
    for i in range (1,6):
      READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/ratings_split/iter_{}_fix/'.format(i)
      training_df = process(spark.read.text(READ_PATH + "ratings_train.csv").rdd)
      model = ALS.train(training_df.rdd, rank = RANK_, nonnegative=True)
      model_path = READ_PATH+"rating_unanonymized_{}.model".format(metric_)
      try:
        model.save(sc, model_path)
      except Exception as err:
        # Saving is best-effort (the usual cause is a model left over from a
        # previous run), but report what actually failed instead of assuming,
        # and let KeyboardInterrupt propagate.
        print("Could not save model to {} ({}); it may already exist".format(model_path, type(err).__name__))

In [66]:
def driver_unanonymized(metric_):
  """Average the baseline (unanonymized) model's metrics over iterations 1..5.

  For each iteration, loads ``rating_unanonymized_<metric_>.model`` and scores
  it on that iteration's ``ratings_test.csv`` with ``evaluate``.

  Args:
    metric_: label used in the saved model's file name.

  Returns:
    ``(rmse, recall, precision, hitrate)``, each the mean over the iterations.
  """
  iterations = range(1, 6)
  rmse_sum = 0
  recall_sum = 0
  precision_sum = 0
  hitrate_sum = 0
  for i in iterations:
    READ_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/ratings_split/iter_{}_fix/'.format(i)
    # The test file was previously read a second time into an unused variable.
    test_df = process(spark.read.text(READ_PATH + "ratings_test.csv").rdd)
    model = MatrixFactorizationModel.load(sc, READ_PATH+"rating_unanonymized_{}.model".format(metric_))
    r, re, p, h = evaluate(model, test_df)

    rmse_sum = rmse_sum+r
    recall_sum = recall_sum+re
    precision_sum = precision_sum+p
    hitrate_sum = hitrate_sum+h

  # Divide by the actual number of iterations rather than a separate literal.
  n_iterations = len(iterations)
  return rmse_sum/n_iterations, recall_sum/n_iterations, precision_sum/n_iterations, hitrate_sum/n_iterations




In [48]:
### Train and save the baseline models under the "euclidean" name (skip if model already saved)
save_model_unanonymized("euclidean")

In [52]:
# Train and save the baseline models under the "jaccard" name.
save_model_unanonymized("jaccard")

In [58]:
# Train and save the per-k anonymized models under the "jaccard" name.
# NOTE(review): the saved output below shows "Running k = ..." lines that the
# save_model_anonymized definition in this notebook does not print — presumably
# it came from an earlier version of the function; confirm and clear if stale.
save_model_anonymized("jaccard")


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




In [51]:
# Train and save the per-k anonymized models under the "euclidean" name.
# NOTE(review): the saved output below shows "Running k = ..." lines that the
# save_model_anonymized definition in this notebook does not print — presumably
# it came from an earlier version of the function; confirm and clear if stale.
save_model_anonymized("euclidean")


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




In [70]:
def plot(rmselist, recalllist, precisionlist, hitratelist, klist):
  """Draw four line charts (RMSE, recall, precision, hit rate) against k.

  Each metric gets its own figure, shown in the order above, with ``klist`` on
  the x-axis and as the tick positions.

  Args:
    rmselist, recalllist, precisionlist, hitratelist: metric values, one per
      entry of ``klist``.
    klist: k values plotted on the x-axis.
  """
  panels = (
      (rmselist, "RMSE over K-values", "RMSE"),
      (recalllist, "Recall over K-values", "recall @ 10"),
      (precisionlist, "Precision over K-values", "precision @ 10"),
      (hitratelist, "Hitrate over K-values", "hit rate @ 10"),
  )
  for values, title, y_label in panels:
    plt.plot(klist, values, "-o")
    plt.title(title)
    plt.xticks(ticks=klist)
    plt.xlabel("k")
    plt.ylabel(y_label)
    plt.show()


In [None]:
# Evaluate the "jaccard" anonymized models; each result has one row per iteration and one column per k.
rmsetotals, recalltotals, precisiontotals,hitratetotals  = driver(metric_="jaccard")

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.4770009870638785
Recall = 0.03097652120926295
Precision = 0.2073770491803279
Hitrate = 0.860655737704918

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.276074419574118
Recall = 0.023451697062953164
Precision = 0.24342105263157895
Hitrate = 0.9342105263157895

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.095427728374346
Recall = 0.02095827434737461
Precision = 0.32800000000000007
Hitrate = 0.96

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.9647805094512123
Recall = 0.017297907781361373
Precision = 0.325
Hitrate = 0.95

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.931038703856658
Recall = 0.012746450825324841
Precision = 0.32
Hitrate = 0.9

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.728331721754378
Recall = 0.010139970716896837
Precision = 0.3125
Hitrate = 0.875

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.3422253429952153
Recall = 0.031028913785739343
Precision = 0.20491803278688525
Hitrate = 0.8524590163934426

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.0537771183340974
Recall = 0.025827894160837656
Precision = 0.28421052631578947
Hitrate = 0.9342105263157895

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.8416975501277968
Recall = 0.02049489478021235
Precision = 0.3
Hitrate = 0.9

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.7229741025730152
Recall = 0.022304093827084138
Precision = 0.395
Hitrate = 1.0

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.5378273227206665
Recall = 0.01591743746890045
Precision = 0.38666666666666666
Hitrate = 0.9666666666666667

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.4216776850475499
Recall = 0.008998783342692279
Precision = 0.2833333333333333
Hitrate = 0.8333333333333334

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.338153435723113
Recall = 0.03370041631101118
Precision = 0.21721311475409838
Hitrate = 0.8114754098360656

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.047630406724392
Recall = 0.026969146143792748
Precision = 0.29342105263157897
Hitrate = 0.9210526315789473

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.8395859134025252
Recall = 0.02098561805467656
Precision = 0.30000000000000004
Hitrate = 0.98

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.7074001881404384
Recall = 0.017280862207247
Precision = 0.325
Hitrate = 0.975

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.5436161903681036
Recall = 0.013766391072969238
Precision = 0.3433333333333334
Hitrate = 0.9666666666666667

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.4484557073942481
Recall = 0.012728700025094674
Precision = 0.35833333333333334
Hitrate = 1.0

Running k = 5


In [73]:
# Evaluate the "euclidean" anonymized models; each result has one row per iteration and one column per k.
rmsetotals_e,recalltotals_e, precisiontotals_e, hitratetotals_e = driver(metric_="euclidean")

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.4865606128810644
Recall = 0.0289656528106034
Precision = 0.19426229508196724
Hitrate = 0.8032786885245902

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.2698738528765263
Recall = 0.022788940870947204
Precision = 0.24473684210526317
Hitrate = 0.881578947368421

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.0777672851759044
Recall = 0.019465624066821327
Precision = 0.306
Hitrate = 0.94

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.976733591198687
Recall = 0.01611589558991775
Precision = 0.32999999999999996
Hitrate = 0.925

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.8645284347156055
Recall = 0.014652559823695758
Precision = 0.36333333333333334
Hitrate = 0.9333333333333333

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.7964777178388132
Recall = 0.011532904775474665
Precision = 0.3458333333333334
Hitrate = 0.9583333333333334

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.346621620939273
Recall = 0.031408032068846334
Precision = 0.20573770491803284
Hitrate = 0.8524590163934426

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.0870193640108736
Recall = 0.02554460494988458
Precision = 0.27631578947368424
Hitrate = 0.9210526315789473

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.8713874844968135
Recall = 0.02226803518631171
Precision = 0.33599999999999997
Hitrate = 0.98

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.6928516729823495
Recall = 0.019967503694609905
Precision = 0.36
Hitrate = 0.975

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.5477706502701645
Recall = 0.015118755745290247
Precision = 0.36666666666666664
Hitrate = 0.9333333333333333

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.4819669144121892
Recall = 0.007479801688527338
Precision = 0.25
Hitrate = 0.75

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.311425445925604
Recall = 0.034397706080814335
Precision = 0.2344262295081967
Hitrate = 0.8934426229508197

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.063357503850235
Recall = 0.026925354989476007
Precision = 0.29342105263157897
Hitrate = 0.9210526315789473

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.8128425295730548
Recall = 0.02142039014986337
Precision = 0.316
Hitrate = 0.94

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.6904580653513692
Recall = 0.018424513195977984
Precision = 0.35
Hitrate = 0.925

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.6152876027644232
Recall = 0.016000349155694687
Precision = 0.38333333333333336
Hitrate = 0.9666666666666667

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.470867717824866
Recall = 0.011670997357297036
Precision = 0.3541666666666667
Hitrate = 0.9583333333333334

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.300887845110802
Recall = 0.0325539643925366
Precision = 0.22295081967213112
Hitrate = 0.8360655737704918

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.0561948501592515
Recall = 0.02649181661929503
Precision = 0.28289473684210525
Hitrate = 0.9473684210526315

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.8686684183996423
Recall = 0.02146481200907965
Precision = 0.33399999999999996
Hitrate = 0.92

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.6950000823779567
Recall = 0.018980358586456355
Precision = 0.34750000000000003
Hitrate = 0.95

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.5863894262455807
Recall = 0.011443495292152198
Precision = 0.2833333333333333
Hitrate = 1.0

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.4917834319301324
Recall = 0.009737872275262382
Precision = 0.3041666666666667
Hitrate = 0.9583333333333334

Running k = 5


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.345166497904475
Recall = 0.03184599664978822
Precision = 0.2065573770491803
Hitrate = 0.8770491803278688

Running k = 8


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 2.0581777863534336
Recall = 0.028378855658134132
Precision = 0.3013157894736842
Hitrate = 0.9605263157894737

Running k = 12


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.8089261205465383
Recall = 0.023949664173665022
Precision = 0.36
Hitrate = 0.98

Running k = 15


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.7130357081009526
Recall = 0.01750434082889376
Precision = 0.32999999999999996
Hitrate = 0.925

Running k = 20


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.557585201774865
Recall = 0.015103508734839543
Precision = 0.37333333333333335
Hitrate = 0.9666666666666667

Running k = 25


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))


Number points not mapped to cluster: 0
Root Mean Squared Error = 1.473517524077002
Recall = 0.013144933997244215
Precision = 0.3916666666666666
Hitrate = 0.9583333333333334



In [None]:
# Baseline: metrics of the unanonymized models saved under the "jaccard" name, averaged over the iterations.
rmsetotals_0, recalltotals_0, precisiontotals_0,hitratetotals_0 = driver_unanonymized("jaccard")

In [None]:
##full, jaccard
# x-axis for this plot: the unanonymized baseline is drawn at k = 0.
# A dedicated name is used instead of rebinding the notebook-level `klist`,
# which predict() and save_model_anonymized() iterate over; re-running them
# after this cell would otherwise also attempt k = 0.
klist_with_baseline = [0, 5, 8, 12, 15, 20, 25]

# Mean over iterations for each k, with the baseline value prepended.
rmse_mean_totals = np.insert(np.mean(rmsetotals, axis = 0),0, rmsetotals_0)
precision_mean_totals = np.insert(np.mean(precisiontotals, axis = 0),0, precisiontotals_0)
recall_mean_totals = np.insert(np.mean(recalltotals, axis = 0),0, recalltotals_0)
hitrate_mean_totals = np.insert(np.mean(hitratetotals, axis = 0),0, hitratetotals_0)

plot(rmse_mean_totals,recall_mean_totals,precision_mean_totals,hitrate_mean_totals, klist_with_baseline)


In [None]:
##truncated, jaccard
klist = [5, 8, 12, 15, 20, 25]

# Per-k mean over the iterations for each jaccard metric (no baseline point).
rmse_mean_totals, precision_mean_totals, recall_mean_totals, hitrate_mean_totals = (
    np.mean(per_iteration, axis = 0)
    for per_iteration in (rmsetotals, precisiontotals, recalltotals, hitratetotals)
)
plot(rmse_mean_totals, recall_mean_totals, precision_mean_totals, hitrate_mean_totals, klist)

In [None]:
##truncated, euclid
klist = [5, 8, 12, 15, 20, 25]

# Per-k mean over the iterations for each euclidean metric (no baseline point).
rmse_mean_totals_e, precision_mean_totals_e, recall_mean_totals_e, hitrate_mean_totals_e = (
    np.mean(per_iteration, axis = 0)
    for per_iteration in (rmsetotals_e, precisiontotals_e, recalltotals_e, hitratetotals_e)
)
plot(rmse_mean_totals_e, recall_mean_totals_e, precision_mean_totals_e, hitrate_mean_totals_e, klist)

In [32]:
"""Utility.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1WG2j9BIjfuZSZskv7sRB5D_XDu6eclz-
"""
from tqdm.notebook import tqdm
import os
import json
import numpy as np
import pandas as pd
from scipy.spatial import distance

# Column names used by the Utility class when grouping ratings data.
COL_NAME_USER_ID = "userId"
COL_NAME_MOVIE_ID = "movieId"
# Alias: the item column is the movie column.
COL_NAME_ITEM_ID = COL_NAME_MOVIE_ID
COL_NAME_RATING = "rating"
COL_NAME_MOVIE_RATINGS = "movies_and_ratings"

#from google.colab import drive
#drive.mount("/content/drive",force_remount=True)

# Default `base_path` of Utility: root of the project data on the mounted Drive.
BASE_PATH = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/'
class Utility:
    
    def __init__(self, split_type, base_path=BASE_PATH, iteration=1):
        """Init the utility class
        Keyword arguments
        base_path -- The directory where files are located. 
        """
        splits_and_paths = self.get_splits_and_paths()
        if split_type not in splits_and_paths:
          print(f"split_type must be one of {','.join(list(splits_and_paths.keys()))}")
        valid_iterations = set(range(1,6))
        if iteration not in valid_iterations:
          print(f"iteration must be one of {','.join(list(valid_iterations))}")
        if not os.path.isdir(base_path):
            raise OSError(f'{base_path} is not a directory')
        
        self.base_path = base_path
        self.anonymized_data_path = os.path.join(self.base_path, splits_and_paths[split_type], f"iter_{iteration}_fix")

        self.ratings_file_name =  'ratings.csv'

        self.training_file_name = 'ratings_train.csv'
        self.test_file_name = 'ratings_train.csv'
        self.validation_file_name = 'ratings_validation.csv'
        self.evaluation_file_name = 'evaluation.csv'
        self.user_movie_ratings_matrix_file_name = 'user_movie_ratings_matrix.csv'
        self.user_to_idx_file_name =  "user_to_idx.json"
        self.movie_to_idx_file_name =  "movie_to_idx.json"

        self.k_anonymized_postfix = '_anonymized.csv'
        self.k_anonymized_map_postfix = '_anonymized_idx_to_kanon_idx.json'
    
    def get_training_user_itemlist(self):
      training_data,_,_ = self.get_unanonymized_training_data()
      return training_data.groupby(COL_NAME_USER_ID)[COL_NAME_MOVIE_ID].agg(set)
  
    def get_test_user_itemlist(self):
      test_data = self.get_test_data()
      return test_data.groupby(COL_NAME_USER_ID)[COL_NAME_MOVIE_ID].agg(set)
  

    def generate_evaluation_dataframe(self, num_negative_items_to_sample_per_user=100):
      training_data = self.get_training_user_itemlist()
      test_data = self.get_test_user_itemlist()
      users = np.unique(np.concatenate([training_data.index.values, test_data.index.values]))
      all_items_to_consider = set(self.get_movie_to_col_index().keys())
      evaluation_df = self.get_test_data()
      rows_to_add = []
      for user in users:
          items_in_train = training_data.loc[user]
          items_in_test = test_data.loc[user]
          neg_items = all_items_to_consider - items_in_train - items_in_test
          neg_items = np.random.choice(list(neg_items), size=num_negative_items_to_sample_per_user, replace=False)
          rows_to_add.extend([{COL_NAME_USER_ID:user, COL_NAME_MOVIE_ID:item, COL_NAME_RATING:0.0} for item in neg_items])
      evaluation_df = evaluation_df.append(rows_to_add, True)
      return evaluation_df
    
    def get_evaluation_path(self):
      return os.path.join(self.anonymized_data_path,self.evaluation_file_name)

    def save_evaluation_df_to(self, df, overwrite=False):
      if os.path.exists(self.get_evaluation_path()) and not(overwrite):
        print("File exists, if you want to overwrite, then pass the arugment to")
        return
      df.to_csv(self.get_evaluation_path())
    
    def get_evaluation_data(self):
      return pd.read_csv(self.get_evaluation_path())

    def get_splits_and_paths(self):
      return {
            'ratings': 'ratings_split',
            'users': 'train_test_v2/rawdata_split'
        }

    def euclidean_distance(self, point1, point2):
        return np.linalg.norm(point1 - point2) ** 2
    
    def manhattan_distance(self, point1, point2):
        return abs(point1 - point2).sum()
    
    def jaccard_distance(self, point1, point2):
      return distance.jaccard(point1, point2)
  
    def find_closest_point(self, points_list, point, metric='euclidean'):

        min_distance = float('inf')
        min_distance_idx = -1
        i = 0
        for p in points_list:
            if metric == 'euclidean':
                distance = self.euclidean_distance(p, point)
            elif metric == 'manhattan':
                distance = self.manhattan_distance(p, point)
            elif metric == 'jaccard':
                distance = self.jaccard_distance(p, point)
            else:
                print('Function not implemented!')

            if distance < min_distance:
                min_distance = distance
                min_distance_idx = i
            
            i += 1
        
        return min_distance_idx

    def flatten_matrix_into_dataframe(self, matrix):
      value_vars = [v for v in matrix.columns.values if v != COL_NAME_USER_ID]
      DEFAULT_VARIABLE_NAME_IN_MELT="variable"
      DEFAULT_VALUE_NAME_IN_MELT="value"
      movies_dict = self.get_col_to_movie_index()
      matrix = matrix \
        .melt(id_vars=[COL_NAME_USER_ID], value_vars=value_vars) \
        .rename(columns={DEFAULT_VARIABLE_NAME_IN_MELT: COL_NAME_MOVIE_ID, DEFAULT_VALUE_NAME_IN_MELT: COL_NAME_RATING}) \
        .apply(lambda x : pd.to_numeric(x, downcast='integer'))
      matrix[COL_NAME_MOVIE_ID] = matrix[COL_NAME_MOVIE_ID].apply(lambda x: movies_dict[x])
      matrix[COL_NAME_USER_ID] = matrix[COL_NAME_USER_ID] + 1
      matrix = matrix[matrix[COL_NAME_RATING] > 0]
      return matrix

    def get_training_data_file_path(self, k:int):
      if k ==0:
        return os.path.join(self.anonymized_data_path, "user_movie_ratings_matrix.csv")
      return os.path.join(self.anonymized_data_path, f"{k}{self.k_anonymized_postfix}")

    def get_test_data_file_path(self):
      return os.path.join(self.anonymized_data_path, self.test_file_name)

    def get_training_data(self, k:int):
      mldf = pd.read_csv(self.get_training_data_file_path(k), header=None).reset_index().rename(columns={'index': COL_NAME_USER_ID})
      mldf= self.flatten_matrix_into_dataframe(mldf)
      return mldf, len(mldf[COL_NAME_USER_ID].unique()), len(mldf[COL_NAME_MOVIE_ID].unique())

    def get_training_data_for_evaluation(self):
      mldf = pd.read_csv(self.get_training_data_path())
      return self._group_and_get_sets_for_evaluation(mldf)

    def get_test_data(self):
      return pd.read_csv(self.get_test_data_file_path()).apply(lambda x : pd.to_numeric(x, downcast='integer'))

    def _group_and_get_sets_for_evaluation(self, df):
      testdf=df.sort_values(by=COL_NAME_RATING, ascending=False)
      grouped=testdf.groupby(COL_NAME_USER_ID).agg({COL_NAME_MOVIE_ID:lambda x: list(x), COL_NAME_RATING:lambda x: list(x)})
      grouped[COL_NAME_MOVIE_RATINGS] = grouped.apply(lambda x: list(zip(x[COL_NAME_MOVIE_ID], x[COL_NAME_RATING])), axis=1)
      grouped=grouped.drop(columns=[COL_NAME_RATING])
      return grouped

    def get_unanonymized_training_data(self):
      df = pd.read_csv(self.get_training_data_path())
      return df.apply(pd.to_numeric), len(df[COL_NAME_USER_ID].unique()), len(df[COL_NAME_MOVIE_ID].unique())

    def get_complete_data_path(self):
      return os.path.join(self.base_path, self.ratings_file_name)

    def get_complete_data(self):
      df = pd.read_csv(self.get_complete_data_path())
      return df.apply(pd.to_numeric), len(df[COL_NAME_USER_ID].unique()), len(df[COL_NAME_MOVIE_ID].unique())
  
    def get_evaluation_data_for_evaluation(self):
      df = self.get_evaluation_data()
      return self._group_and_get_sets_for_evaluation(df)
    
    def get_test_data_for_evaluation(self):
      testdf=self.get_test_data()
      return self._group_and_get_sets_for_evaluation(testdf)
  
    def get_training_data_path(self):
        """Get path to training file
        Keyword arguments
        base_path -- The directory where files are located. 
        """        
        return os.path.join(self.anonymized_data_path, self.training_file_name)       

    def get_test_data_path(self):
        """Get path to test file
        Keyword arguments
        base_path -- The directory where files are located. 
        """
        return os.path.join(self.anonymized_data_path, self.test_file_name)
    
    def get_validation_data_path(self):
        """Get path to validation file
        Keyword arguments
        base_path -- The directory where files are located. 
        """
        return os.path.join(self.anonymized_data_path, self.validation_file_name)
    
    def get_train_data_user_map_path(self):
        """Get path to user id to train index file map
        Keyword arguments
        base_path -- The directory where files are located. 
        """
        return os.path.join(self.anonymized_data_path, self.user_to_idx_file_name)
    
    def get_train_data_movie_map_path(self):
        """Get path to movie id to train index file map
        Keyword arguments
        base_path -- The directory where files are located. 
        """
        return os.path.join(self.anonymized_data_path, self.movie_to_idx_file_name)
   
    def _coalesce_path(self,path1, path2):
      return path1 if path1 != "" else path2

    def get_utility_matrix_from_train(self):      
        # df = pd.read_csv(file_path) # Bug fix -- Check the whole dataset now and map all movies
        df, _, _ = self.get_complete_data()
        
        movie_id_to_idx_dict = self.get_movie_to_col_index()

        users = df.userId.unique()
        test_user_dict = {users[i]: i for i in range(len(users))}

        num_users = len(df.userId.unique())
        num_movies = len(movie_id_to_idx_dict) + 1
        utility_matrix = np.zeros((num_users, num_movies))

        for index, row in df.iterrows():
            movie_id, rating = int(row[COL_NAME_MOVIE_ID]), float(row[COL_NAME_RATING])
            utility_matrix[test_user_dict[row[COL_NAME_USER_ID]]][0] = int(row[COL_NAME_USER_ID])
            
            if movie_id in movie_id_to_idx_dict:
                utility_matrix[test_user_dict[row[COL_NAME_USER_ID]]][movie_id_to_idx_dict[movie_id]] = rating
            #else:
            #   print(f'Movie ID {movie_id} not found!')
        
        return utility_matrix
    
    def get_k_anonymized_map_path(self, k, base_path=""):
        """Get path to k-anonymzied map. 
        {k}_anonymized_idx_to_kanon_idx.json --> the mapping from user index 
        (row in the ratings_train.csv) to the row index in the 
        corresponding {k}_anonymized.csv file
        Keyword arguments
        k -- 
        base_path -- The directory where files are located. 
        """
        return os.path.join(self.anonymized_data_path,f"{k}{self.k_anonymized_map_postfix}")
    
    def get_col_to_movie_index(self):
      movie_to_idx_path = self.get_train_data_movie_map_path()
      with open(movie_to_idx_path) as json_file:
          movie_id_to_idx_dict = json.load(json_file)
      return {v:int(k) for k,v in movie_id_to_idx_dict.items()}

    def get_movie_to_col_index(self):
      movie_to_idx_path = self.get_train_data_movie_map_path()
      with open(movie_to_idx_path) as json_file:
          movie_id_to_idx_dict = json.load(json_file)
      return {int(k):v for k,v in movie_id_to_idx_dict.items()}

    def get_feature_vector_for_user(self, movie_ratings:list):
        """Returns a vector, with the same dimentions as the 
        training dataset. 
        Keyword arguments
        movie_ratings -- A list of tuples. Each tuple should be
        as follows: (movie_id, rating)
        """
        movie_to_idx_path = self.get_train_data_movie_map_path()
        with open(movie_to_idx_path) as json_file:
            movie_id_to_idx_dict = json.load(json_file)
        
        feature_vec = [0 for _ in range(len(movie_id_to_idx_dict))]
        movie_not_found= 0
        for movie_id, rating in movie_ratings:
            movie_id, rating = int(movie_id), int(rating)

            if movie_id in movie_id_to_idx_dict.keys():
                feature_vec[movie_id_to_idx_dict[movie_id]] = rating
            else:
                movie_not_found += 1
                #print(f'Movie ID {movie_id} not found!')
        print(f"{movie_not_found} movies not found.")
        return feature_vec
       
    def get_col_to_user_index(self):
      id_to_idx_path = self.get_train_data_user_map_path()
      with open(id_to_idx_path) as json_file:
          id_to_idx_dict = json.load(json_file)
      return {int(k):v for k,v in id_to_idx_dict.items()}
    
    def get_col_to_kanoncol_index(self, k):
      idx_to_kidx_path = self.get_k_anonymized_map_path(k, self.base_path)
      with open(idx_to_kidx_path) as json_file:
        idx_to_kidx_path_dict = json.load(json_file)
      return {int(k):v for k,v in idx_to_kidx_path_dict.items()}

    def get_ks(self):
      return [0, 5,8,12,15,20,25]

    def get_closest_k_cluster_to_user_id(self, k:int, metric="jacard"):
        id_to_idx_dict = self.get_col_to_user_index()
        idx_to_kidx_path_dict = self.get_col_to_kanoncol_index(k) if k!=0 else None
        k_anon_data_path = self.get_training_data_file_path(k)
       

        k_anaon_data = pd.read_csv(k_anon_data_path, sep=',', header=None).apply(pd.to_numeric).values

        u_matrix = self.get_utility_matrix_from_train()

        user_to_cluster_dict = {}
        for um in tqdm(u_matrix):
            user_id = int(um[0])
            row_vec = um[1:]
        
            # Easy case - We have trained on this user before
            # Just need to lookup, to see what cluster they belong to
            if user_id in id_to_idx_dict and\
            k!=0 and id_to_idx_dict[user_id] in idx_to_kidx_path_dict:            
              user_to_cluster_dict[user_id] = idx_to_kidx_path_dict[id_to_idx_dict[user_id]]
            elif user_id in id_to_idx_dict and k==0: # user maps to themselves, they were in training data
              user_to_cluster_dict[user_id] = id_to_idx_dict[user_id]
            else:
              #print(f'{user_id} not found in training data!')
              user_to_cluster_dict[user_id] = self.find_closest_point(k_anaon_data, row_vec, metric)
              #print(f'{user_id} mapped to {user_to_cluster_dict[user_id]}')
        
        return user_to_cluster_dict
      
    
def avg_mahalanobis_dist(user_movie_matrix, anon_matrix, useridx_to_cluster):
    """Average ``mahalanobis_dist`` between each mapped user and its cluster row.

    Keyword arguments
    user_movie_matrix -- numpy array, one row per user.
    anon_matrix -- numpy array, one row per cluster.
    useridx_to_cluster -- dict mapping a row index of ``user_movie_matrix``
    to a row index of ``anon_matrix``. Users without an entry are skipped.

    Returns the mean of the per-user distances (``np.mean`` of an empty list,
    i.e. nan, when no user is mapped).
    """
    # Per-column standard deviation of the cluster matrix, passed on to
    # mahalanobis_dist for every user.
    stdev = np.std(anon_matrix, axis=0)
    distances = []
    for u, user in enumerate(user_movie_matrix):
        ##some users are not mapped to clusters
        if u in useridx_to_cluster:
            # Was `idx_to_cluster[u]`, a name that is not defined here.
            cluster = anon_matrix[useridx_to_cluster[u]]
            distances.append(mahalanobis_dist(user, cluster, stdev))
    return np.mean(distances)