In [54]:
# Environment setup: install PySpark and a headless Java 8 JDK through the
# notebook's `!` shell escape.
!pip install pyspark
!apt install openjdk-8-jdk-headless -qq

import os
# Point JAVA_HOME at the Java 8 install directory before Spark is started.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u292-b10-0ubuntu1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.decomposition import SparsePCA

import scipy.sparse as sparse

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

from datetime import datetime
import os

import json

In [None]:
# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one that is already running. Calling the
# SparkContext constructor a second time in the same kernel raises, which made
# this cell fail on re-run; getOrCreate keeps the cell idempotent.
sc = SparkContext.getOrCreate(conf=conf)

# Create (or reuse) the SQL session on top of that context.
spark = SparkSession.builder.getOrCreate()

In [57]:
# Mount Google Drive at /content/drive; the dataset paths configured later in
# this notebook live under this mount point.
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [58]:
# Identifiers for the supported distance measures; they are passed around as
# the `distance_measure` argument of the functions defined below.
EUCLIDEAN_DISTANCE, MANHATTAN_DISTANCE = 1, 2

In [59]:
# Directory that holds the input ratings matrix; the anonymized outputs are
# written back into the same directory. Alternative locations used in earlier
# runs are kept commented out.
# files_path = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k'
files_path = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/train_test_v2/rawdata_split/iter_5_fix'
# files_path = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k/ratings_split/iter_1_fix'

# Full path of the user x movie ratings CSV that gets anonymized.
# user_movie_ratings_matrix = os.path.join(files_path, "user_movie_ratings_matrix.csv")
user_movie_ratings_matrix = os.path.join(files_path, "user_movie_ratings_matrix.csv")

# Medoid input files, only referenced by the commented-out medoid experiment.
# mediods_400 = os.path.join(files_path, "400_mediods.csv")
# mediods_4000 = os.path.join(files_path, "4000_mediods.csv")

In [60]:
def euclidean_distance(point1, point2):
    """Return the squared Euclidean (L2) distance between two points.

    The L2 norm of the difference is squared before it is returned, so the
    value is a squared distance rather than the distance itself.
    """
    offset = point1 - point2
    length = np.linalg.norm(offset)
    return length ** 2

In [61]:
def manhattan_distance(point1, point2):
    """Return the Manhattan (L1) distance: the sum of absolute coordinate differences."""
    offset = point1 - point2
    return np.abs(offset).sum()

In [62]:
def get_distance(point, centroid, distance_measure = EUCLIDEAN_DISTANCE):
    """Pair a point with its distance to a reference point.

    Args:
        point: the data point.
        centroid: the reference point the distance is measured to.
        distance_measure: EUCLIDEAN_DISTANCE or MANHATTAN_DISTANCE.

    Returns:
        A ``(point, distance)`` tuple.

    Raises:
        NotImplementedError: if ``distance_measure`` is neither of the two
            supported measures.
    """
    if distance_measure == EUCLIDEAN_DISTANCE:
        return (point, euclidean_distance(point, centroid))

    if distance_measure == MANHATTAN_DISTANCE:
        return (point, manhattan_distance(point, centroid))

    raise NotImplementedError(f'{distance_measure} is not implemented.')

In [63]:
def get_centroid(data_rdd, distance_measure = EUCLIDEAN_DISTANCE):
    """Return the data point that lies closest to the mean of all points.

    Args:
        data_rdd: RDD of ``(index, point)`` pairs.
        distance_measure: measure forwarded to ``get_distance``.

    Returns:
        The selected point as a NumPy array. Note that this is an actual
        member of the data set, not the arithmetic mean itself.
    """
    total = data_rdd.count()
    mean_point = data_rdd.values().sum() / total

    def with_distance(record):
        # (index, point) -> (index, (point, distance to the mean))
        idx, point = record
        return (idx, get_distance(point, mean_point, distance_measure))

    def by_distance(entry):
        return entry[1][1]

    nearest = data_rdd.map(with_distance).takeOrdered(1, by_distance)
    _, (closest_point, _) = nearest[0]

    return np.array(closest_point)

In [64]:
def get_farthest_from_centroid(data_rdd, centroid, distance_measure = EUCLIDEAN_DISTANCE):
    """Return the data point with the largest distance to ``centroid``.

    Args:
        data_rdd: RDD of ``(index, point)`` pairs.
        centroid: reference point the distances are measured to.
        distance_measure: measure forwarded to ``get_distance``.

    Returns:
        The farthest point as a NumPy array.
    """
    def with_distance(record):
        # (index, point) -> (index, (point, distance to the reference))
        idx, point = record
        return (idx, get_distance(point, centroid, distance_measure))

    def by_descending_distance(entry):
        # Negated so that takeOrdered, which sorts ascending, yields the maximum.
        return -entry[1][1]

    farthest = data_rdd.map(with_distance).takeOrdered(1, by_descending_distance)
    _, (farthest_point, _) = farthest[0]

    return np.array(farthest_point)

In [65]:
def microaggregate(points_list, method = 'avg'):
    """Collapse a group of points into a single representative point.

    With ``method='avg'`` each coordinate is averaged over the non-zero
    entries only: a zero is treated as "no rating", so it is excluded from
    both the sum and the count. Coordinates with no non-zero entry in the
    whole group come out as 0.

    Args:
        points_list: sequence of equally shaped numeric arrays (one per point).
        method: aggregation method; only ``'avg'`` is supported.

    Returns:
        NumPy float array holding the aggregated point.

    Raises:
        NotImplementedError: if ``method`` is not ``'avg'``.
    """
    # Local import so the function does not rely on a module-level import.
    import warnings

    if method != 'avg':
        raise NotImplementedError(f'{method} is not implemented.')

    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    # np.array copies, so the caller's arrays are never modified.
    ratings = np.array(points_list, dtype=np.float64)

    # Mark "not rated" cells as NaN so that nanmean skips them.
    ratings[ratings == 0] = np.nan

    with warnings.catch_warnings():
        # A coordinate nobody rated is an all-NaN slice; nanmean warns about it
        # ("Mean of empty slice"). That case is expected and mapped to 0 below.
        warnings.simplefilter('ignore', category=RuntimeWarning)
        column_means = np.nanmean(ratings, axis=0)

    return np.nan_to_num(column_means)

In [66]:
def group_k_points(data_rdd, center, k, method = 'avg', distance_measure = EUCLIDEAN_DISTANCE):
    """Select the ``k`` points closest to ``center`` and microaggregate them.

    Args:
        data_rdd: RDD of ``(index, point)`` pairs.
        center: point the group is built around.
        k: number of points to select.
        method: aggregation method forwarded to ``microaggregate``.
        distance_measure: measure forwarded to ``get_distance``.

    Returns:
        ``(microaggregated, k_closest)`` where ``microaggregated`` is the
        aggregated point and ``k_closest`` is the list of selected
        ``(index, point)`` pairs, nearest first.
    """
    def with_distance(record):
        # (index, point) -> (index, (point, distance to center))
        return (record[0], get_distance(record[1], center, distance_measure))

    nearest = data_rdd.map(with_distance).takeOrdered(k, lambda entry: entry[1][1])

    k_closest = []
    members = []
    for idx, (point, _distance) in nearest:
        k_closest.append((idx, point))
        members.append(point)

    return microaggregate(members, method), k_closest

In [67]:
def MDAV(data_rdd, k = 5, distance_measure = EUCLIDEAN_DISTANCE, method = 'avg'):
    """Group the points into cells of at least ``k`` members and microaggregate each cell.

    While at least ``2*k`` points remain, each round:
      1. picks C via ``get_centroid``,
      2. picks P, the point farthest from C, and Q, the point farthest from P,
      3. builds a group of the ``k`` points nearest to P and then a group of
         the ``k`` points nearest to Q (from what is left), removing each
         group from the working set.
    If between ``k`` and ``2*k - 1`` points are left afterwards they form one
    final group. Fewer than ``k`` leftover points are dropped: they get no row
    in the output and no entry in the mapping.

    Args:
        data_rdd: RDD of ``(index, point)`` pairs with NumPy-array points.
        k: minimum group size; must be >= 1.
        distance_measure: EUCLIDEAN_DISTANCE or MANHATTAN_DISTANCE.
        method: aggregation method forwarded to ``microaggregate``.

    Returns:
        ``(anonymized, user_idx_to_k_anon_idx)``: ``anonymized`` stacks one
        microaggregated row per group; ``user_idx_to_k_anon_idx`` maps every
        grouped point index to the row index of its group in ``anonymized``.

    Raises:
        ValueError: if ``k`` is smaller than 1 (the loop would never terminate).
    """
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k}.')

    num_points = data_rdd.count()

    sample = data_rdd.take(1)
    if not sample:
        # Empty input: nothing to group.
        return np.empty((0, 0)), {}
    point_shape = sample[0][1].shape

    # Use the context that owns the RDD rather than a notebook-global `sc`.
    spark_context = data_rdd.context

    group_rows = []  # one microaggregated point per group, stacked once at the end
    user_idx_to_k_anon_idx, k_anon_index = {}, 0

    while num_points >= 2*k:
        C = get_centroid(data_rdd, distance_measure)
        P = get_farthest_from_centroid(data_rdd, C, distance_measure)
        Q = get_farthest_from_centroid(data_rdd, P, distance_measure)

        for k_anon_center in (P, Q):
            microaggregated, grouped = group_k_points(data_rdd, k_anon_center, k, method, distance_measure)
            group_rows.append(microaggregated)

            # Keep the mapping user_id --> k_anon_index. `grouped` is already a
            # local list, so the keys are read without a Spark round trip.
            for user_idx, _point in grouped:
                user_idx_to_k_anon_idx[user_idx] = k_anon_index
            k_anon_index += 1

            # Remove the grouped points from the main datapoints.
            data_rdd = data_rdd.subtractByKey(spark_context.parallelize(grouped))

        num_points = data_rdd.count()

    # The loop exits with num_points < 2*k, so this covers k .. 2*k - 1
    # inclusive. (The previous strict comparison dropped leftovers of exactly
    # k or 2*k - 1 points, both of which are valid groups.)
    if num_points >= k:
        leftovers = data_rdd.collect()
        group_rows.append(microaggregate([point for _idx, point in leftovers], method))

        # The final group's members must be in the mapping as well.
        for user_idx, _point in leftovers:
            user_idx_to_k_anon_idx[user_idx] = k_anon_index
    # else: fewer than k points remain. Forcing them into an existing group
    # could corrupt that group's data, so they are deliberately ignored.

    if not group_rows:
        # No group could be formed (fewer than k points in total).
        return np.empty((0,) + tuple(point_shape[-1:])), user_idx_to_k_anon_idx

    anonymized = np.vstack(group_rows)
    return anonymized, user_idx_to_k_anon_idx

In [68]:
# Run MDAV on every input file for every k and write the results next to the input.
distance_measure = EUCLIDEAN_DISTANCE
method = 'avg'

unreduced_files = [user_movie_ratings_matrix]
# Largest k first. This must be a list, not a bare `reversed(...)` iterator:
# an iterator is exhausted after the first file, so every later file would
# silently be skipped.
# k_values = list(reversed([2, 3, 5, 8, 12, 15, 20, 25]))
# k_values = list(reversed([5, 8, 12, 15, 20, 25]))
k_values = list(reversed([5]))

for unreduced_file in unreduced_files:
    unreduced_file_df = spark.read.csv(unreduced_file, header=True, inferSchema=True)

    # Assemble all CSV columns into a single vector column.
    feature_columns = unreduced_file_df.columns
    vector_assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'features_sparse')
    features_df = vector_assembler.transform(unreduced_file_df).select('features_sparse')

    # (row index, point) pairs. Cached because MDAV runs many Spark actions on
    # this RDD and would otherwise re-read and re-assemble the CSV for each one.
    unreduced_file_rdd = (
        features_df.rdd
        .zipWithIndex()
        .map(lambda x: (x[1], np.array(x[0])))
        .cache()
    )

    # first() fetches a single record instead of collecting the whole data set.
    print(f'Starting file={unreduced_file} --> Confirmed: {unreduced_file_rdd.first()[1].shape}')

    for k in k_values:
        anonymized, user_idx_to_k_anon_idx = MDAV(unreduced_file_rdd, k, distance_measure, method)

        # One CSV row per anonymized group.
        anon_file_name = f'{k}_anonymized.csv'
        anon_file_path = os.path.join(files_path, anon_file_name)
        np.savetxt(anon_file_path, anonymized, delimiter=",")

        # Mapping from original row index to group row index.
        user_to_k_anon_idx_file = f'{k}_anonymized_idx_to_kanon_idx.json'
        with open(os.path.join(files_path, user_to_k_anon_idx_file), "w") as f1:
            json.dump(user_idx_to_k_anon_idx, f1, indent=4)

        print(f'\tDone with k = {k}')

Starting file=/content/drive/MyDrive/CSE547_Final_Project/ml-100k/train_test_v2/rawdata_split/iter_5_fix/user_movie_ratings_matrix.csv --> Confirmed: (1, 8374)


  if __name__ == '__main__':


	Done with k = 5


In [69]:
# distance_measure = EUCLIDEAN_DISTANCE
# method = 'avg'

# # mediod_files = [mediods_400, mediods_4000]
# mediod_files = [mediods_4000]
# k_values = reversed([2, 3, 5, 8, 12, 15, 20, 25])
# # k_values = reversed([25])

# for mediod_file in mediod_files:
#     num_mediods = int(mediod_file.split('/')[-1].split('_')[0])

#     mediod_df = spark.read.csv(mediod_file, header=False, inferSchema=True)

#     feature_columns = mediod_df.columns
#     print(feature_columns)
#     vector_assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'features_sparse')
#     mediod_df = vector_assembler.transform(mediod_df).select(['features_sparse'])
#     mediod_df = mediod_df.select("features_sparse").rdd
#     mediod_df = mediod_df.zipWithIndex()
#     mediod_df = mediod_df.map(lambda x: (x[1], np.array(x[0])) )

#     print(f'Starting mediods={num_mediods} --> Confirmed: {mediod_df.collect()[0][1].shape}')

#     for k in k_values:
#         anonymized, user_idx_to_k_anon_idx = MDAV(mediod_df, k, distance_measure, method)

#         anon_file_name = f'{num_mediods}_mediods_{k}_anonymized.csv'
#         anon_file_path = os.path.join(files_path, anon_file_name)
#         np.savetxt(anon_file_path, anonymized, delimiter=",")

#         user_to_k_anon_idx_file = f'{num_mediods}_mediods_{k}_anonymized_idx_to_kanon_idx.json'
#         with open(os.path.join(files_path, user_to_k_anon_idx_file), "w") as f1:
#             json.dump(user_idx_to_k_anon_idx, f1, indent=4)

#         print(f'\tDone with k = {k}')

In [70]:
# Toy example for checking the microaggregation average: three points with
# four coordinates each, where 0 stands for "no rating".
l = [np.array([1,4,5,0]), np.array([2,0,3,0]), np.array([4,0,0,0])]
# np.float was removed in NumPy 1.24; np.float64 is the same dtype.
r = np.array(l, dtype=np.float64)

In [71]:
r

array([[1., 4., 5., 0.],
       [2., 0., 3., 0.],
       [4., 0., 0., 0.]])

In [72]:
# Replace every zero entry with NaN so that the NaN-aware mean skips it.
r[r == 0] = np.nan

In [73]:
r

array([[ 1.,  4.,  5., nan],
       [ 2., nan,  3., nan],
       [ 4., nan, nan, nan]])

In [74]:
# Column-wise mean over the non-NaN entries; nan_to_num turns the NaN produced
# for a column without any non-NaN entry into 0.
microaggregated = np.nan_to_num(np.nanmean(r, axis=0))
microaggregated

  """Entry point for launching an IPython kernel.


array([2.33333333, 4.        , 4.        , 0.        ])