# Exercise 1

## Imports

In [15]:
import pickle
import os.path
import pyspark.sql.functions as F
from pyspark import Broadcast
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, ArrayType, FloatType
from itertools import combinations
from typing import Iterable, Any, List, Set

import pandas as pd
import numpy as np
import math
import random
from typing import Union

## Spark initialization

In [2]:
spark = SparkSession.builder \
    .appName('exercise1') \
    .config('spark.master', 'local[*]') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/17 12:08:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Prepare the data

In [3]:
tracks_df = (spark.read
    .option("multiline", "true")
    .option("quote", '"')
    .option("escape", '"')
    .csv('data/tracks.csv')threshold
)

# rename columns with row values from first row to second row
column_categories = list(zip(*tracks_df.take(2)))
columns = tracks_df.columns
tracks_df = tracks_df.select(F.col(columns[0]).alias('track_id'),
    *(F.col(column).alias("-".join(map(str, categories)))
    for column, categories in zip(columns[1:], column_categories[1:]))
)

tracks_df = (tracks_df
    .filter(F.col("track_id").isNotNull()) 
    .filter(F.col("track_id") != "track_id")
)

tracks_df.show()

23/04/17 12:08:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------+--------------+-------------------+-------------------+--------------+---------------+--------+--------------------+-------------+--------------------+----------+--------------------+------------+----------------+------------------------+----------------------+------------------------+--------------------+---------------+-------------------+----------------+---------+---------------+------------------+----------------+--------------------+--------------------+-----------------------+--------------------+--------------------+---------------------+---------+----------+--------------+--------------+--------------+-------------------+-------------------+--------------+---------------+---------------+------------+-----------------+--------------------+--------------+-------------------+------------------

In [4]:
# TODO: feature selection? not needed necessarily but...
features_df = (spark.read
    .csv('data/features.csv')
)

# rename columns with row values from first row to second row
column_categories = list(zip(*features_df.take(3)))
columns = features_df.columns
# TODO: DoubleType instead of FloatType?
features_df = features_df.select(F.col(columns[0]).alias('track_id'),
    *(F.col(column).cast(FloatType()).alias("-".join(map(str, categories)))
    for column, categories in zip(columns[1:], column_categories[1:]))
)

features_df = (features_df
    .filter(F.col("track_id") != "feature")
    .filter(F.col("track_id") != "statistics")
    .filter(F.col("track_id") != "number")
    .filter(F.col("track_id") != "track_id")
)

## Agglomerative clustering (in-memory)

In [5]:
small_tracks_df = tracks_df.filter(F.col("set-subset") == "small")
small_features_df = (features_df
    .join(small_tracks_df, "track_id", "left")
    .filter(F.col("set-subset").isNotNull())
    .select(features_df.columns)
)

music_features_pd = (small_features_df
    .drop("track_id")
    .toPandas()
)

                                                                                

In [6]:
# calculate the metrics (radius, diameter, density_r, density_d) for each cluster
def calculate_metrics(pd_df,centroids):
    
    cluster = pd_df["cluster"].values[0]
    
    metrics = pd.DataFrame({'radius': [0], 'diameter': [0],'density_r': [0],'density_d': [0]}, columns=['radius', 'diameter','density_r','density_d'])

    centroid = centroids[cluster].reshape(1,-1)

    matrix = pd_df.drop(columns=['cluster']).to_numpy()
    
    matrix_radius = np.sqrt(np.sum((matrix - centroid)**2, axis=1))

    metrics.loc[0,'radius'] = np.max(matrix_radius)

    # calculater density with radius
    metrics.loc[0,'density_r'] = len(pd_df) / metrics.loc[0,'radius']**2

    for i in range(matrix.shape[0]):

        matrix_diameter = np.sqrt(np.sum((matrix[i:,:] - matrix[i,:])**2, axis=1))

        max_diameter = np.max(matrix_diameter)
        if max_diameter > metrics.loc[0,'diameter']:
            metrics.loc[0,'diameter'] = max_diameter
        
    # calculater density with diameter
    metrics.loc[0,'density_d'] = len(pd_df) / metrics.loc[0,'diameter']**2

    return metrics

In [8]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid

metrics_pd_array = []

music_features_pd["cluster"] = np.zeros((len(music_features_pd), 1), dtype=np.int32)

if os.path.exists("./results/metrics_pd_array_pickle.pkl"):
    with open("metrics_pd_array_pickle.pkl",'rb') as f:
        metrics_pd_array = pickle.load(f)

else:
    # i = 8 until 16
    for i in range(8, 17):
        n_clusters = i
        clusterer = AgglomerativeClustering(n_clusters=n_clusters)
        clusterer.fit(music_features_pd.drop("cluster", axis=1))
        print(f"n_clusters: {n_clusters}, cluster_labels: {clusterer.labels_}")

        # calculate centroids

        centroid_calculator = NearestCentroid()

        centroid_calculator.fit(music_features_pd.drop("cluster",axis=1), clusterer.labels_)

        music_features_pd["cluster"] = clusterer.labels_
        
        metrics_pd_array.append(music_features_pd.groupby("cluster").apply(calculate_metrics,centroid_calculator.centroids_))

    # TODO: weird column at 0s without name??
    with open("./results/metrics_pd_array_pickle.pkl",'wb') as f:
        pickle.dump(metrics_pd_array, f)

n_clusters: 8, cluster_labels: [0 0 5 ... 1 3 3]
n_clusters: 9, cluster_labels: [8 8 5 ... 1 3 3]
n_clusters: 10, cluster_labels: [3 3 5 ... 0 1 1]
n_clusters: 11, cluster_labels: [3 3 2 ... 5 1 1]
n_clusters: 12, cluster_labels: [3 3 2 ... 5 0 0]
n_clusters: 13, cluster_labels: [ 1  1  2 ...  5  3 12]
n_clusters: 14, cluster_labels: [ 0  0  2 ...  5  1 12]
n_clusters: 15, cluster_labels: [14 14  2 ...  5  0 12]
n_clusters: 16, cluster_labels: [14 14  2 ...  5 15 12]


In [None]:
# TODO: justify N-clusters choice, maybe create a colored matrix?
for i in metrics_pd_array:
    density_r_average = i["density_r"].mean()
    density_d_average = i["density_d"].mean()
    density_r_variance = i["density_r"].var()
    density_d_variance = i["density_d"].var()

    print(f"Average density_r: {density_r_average}, Average density_d: {density_d_average}, Variance density_r: {density_r_variance}, Variance density_d: {density_d_variance}")
    print("\n")

## BFR Algorithm

In [None]:
# TODO: results can be: density of clusters, number of nodes in each cluster, etc., but not strictly necessary

In [9]:
n_clusters = 9
dimensions = len(features_df.columns) - 1   # don't consider 'track_id'

max_memory_used_megabytes = 4000
# Assumes all columns are floats/integers, and so therefore 4 bytes
rows_per_iteration = max_memory_used_megabytes // (4 * len(features_df.columns))
total_rows = features_df.count()

seed_random = 0

                                                                                

### Initialize clusters

In [10]:
# TODO: initialize K clusters/centroids with the small dataset that was processed

k_centroids_ids = random.choices(range(0, total_rows), k=n_clusters)

k_centroids = features_df.filter(F.col("track_id").isin(k_centroids_ids)).drop("track_id").collect()

                                                                                

In [24]:
# k_centroids não é igual ao numero de clusters, ISTO DEVE SER PORQUE HÁ IDS QUE NÃO EXISTEM NO DATASET
# !!! RESOLVER ISTO !!!
len(k_centroids)

5

### Loop

In [17]:
import dataclasses
import numpy.typing as npt

@dataclasses.dataclass(init=False)
class SummarizedCluster:
    n:      int                     
    sum_:   npt.NDArray[np.float32]
    sumsq_: npt.NDArray[np.float32]
    id_:    Union[int, None]

    def __init__(self, dimensions: int, id_: int=None):
        self.n = 0
        self.sum_ = np.zeros((dimensions,), dtype=np.float32)
        self.sumsq = np.zeros((dimensions,), dtype=np.float32)
        self.id_ = id_
    
    def summarize(self, point: npt.NDArray[np.float32], track_id: int):
        self.n += 1
        self.sum_ += point
        self.sumsq_ += np.pow(point, 2)

    def centroid(self) -> npt.NDArray[np.float32]:
        return self.sum_ / self.n

    def variance(self) -> npt.NDArray[np.float32]:
        return (self.sumsq_ / self.n) - np.pow(self.sum_ / self.n, 2)

    def standard_deviation(self) -> npt.NDArray[np.float32]:
        return np.sqrt(self.variance())

    def __add__(self, other) -> 'SummarizedCluster':
        if self.id_ is not None and self.other is not None and self.id_ != self.other:
            raise ValueError(f"Clusters {self} and {other} have different explicit ids ({self.id_} != {other.id_}).")
        res = SummarizedCluster(self.dimensions, )
        res.n = self.n + other.n
        res.sum_ = self.sum_ + other.sum_
        res.sumsq_ = self.sumsq_ + other.sumsq_
        res.id_ = self.id_ if self.id_ is not None else other.id_
        return res

In [18]:
discard_sets: List[SummarizedCluster] = [SummarizedCluster(dimensions, id_) for id_ in range(n_clusters)]
compression_sets: List[SummarizedCluster] = []
retained_set: List[npt.NDArray[np.float32]] = []

# Threshold in terms of standard deviations away from centroid, in each dimension
cluster_distance_threshold_standard_deviations = 1
cluster_distance_threshold = ((cluster_distance_threshold_standard_deviations**2) * dimensions) ** 0.5

assert len(k_centroids) == n_clusters, "The number of clusters does not coincide with the number of random centroids!"

for centroid, discard_set in zip(k_centroids, discard_sets):
    discard_set.summarize(np.array(centroid))

def mahalanobis_distance(x: npt.NDArray[np.float32], s: SummarizedCluster) -> float:
    return np.sqrt(np.sum(np.pow((x - s.centroid()) / s.standard_deviation(), 2)))

@F.udf(returnType=FloatType())
def mahalanobis_distance_column(*features: float):
    x = np.array(features)
    closest_cluster_distance, closest_cluster = min((mahalanobis_distance(x, d), d.id_) for d in discard_sets)
    return closest_cluster if closest_cluster_distance < cluster_distance_threshold else None

AssertionError: The number of clusters does not coincide with the number of random centroids!

In [None]:
from sklearn.neighbors import DBSCAN

features_columns = set(features_df.columns) - {"track_id"}
split_weights = [1.0] * (total_rows // rows_per_iteration)
dbscan_eps = 0.5    # TODO: parameter?
# TODO: choose threshold and justify decision
compression_set_merge_variance_threshold = float('-inf')

for loaded_points_df in features_df.filter(~F.col("track_id").isin(k_centroids_ids)).randomSplit(split_weights, seed=seed_random):
    cluster_mapping = loaded_points_df \
        .withColumn("cluster", mahalanobis_distance_column(*features_columns)) \
        .groupby("cluster") \
        .agg({
            "track_ids": F.collect_list("track_id"),
            "features_list": F.collect_list(F.array(*features_columns))
        }).collect()
    
    for row in cluster_mapping:
        cluster = row["cluster"]
        track_ids = row["track_ids"]
        features_list = row["features_list"]

        # Step 3 - check which points go to the discard sets
        if cluster is not None:
            discard_set = discard_sets[cluster]
            for track_id, features in zip(track_ids, features_list):
                discard_set.summarize(np.array(features), track_id)
        
        else:
            # Step 4 - check which points go to the compression sets or the retained set
            matrix_to_cluster = np.array(features_list + retained_set)

            # Use same distance as above
            clusterer = DBSCAN(eps=dbscan_eps, metric='euclidean')
            clusterer.fit(matrix_to_cluster)

            centroid_calculator = NearestCentroid()
            centroid_calculator.fit(matrix_to_cluster, clusterer.labels_)

            retained_set.clear()

            # Create compression sets
            compression_sets_temp = [SummarizedCluster(dimensions, None) for _ in centroid_calculator.centroids_]

            for point_idx in range(matrix_to_cluster.shape[0]):
                cluster_id = clusterer.labels_[point_idx]
                point = matrix_to_cluster[point_idx, :]
                
                if cluster_id == -1:
                    retained_set.append(point)
                else:
                    compression_sets_temp[cluster_id].summarize(point)
            
            compression_sets.extend(compression_sets_temp)
    
    # Step 5 - merge compression sets
    compressing = True
    while compressing:
        merged_compression_sets = []
        compression_sets_to_remove = []
        for (idx_1, compression_set_1), (idx_2, compression_set_2) in combinations(enumerate(compression_sets), 2):
            merged_compression_set = compression_set_1 + compression_set_2
            if merged_compression_set.variance() < compression_set_merge_variance_threshold:
                merged_compression_sets.append(merged_compression_set)
                compression_sets_to_remove

# Step 6 - merge CS and RS into DS (but we won't)
...