# Exercise 1

## Imports

In [1]:
import os.path
import pyspark.sql.functions as F
from pyspark import Broadcast
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, ArrayType, FloatType
from itertools import combinations
from typing import Iterable, Any, List, Set

import pandas as pd
import numpy as np
import math

## Spark initialization

In [2]:
# Create (or reuse) the Spark session for this notebook, with the master
# set to 'local[*]'.
spark = (
    SparkSession.builder
    .appName('exercise1')
    .config('spark.master', 'local[*]')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/16 15:24:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Prepare the data

In [3]:
# Read the raw tracks table: fields are quoted with '"', the escape character
# is also '"', and multiline parsing is enabled.
tracks_df = spark.read.options(multiline="true", quote='"', escape='"').csv('data/tracks.csv')

# The file is read without a header, so the first two data rows are used as
# column labels: each column is renamed to "<row 1 value>-<row 2 value>".
# The first column is always called 'track_id'.
header_rows = tracks_df.take(2)
column_categories = list(zip(*header_rows))
columns = tracks_df.columns
renamed_columns = [
    F.col(column).alias("-".join(map(str, categories)))
    for column, categories in zip(columns[1:], column_categories[1:])
]
tracks_df = tracks_df.select(F.col(columns[0]).alias('track_id'), *renamed_columns)

# Keep only rows with a real id: drop rows whose first column is null or is
# the literal text "track_id" (presumably the header rows read in as data).
tracks_df = tracks_df.where(
    F.col("track_id").isNotNull() & (F.col("track_id") != "track_id")
)

tracks_df.show()

23/04/16 15:24:48 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------+--------------+-------------------+-------------------+--------------+---------------+--------+--------------------+-------------+--------------------+----------+--------------------+------------+----------------+------------------------+----------------------+------------------------+--------------------+---------------+-------------------+----------------+---------+---------------+------------------+----------------+--------------------+--------------------+-----------------------+--------------------+--------------------+---------------------+---------+----------+--------------+--------------+--------------+-------------------+-------------------+--------------+---------------+---------------+------------+-----------------+--------------------+--------------+-------------------+------------------

In [4]:
features_df = spark.read.csv('data/features.csv')

# The file is read without a header, so the first three data rows are used as
# column labels: each column is renamed to "<row 1>-<row 2>-<row 3>" and cast
# to float. The first column is always called 'track_id'.
header_rows = features_df.take(3)
column_categories = list(zip(*header_rows))
columns = features_df.columns
# TODO: DoubleType instead of FloatType?
renamed_feature_columns = [
    F.col(column).cast(FloatType()).alias("-".join(map(str, categories)))
    for column, categories in zip(columns[1:], column_categories[1:])
]
features_df = features_df.select(F.col(columns[0]).alias('track_id'), *renamed_feature_columns)

# Drop the rows whose first column is one of the header markers
# (rows with a null first column are dropped as well, as before).
header_markers = ("feature", "statistics", "number", "track_id")
features_df = features_df.filter(~F.col("track_id").isin(*header_markers))

## Agglomerative clustering (in-memory)

In [5]:
# Tracks that belong to the "small" subset.
small_tracks_df = tracks_df.where(F.col("set-subset") == "small")

# Features of those tracks only: left-join the track metadata, keep the rows
# that found a match, then project back to the feature columns.
small_features_df = (
    features_df
    .join(small_tracks_df, on="track_id", how="left")
    .where(F.col("set-subset").isNotNull())
    .select(*features_df.columns)
)

# Bring the feature matrix (without the id column) into pandas for the
# in-memory clustering below.
music_features_pd = small_features_df.drop("track_id").toPandas()

                                                                                

In [14]:
# calculate the metrics (radius, diameter, density_r, density_d) for each cluster
def calculate_metrics(pd_df, centroids):
    """Compute size and density metrics for the points of a single cluster.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        Points of ONE cluster. Must contain a "cluster" column holding the
        (constant) cluster label; every other column is treated as a feature.
    centroids : array-like, indexable by cluster label
        ``centroids[label]`` is the centroid of that cluster. It may have one
        entry per feature column, or one entry per column of ``pd_df``
        (i.e. including the "cluster" column position); in the latter case
        the label entry is ignored.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns:
        ``radius``    - largest Euclidean distance from a point to the centroid,
        ``diameter``  - largest Euclidean distance between two points,
        ``density_r`` - number of points / radius ** 2,
        ``density_d`` - number of points / diameter ** 2.
        A density is ``inf`` when its radius/diameter is 0.
    """
    cluster = pd_df["cluster"].values[0]

    # Distances are measured in feature space only: the label column is not a
    # feature and must not take part in (or change the dimension of) the
    # distance computation.
    is_feature = pd_df.columns != "cluster"
    points = pd_df.loc[:, is_feature].to_numpy(dtype=float)

    centroid = np.asarray(centroids[cluster], dtype=float)
    if centroid.shape[0] == len(pd_df.columns):
        # The centroid was computed on a frame that already carried the label
        # column; drop that position so it lines up with `points`.
        centroid = centroid[is_feature]

    n_points = len(points)

    # Radius: farthest point from the centroid.
    radius = float(np.linalg.norm(points - centroid, axis=1).max())

    # Diameter: farthest pair of points. Each point is compared (vectorised)
    # with the points after it, so every pair is visited exactly once.
    diameter = 0.0
    for i in range(n_points - 1):
        farthest = float(np.linalg.norm(points[i + 1:] - points[i], axis=1).max())
        if farthest > diameter:
            diameter = farthest

    # Densities; a zero radius/diameter (e.g. single-point cluster) gives inf.
    density_r = n_points / radius ** 2 if radius > 0 else math.inf
    density_d = n_points / diameter ** 2 if diameter > 0 else math.inf

    return pd.DataFrame(
        {
            'radius': [radius],
            'diameter': [diameter],
            'density_r': [density_r],
            'density_d': [density_d],
        },
        columns=['radius', 'diameter', 'density_r', 'density_d'],
    )
    

In [15]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid

metrics_pd_array = []

# Never cluster on a label column: if a "cluster" column is already present
# in the feature frame (e.g. left behind by an earlier run of this cell in
# the same kernel), ignore it.
features_only_pd = music_features_pd.drop(columns="cluster", errors="ignore")

# Try every number of clusters from 8 to 16 (inclusive).
for n_clusters in range(8, 17):
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    clusterer.fit(features_only_pd)
    print(f"n_clusters: {n_clusters}, cluster_labels: {clusterer.labels_}")

    # Attach the labels to a NEW frame. Writing them into the feature frame
    # itself would make the next iteration cluster on the previous labels.
    music_features_pd_with_cluster = features_only_pd.assign(cluster=clusterer.labels_)

    # Centroids are computed over exactly the columns that calculate_metrics
    # receives (features + label column), so point and centroid dimensions
    # always agree. The label is constant inside a cluster, so that extra
    # coordinate contributes 0 to every within-cluster distance.
    centroid_calculator = NearestCentroid()
    centroid_calculator.fit(music_features_pd_with_cluster, clusterer.labels_)

    # One metrics row per cluster, indexed by (cluster, 0) as groupby.apply
    # would produce. Iterating the groups explicitly guarantees each group
    # still contains its "cluster" column.
    metrics_per_cluster = {
        label: calculate_metrics(group, centroid_calculator.centroids_)
        for label, group in music_features_pd_with_cluster.groupby("cluster")
    }
    metrics_pd_array.append(pd.concat(metrics_per_cluster, names=["cluster"]))

n_clusters: 8, cluster_labels: [0 0 5 ... 1 3 3]
n_clusters: 9, cluster_labels: [8 8 5 ... 1 3 3]
n_clusters: 10, cluster_labels: [3 3 5 ... 0 1 1]
n_clusters: 11, cluster_labels: [3 3 2 ... 5 1 1]
n_clusters: 12, cluster_labels: [3 3 2 ... 5 0 0]


KeyboardInterrupt: 

## BFR Algorithm

In [None]:
# Placeholder: must be replaced by the chosen number of clusters (K) before
# the following cells can run.
n_clusters = ...
dimensions = len(features_df.columns) - 1   # don't consider 'track_id'

total_rows = features_df.count()

max_memory_used_megabytes = 4000
# Assumes all columns are floats/integers, and so therefore 4 bytes
bytes_per_row = 4 * len(features_df.columns)
# Convert the budget from megabytes to bytes before dividing by the row size
# (dividing the megabyte figure directly yields ~1 row per iteration).
# Clamp to [1, total_rows] so that `total_rows // rows_per_iteration` is
# never a division by zero and is at least 1 whenever there is data.
rows_per_iteration = max(
    1,
    min(total_rows, (max_memory_used_megabytes * 1024 * 1024) // bytes_per_row),
)

seed_random = 0

### Initialize clusters

In [None]:
# TODO: initialize K clusters/centroids, should they be removed from the dataset?

# Pick `n_clusters` distinct rows at random (reproducibly, using the seed) and
# remember their ids. Sampling actual `track_id` values - rather than integer
# positions 0..total_rows-1 - guarantees that every chosen id exists, and
# `limit` over a shuffled frame cannot return the same row twice.
k_centroids_ids = [
    row["track_id"]
    for row in (features_df
        .select("track_id")
        .orderBy(F.rand(seed=seed_random))
        .limit(n_clusters)
        .collect()
    )
]

# Feature vectors (without the id column) of the chosen rows.
k_centroids = features_df.filter(F.col("track_id").isin(k_centroids_ids)).drop("track_id").collect()

### Loop

In [None]:
import dataclasses
import numpy.typing as npt

@dataclasses.dataclass(init=False)
class SummarizedCluster:
    n:      int                     
    sum_:   npt.NDArray[np.float32]
    sumsq_: npt.NDArray[np.float32]
    id_:    int | None 
    # TODO: all in-memory, is that not good?
    tracks: Set[int]

    def __init__(self, dimensions: int, id_: int=None):
        self.n = 0
        self.sum_ = np.zeros((dimensions,), dtype=np.float32)
        self.sumsq = np.zeros((dimensions,), dtype=np.float32)
        self.id_ = id_
        self.tracks = set()
    
    def summarize(self, point: npt.NDArray[np.float32], track_id: int):
        self.n += 1
        self.sum_ += point
        self.sumsq_ += np.pow(point, 2)
        self.tracks.add(track_id)

    def unsummarize(self, point: npt.NDArray[np.float32], track_id: int):
        self.n -= 1
        self.sum_ -= point
        self.sumsq_ -= np.pow(point, 2)
        self.tracks.remove(track_id)
    
    def centroid(self) -> npt.NDArray[np.float32]:
        return self.sum_ / self.n

    def variance(self) -> npt.NDArray[np.float32]:
        return (self.sumsq_ / self.n) - np.pow(self.sum_ / self.n, 2)

    def standard_deviation(self) -> npt.NDArray[np.float32]:
        return np.sqrt(self.variance())

In [None]:
# BFR state: one discard set per cluster (its id is its position in the
# list), plus the initially empty compression sets and retained set.
discard_sets: List[SummarizedCluster] = list(
    SummarizedCluster(dimensions, id_) for id_ in range(n_clusters)
)
compression_sets: List[SummarizedCluster] = list()
retained_set: List[npt.NDArray[np.float32]] = list()

# Threshold in terms of standard deviations away from centroid, in each dimension
cluster_distance_threshold_standard_deviations = 1
cluster_distance_threshold = (
    (cluster_distance_threshold_standard_deviations**2) * dimensions
) ** 0.5

assert len(k_centroids) == n_clusters, "The number of clusters does not coincide with the number of random centroids!"

# Seed every discard set with its initial centroid.
for discard_set, centroid in zip(discard_sets, k_centroids):
    discard_set.summarize(np.array(centroid))

def mahalanobis_distance(x: npt.NDArray[np.float32], s: "SummarizedCluster") -> float:
    """Normalized distance between point `x` and the summarized cluster `s`.

    Each coordinate difference to the cluster centroid is divided by the
    cluster's standard deviation in that dimension; the result is the
    Euclidean norm of those normalized differences.

    Zero-variance dimensions: a coordinate equal to the centroid contributes
    0 (instead of the NaN produced by 0/0); a differing coordinate makes the
    distance infinite.
    """
    deviation = x - s.centroid()
    spread = s.standard_deviation()

    # Division by a zero standard deviation is handled explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        normalized = deviation / spread
    normalized = np.where((deviation == 0) & (spread == 0), 0.0, normalized)

    return float(np.sqrt(np.sum(np.square(normalized))))

# The UDF returns a cluster id, so its Spark return type must be an integer
# type: with a float return type, Python ints are not converted and come
# back as null.
# NOTE(review): `discard_sets` is captured by closure; whether updates made on
# the driver between iterations reach the executors depends on when Spark
# serializes the function - confirm, or pass the summaries via a broadcast.
@F.udf(returnType="int")
def mahalanobis_distance_column(*features: float):
    """Return the id of the closest discard set, or None when no set is close enough.

    `features` are the feature values of one row, in the same order as the
    dimensions of the discard-set summaries. The row is assigned to the
    discard set with the smallest `mahalanobis_distance`, provided that
    distance is below `cluster_distance_threshold`.
    """
    # A row with missing feature values cannot be compared to any centroid.
    if any(value is None for value in features):
        return None

    x = np.array(features)
    closest_cluster_distance, closest_cluster = min((mahalanobis_distance(x, d), d.id_) for d in discard_sets)
    return int(closest_cluster) if closest_cluster_distance < cluster_distance_threshold else None

In [None]:
# Feature columns in their DataFrame order. The order matters: the values are
# passed positionally to the distance UDF and must line up with the
# dimensions of the centroids (taken from features_df without 'track_id').
# A set would not preserve that order.
features_columns = [column for column in features_df.columns if column != "track_id"]

# One equally weighted split per chunk of `rows_per_iteration` rows: round up
# so a partial chunk gets its own split, and always keep at least one split.
split_weights = [1.0] * max(1, math.ceil(total_rows / rows_per_iteration))
# Main BFR pass: the points that were not chosen as initial centroids are
# split into random chunks, and each chunk is processed in turn.
# NOTE(review): this cell looks unfinished (see the TODO/REVIEW markers);
# the notes below flag concrete problems and leave the code untouched.
for loaded_points_df in features_df.filter(~F.col("track_id").isin(k_centroids_ids)).randomSplit(split_weights, seed=seed_random):
    # Step 3 - check which points go to the discard sets
    # Each row is labelled with the id returned by the distance UDF (null when
    # no discard set is close enough), then ids and feature vectors are
    # collected per label.
    # NOTE(review): the dict form of GroupedData.agg is documented as
    # {column name: aggregate function name}; here the keys are new names and
    # the values are Column expressions - confirm this works, the intent looks
    # like `.agg(F.collect_list(...).alias("track_ids"), ...)`.
    cluster_mapping = loaded_points_df \
        .withColumn("cluster", mahalanobis_distance_column(*features_columns)) \
        .groupby("cluster") \
        .agg({
            "track_ids": F.collect_list("track_id"),
            "features_list": F.collect_list(F.array(*features_columns))
        }).collect()
    
    for row in cluster_mapping:
        cluster = row["cluster"]
        track_ids = row["track_ids"]
        features_list = row["features_list"]

        if cluster is not None:
            # Points close enough to a discard set are folded into its summary.
            discard_set = discard_sets[cluster]
            for track_id, features in zip(track_ids, features_list):
                discard_set.summarize(np.array(features), track_id)
        
        else:

            # Step 4 - check which points go to the compression sets or the retained set
            #!!! REVIEW THIS STEP

            # Unassigned points of this chunk plus the points retained so far.
            matrix_to_cluster = np.array(features_list + retained_set)

            # Use same distance as above
            # TODO: how
            # NOTE(review): scikit-learn requires n_clusters=None when
            # distance_threshold is set - confirm; as written the constructor
            # arguments look incompatible.
            clusterer = AgglomerativeClustering(distance_threshold=cluster_distance_threshold)
            clusterer.fit(matrix_to_cluster)

            centroid_calculator = NearestCentroid()
            centroid_calculator.fit(matrix_to_cluster, clusterer.labels_)

            # The retained set is rebuilt from scratch below.
            retained_set.clear()

            # Create compression sets
            # One empty summary (without id) per mini-cluster centroid.
            compression_sets_temp = [SummarizedCluster(dimensions, None) for _ in centroid_calculator.centroids_]

            for mini_cluster, compression_set in enumerate(compression_sets_temp):
                # Row positions of the points labelled with this mini-cluster.
                mini_cluster_indexes = [row_idx for row_idx, label in enumerate(clusterer.labels_) if label == mini_cluster]
                
                for features in matrix_to_cluster[mini_cluster_indexes, :]:
                    # TODO: missing track_ids
                    # NOTE(review): summarize()/unsummarize() are called here
                    # without a track_id although the class signature above
                    # declares it as a required parameter.
                    compression_set.summarize(features)
            
                # Create the retained set
                # Each point is temporarily removed from the summary; points
                # whose removal lowers the variance are moved to the retained set.
                base_variance = compression_set.variance()
                for features in matrix_to_cluster[mini_cluster_indexes, :]:
                    compression_set.unsummarize(features)

                    # NOTE(review): variance() returns one value per dimension,
                    # so this compares two arrays inside `if`; NumPy raises on
                    # the truth value of a multi-element array - confirm the
                    # intended reduction (all/any/sum).
                    if compression_set.variance() < base_variance:
                        retained_set.append(features)
                    
                    compression_set.summarize(features)
                
                # NOTE(review): retained_set is only cleared once per chunk, so
                # from the second mini-cluster on it also holds points retained
                # from earlier mini-clusters, which are then subtracted from
                # this compression set as well - confirm.
                for features in retained_set:
                    compression_set.unsummarize(features)