# Exercise 1

## Imports

In [225]:
import os.path
from pyspark import Broadcast
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, ArrayType
from itertools import combinations
from typing import Iterable, Any, List

import pandas as pd
import numpy as np

## Spark initialization

In [226]:
spark = SparkSession.builder \
    .appName('exercise1') \
    .config('spark.master', 'local[*]') \
    .getOrCreate()

## Prepare the data

In [227]:
tracks_df = (spark.read
    .option("multiline", "true")
    .option("quote", '"')
    .option("escape", '"')
    .csv('data/tracks.csv')
)
# rename columns with row values from first row to second row
tmp = tracks_df.take(2)
columns = tracks_df.columns
for i in range(1,len(tmp[0])):
    tracks_df = tracks_df.withColumnRenamed(columns[i], str(tmp[0][i]) + "-" + str(tmp[1][i]))

tracks_df = (tracks_df.withColumnRenamed(columns[0], "track_id")
    .filter(col("track_id").isNotNull()) 
    .filter(col("track_id") != "track_id")
)

tracks_df.show()

+--------+--------------+-------------------+-------------------+--------------+---------------+--------+--------------------+-------------+--------------------+----------+--------------------+------------+----------------+------------------------+----------------------+------------------------+--------------------+---------------+-------------------+----------------+---------+---------------+------------------+----------------+--------------------+--------------------+-----------------------+--------------------+--------------------+---------------------+---------+----------+--------------+--------------+--------------+-------------------+-------------------+--------------+---------------+---------------+------------+-----------------+--------------------+--------------+-------------------+--------------------+-------------+--------------+------------+---------------+----------+--------------------+
|track_id|album-comments| album-date_created|album-date_released|album-engineer|album-

In [228]:
features_df = (spark.read
    .csv('data/features.csv')
)
# rename columns with row values from first row to second row
tmp = features_df.take(3)
columns = features_df.columns
for i in range(1,len(tmp[0])):
    features_df = features_df.withColumnRenamed(columns[i], str(tmp[0][i]) + "-" + str(tmp[1][i]) + "-" + str(tmp[2][i]))

features_df = (features_df.withColumnRenamed(columns[0], "track_id")
    .filter(col("track_id") != "feature")
    .filter(col("track_id") != "statistics")
    .filter(col("track_id") != "number")
    .filter(col("track_id") != "track_id")
)

features_df.show()

+--------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--------------

## Agglomerative clustering (in-memory)

In [229]:
small_tracks_df = tracks_df.filter(col("set-subset") == "small")
small_features_df = (features_df
    .join(small_tracks_df, "track_id", "left")
    .filter(col("set-subset").isNotNull())
    .select(features_df.columns)
)

music_features_pd = (small_features_df
    .drop("track_id")
    .toPandas()
)
music_features_pd = music_features_pd.astype(np.float64)

                                                                                

In [248]:
# calculate the metrics (radius, diameter, density_r, density_d) for each cluster
#!!! IMCOMPLETE, DIAMETER AND DENSITY_D ARE NOT CALCULATED !!!
def calculate_metrics(df,centroids):
    
    cluster = df["cluster"].values[0]
    
    metrics = pd.DataFrame({'radius': [0], 'diameter': [0],'density_r': [0],'density_d': [0]},columns=['radius', 'diameter','density_r','density_d'])

    centroid = centroids[cluster]

    for index, row in df.iterrows():

        #row to array 
        a = row.to_numpy()

        radius = np.linalg.norm(a-centroid)

        if radius > metrics.loc[0,'radius']:
            metrics.loc[0,'radius'] = radius

    # calculater density with radius
    metrics.loc[0,'density_r'] = df.count()[0] / metrics.loc[0,'radius']**2

    return metrics
    

In [249]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid

metrics_df_array = []

# i = 8 until 16
for i in range(8, 17):
    n_clusters = i
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, compute_distances=True)
    clusterer.fit(music_features_pd)
    print(f"n_clusters: {n_clusters}, cluster_labels: {clusterer.labels_}")

    # calculate centroids

    centroid_calculator = NearestCentroid()

    centroid_calculator.fit(music_features_pd, clusterer.labels_)

    #len(centroid_calculator.centroids_)

    tmp = music_features_pd
    tmp["cluster"] = clusterer.labels_
    
    metrics_df_array.append(tmp.groupby("cluster").apply(calculate_metrics,centroid_calculator.centroids_))

n_clusters: 8, cluster_labels: [0 0 5 ... 1 3 3]
n_clusters: 9, cluster_labels: [8 8 5 ... 1 3 3]
n_clusters: 10, cluster_labels: [3 3 5 ... 0 1 1]
n_clusters: 11, cluster_labels: [3 3 2 ... 5 1 1]
n_clusters: 12, cluster_labels: [3 3 2 ... 5 0 0]
n_clusters: 13, cluster_labels: [ 1  1  2 ...  5  3 12]
n_clusters: 14, cluster_labels: [ 0  0  2 ...  5  1 12]
n_clusters: 15, cluster_labels: [14 14  2 ...  5  0 12]
n_clusters: 16, cluster_labels: [14 14  2 ...  5 15 12]


In [250]:
metrics_df_array[8]

Unnamed: 0_level_0,Unnamed: 1_level_0,radius,diameter,density_r,density_d
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,3160.600124,0,4.4e-05,0
1,0,4022.787977,0,1.2e-05,0
2,0,2528.976526,0,0.000252,0
3,0,6675.458697,0,1e-06,0
4,0,3246.234071,0,2.6e-05,0
5,0,3313.100353,0,0.000117,0
6,0,3828.769418,0,3.2e-05,0
7,0,2905.077001,0,3.3e-05,0
8,0,2322.493093,0,0.000115,0
9,0,2315.340029,0,6.2e-05,0
