# Exercise 1

## Imports

In [1]:
import os.path
import pyspark.sql.functions as F
from pyspark import Broadcast
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, ArrayType, FloatType
from itertools import combinations
from typing import Iterable, Any, List

import pandas as pd
import numpy as np
import math

## Spark initialization

In [2]:
# Obtain the Spark session for this notebook: application name 'exercise1',
# master set to 'local[*]'.
spark = (
    SparkSession.builder
    .appName('exercise1')
    .config('spark.master', 'local[*]')
    .getOrCreate()
)

23/04/13 11:39:25 WARN Utils: Your hostname, martinho-SATELLITE-L50-B resolves to a loopback address: 127.0.1.1; using 192.168.56.108 instead (on interface wlx200db038271f)
23/04/13 11:39:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/13 11:39:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Prepare the data

In [11]:
# Load the tracks CSV without a header row; the multiline/quote/escape options
# let quoted fields span several lines and contain doubled quotes.
tracks_reader = (
    spark.read
    .option("multiline", "true")
    .option("quote", '"')
    .option("escape", '"')
)
tracks_df = tracks_reader.csv('data/tracks.csv')

# The first two rows carry the column labels. Transposing them yields, for
# every column, the pair of values it has in those rows; the pair is joined
# with "-" to form the new column name (the first column becomes 'track_id').
header_rows = tracks_df.take(2)
column_categories = list(zip(*header_rows))
columns = tracks_df.columns
renamed_columns = [
    F.col(column).alias("-".join(map(str, categories)))
    for column, categories in zip(columns[1:], column_categories[1:])
]
tracks_df = tracks_df.select(F.col(columns[0]).alias('track_id'), *renamed_columns)

# Drop rows whose track_id is missing or is the literal label "track_id"
# (presumably the header rows that were read as data — verify against the file).
tracks_df = tracks_df.filter(
    F.col("track_id").isNotNull() & (F.col("track_id") != "track_id")
)

tracks_df.show()

+--------+--------------+-------------------+-------------------+--------------+---------------+--------+--------------------+-------------+--------------------+----------+--------------------+------------+----------------+------------------------+----------------------+------------------------+--------------------+---------------+-------------------+----------------+---------+---------------+------------------+----------------+--------------------+--------------------+-----------------------+--------------------+--------------------+---------------------+---------+----------+--------------+--------------+--------------+-------------------+-------------------+--------------+---------------+---------------+------------+-----------------+--------------------+--------------+-------------------+--------------------+-------------+--------------+------------+---------------+----------+--------------------+
|track_id|album-comments| album-date_created|album-date_released|album-engineer|album-

In [12]:
# Load the features CSV without a header row.
features_df = spark.read.csv('data/features.csv')

# The first three rows carry the column labels. Transposing them yields, for
# every column, the triple of values it has in those rows; the triple is
# joined with "-" to form the new column name. Every column except the first
# is cast to float; the first column becomes 'track_id'.
header_rows = features_df.take(3)
column_categories = list(zip(*header_rows))
columns = features_df.columns
feature_columns = [
    F.col(column).cast(FloatType()).alias("-".join(map(str, categories)))
    for column, categories in zip(columns[1:], column_categories[1:])
]
features_df = features_df.select(F.col(columns[0]).alias('track_id'), *feature_columns)

# Drop the rows whose track_id is one of the label strings (presumably the
# header rows that were read as data — verify against the file). Rows with a
# null track_id are dropped as well, exactly as with chained `!=` filters.
features_df = features_df.filter(
    ~F.col("track_id").isin("feature", "statistics", "number", "track_id")
)

## Agglomerative clustering (in-memory)

In [13]:
# Tracks whose "set-subset" column equals "small".
small_tracks_df = tracks_df.filter(F.col("set-subset") == "small")

# Left-join the features onto those tracks, discard feature rows that found no
# matching track (their "set-subset" is null after the join), and keep only
# the original feature columns.
joined_features_df = features_df.join(small_tracks_df, "track_id", "left")
matched_features_df = joined_features_df.filter(F.col("set-subset").isNotNull())
small_features_df = matched_features_df.select(features_df.columns)

# Collect the feature values (without the id column) into pandas as float64.
music_features_pd = (
    small_features_df
    .drop("track_id")
    .toPandas()
    .astype(np.float64)
)

                                                                                

In [16]:
# Calculate the metrics (radius, diameter, density_r, density_d) of one cluster.
def calculate_metrics(pd_df, centroids):
    """Compute radius, diameter and the two densities for a single cluster.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        Rows of one cluster. Must contain a "cluster" column holding the
        cluster label (only the first value is read); every other column is
        treated as a numeric feature. Must have at least one row.
    centroids : sequence indexable by cluster label
        ``centroids[label]`` is the centroid of that cluster, with one value
        per feature column of ``pd_df``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with columns
        ``radius``    – largest Euclidean distance from a point to the centroid,
        ``diameter``  – largest Euclidean distance between any two points,
        ``density_r`` – number of points divided by ``radius ** 2``,
        ``density_d`` – number of points divided by ``diameter ** 2``.
        A density is ``inf`` when its denominator is zero (for example a
        cluster made of a single point).
    """
    cluster = pd_df["cluster"].values[0]

    # The label column is bookkeeping, not a feature: leaving it in would give
    # every point one more dimension than the centroid and distort distances.
    points = pd_df.drop(columns="cluster").to_numpy(dtype=np.float64)
    # Number of points is the number of ROWS (DataFrame.size would be rows * columns).
    n_points = len(points)
    centroid = np.asarray(centroids[cluster], dtype=np.float64)

    # Radius: farthest point from the centroid.
    radius = float(np.linalg.norm(points - centroid, axis=1).max())

    # Diameter: farthest pair of points. Each point is compared only with the
    # points after it, so every unordered pair is visited exactly once while
    # memory stays proportional to the cluster size.
    diameter = 0.0
    for i in range(n_points - 1):
        farthest = float(np.linalg.norm(points[i + 1:] - points[i], axis=1).max())
        if farthest > diameter:
            diameter = farthest

    density_r = n_points / radius ** 2 if radius > 0 else math.inf
    density_d = n_points / diameter ** 2 if diameter > 0 else math.inf

    return pd.DataFrame(
        {
            'radius': [radius],
            'diameter': [diameter],
            'density_r': [density_r],
            'density_d': [density_d],
        },
        columns=['radius', 'diameter', 'density_r', 'density_d'],
    )
    

In [17]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid

# One entry per tried cluster count: the per-cluster metrics frame returned by
# grouping the labelled points and applying calculate_metrics.
metrics_pd_array = []

# Try every cluster count from 8 to 16 (inclusive).
for n_clusters in range(8, 17):
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, compute_distances=True)
    clusterer.fit(music_features_pd)
    print(f"n_clusters: {n_clusters}, cluster_labels: {clusterer.labels_}")

    # Calculate the centroid of every cluster from the assigned labels.
    centroid_calculator = NearestCentroid()
    centroid_calculator.fit(music_features_pd, clusterer.labels_)

    # Attach the labels to a COPY of the features. Adding the column to
    # music_features_pd itself would feed the previous iteration's labels to
    # the next clustering run as if they were a feature.
    labelled_features_pd = music_features_pd.assign(cluster=clusterer.labels_)

    metrics_pd_array.append(
        labelled_features_pd
        .groupby("cluster")
        .apply(calculate_metrics, centroid_calculator.centroids_)
    )

n_clusters: 8, cluster_labels: [0 0 5 ... 1 3 3]


: 

: 