In [1]:
import numpy as np
import pandas as pd
import statistics as stats

from scipy.spatial.distance import pdist, squareform

Exercice 1_1_2

In [4]:
dataset = pd.read_csv('artificial_dataset.csv')

# - Raw Euclidean Distance -
# Without normalization, features with larger ranges (Sun, Pressure, ...) dominate the distance calculation.
not_normalized_data = dataset.to_numpy()
not_normalized_dst = pdist(not_normalized_data, "euclidean")
not_normalized_sf = squareform(not_normalized_dst)

# Fill diagonal with infinity to ignore self-distance (0) when finding minimum
np.fill_diagonal(not_normalized_sf, np.inf)

# Find closest and farthest pairs
raw_min_indices = np.unravel_index(np.argmin(not_normalized_sf), not_normalized_sf.shape)
raw_max_indices = np.unravel_index(np.argmax(not_normalized_sf), not_normalized_sf.shape)

print(f"Raw Data:")
print(f"Closest samples are {raw_min_indices} with distance {not_normalized_sf[raw_min_indices]:.4f}")
print(f"Farthest samples are {raw_max_indices} with distance {not_normalized_sf[raw_max_indices]:.4f}")


# - Normalized Euclidean Distance -
# With Z-score normalization, each feature contributes equally (mean=0, std=1).
normalized_df = (dataset - dataset.mean()) / dataset.std()
normalized_data = normalized_df.to_numpy()

normalized_dst = pdist(normalized_data, "euclidean")
normalized_sf = squareform(normalized_dst)

# Fill diagonal with infinity
np.fill_diagonal(normalized_sf, np.inf)

# Find closest and farthest pairs
norm_min_indices = np.unravel_index(np.argmin(normalized_sf), normalized_sf.shape)
norm_max_indices = np.unravel_index(np.argmax(normalized_sf), normalized_sf.shape)

print(f"\nNormalized Data:")
print(f"Closest samples are {norm_min_indices} with distance {normalized_sf[norm_min_indices]:.4f}")
print(f"Farthest samples are {norm_max_indices} with distance {normalized_sf[norm_max_indices]:.4f}")


Raw Data:
Closest samples are (20, 26) with distance 6.2136
Farthest samples are (0, 0) with distance inf

Normalized Data:
Closest samples are (149, 288) with distance 0.2599
Farthest samples are (0, 0) with distance inf


Discussion:
The pairs identified differ between the two metrics.
Metric 1 is heavily influenced by variables with large units/magnitudes (like 'Sun' or 'Press').
Metric 2 (Normalized) balances the contribution of all features, providing a better measure of similarity based on patterns rather than absolute magnitudes.