In [1]:
# Base names shared by the "_g" and "_r" column families; both families list
# them in exactly this order.
_BAND_STAT_NAMES = (
    "mean", "weighted_mean", "standard_deviation", "median", "amplitude",
    "beyond_1_std", "cusum", "inter_percentile_range_10", "kurtosis",
    "linear_trend", "linear_trend_sigma", "linear_trend_noise", "linear_fit_slope",
    "linear_fit_slope_sigma", "linear_fit_reduced_chi2", "magnitude_percentage_ratio_40_5",
    "magnitude_percentage_ratio_20_10", "maximum_slope", "median_absolute_deviation",
    "median_buffer_range_percentage_10", "percent_amplitude", "mean_variance",
    "anderson_darling_normal", "chi2", "skew", "stetson_K",
)

# Columns listed ahead of the two per-band families in `features`.
_LEADING_COLUMNS = [
    "ndethist", "nalerthist", "nalerthist_g", "nalerthist_r", "ra", "dec", "distnr", "magnr",
]

_band_columns_g = [f"{stat}_g" for stat in _BAND_STAT_NAMES]
_band_columns_r = [f"{stat}_r" for stat in _BAND_STAT_NAMES]

# Every column: leading columns, then the "_g" family, then the "_r" family.
features = _LEADING_COLUMNS + _band_columns_g + _band_columns_r

# Only the "_g" family.
features_g = list(_band_columns_g)

# The "_r" family, preceded by "nalerthist_r".
# NOTE(review): features_g has no matching "nalerthist_g" entry, so the two
# lists are not symmetric -- confirm whether that is intentional.
features_r = ["nalerthist_r"] + _band_columns_r

In [2]:
import pyarrow.parquet as pq
import numpy as np
from sklearn.neighbors import KDTree
import json

# Full feature set (nalerthist_g and nalerthist_r are left out: they are for filtering).
# The "_g" and "_r" families share the same base names in the same order, so
# both are generated from a single tuple.
_stat_names_all = (
    "mean", "weighted_mean", "standard_deviation", "median",
    "amplitude", "beyond_1_std", "cusum", "inter_percentile_range_10", "kurtosis",
    "linear_trend", "linear_trend_sigma", "linear_trend_noise", "linear_fit_slope",
    "linear_fit_slope_sigma", "linear_fit_reduced_chi2", "magnitude_percentage_ratio_40_5",
    "magnitude_percentage_ratio_20_10", "maximum_slope", "median_absolute_deviation",
    "median_buffer_range_percentage_10", "percent_amplitude", "mean_variance",
    "anderson_darling_normal", "chi2", "skew", "stetson_K",
)

features_all = (
    ["ndethist", "nalerthist", "ra", "dec", "distnr", "magnr"]
    + [f"{stat}_g" for stat in _stat_names_all]
    + [f"{stat}_r" for stat in _stat_names_all]
)

# Analysis parameters (previously repeated inline as bare numbers).
MIN_NALERTHIST = 3   # an object is eligible for sampling when nalerthist > MIN_NALERTHIST
SAMPLE_SIZE = 500    # number of query objects drawn for the neighbour comparison
RANDOM_SEED = 42     # seed for the global NumPy RNG, so the sample is repeatable

# Read the data
print("Читаем файл...")
table = pq.read_table('ad_features_all_gt1_14_08.parquet')

# Extract objectId, nalerthist and the requested feature columns
object_ids = table.column('objectId').to_pylist()
nalerthist = table.column('nalerthist').to_numpy()
data_full = np.column_stack([table.column(col).to_numpy() for col in features_all])

print(f"Всего объектов: {len(object_ids)}")
print(f"Размерность данных: {data_full.shape}")
print(f"Количество фичей: {len(features_all)}")

# Rows that have no NaN in any feature column
mask_no_nan = ~np.any(np.isnan(data_full), axis=1)
valid_indices = np.where(mask_no_nan)[0]
print(f"Объектов без NaN: {len(valid_indices)}")

# NaN-free subset used to build the tree; the three containers below stay
# aligned row by row (same order as valid_indices).
data_valid = data_full[valid_indices]
object_ids_valid = [object_ids[i] for i in valid_indices]
nalerthist_valid = nalerthist[valid_indices]

# Per-column standardisation for the KD-tree; the small epsilon avoids a
# division by zero for a constant column.
data_normalized = (data_valid - np.mean(data_valid, axis=0)) / (np.std(data_valid, axis=0) + 1e-10)

# Positions (within the NaN-free subset) of objects with nalerthist > MIN_NALERTHIST
mask_nalerthist_gt3 = nalerthist_valid > MIN_NALERTHIST
indices_gt3 = np.where(mask_nalerthist_gt3)[0]
print(f"Объектов с nalerthist > {MIN_NALERTHIST}: {len(indices_gt3)}")

# Fail early with an explicit message instead of numpy's generic
# "larger sample than population" error.
if len(indices_gt3) < SAMPLE_SIZE:
    raise ValueError(
        f"Cannot draw {SAMPLE_SIZE} objects without replacement: only "
        f"{len(indices_gt3)} NaN-free objects have nalerthist > {MIN_NALERTHIST}"
    )

# 1. Draw SAMPLE_SIZE random eligible objects (without replacement)
np.random.seed(RANDOM_SEED)
sample_indices = np.random.choice(indices_gt3, size=SAMPLE_SIZE, replace=False)
sample_object_ids = [object_ids_valid[i] for i in sample_indices]

print(f"\nВыбрано {SAMPLE_SIZE} случайных объектов с nalerthist > {MIN_NALERTHIST}")

# ==================== FULL FEATURE SET ====================
print("\n" + "="*50)
print("Строим KD-tree с полным набором фичей (all)...")

N_NEIGHBORS = 10  # neighbours stored per sampled object

tree_full = KDTree(data_normalized, leaf_size=40)

# Query one extra neighbour because every query point is itself a row of the tree.
# sklearn's KDTree.query returns (distances, indices).
sample_data = data_normalized[sample_indices]
distances_full, neighbor_indices_full = tree_full.query(sample_data, k=N_NEIGHBORS + 1)

neighbors_full = {}
for idx, row in zip(sample_indices, neighbor_indices_full):
    # Remove the query object by index, not by position: when several rows have
    # identical feature vectors the query object is not guaranteed to come first,
    # and slicing off element 0 would keep the object in its own neighbour list.
    other_rows = [ni for ni in row if ni != idx][:N_NEIGHBORS]
    neighbors_full[object_ids_valid[idx]] = [object_ids_valid[ni] for ni in other_rows]

# Save to JSON
out_path_full = 'neighbors_full_features_all.json'
with open(out_path_full, 'w') as f:
    json.dump(neighbors_full, f, indent=2)

print(f"Сохранено в {out_path_full}")

# ==================== WITHOUT kurtosis_g AND kurtosis_r ====================
print("\n" + "="*50)
print("Строим KD-tree без kurtosis_g и kurtosis_r...")

N_NEIGHBORS = 10  # neighbours stored per sampled object
_DROPPED_FEATURES = ["kurtosis_g", "kurtosis_r"]

# Remove kurtosis_g and kurtosis_r from the feature list
features_all_no_kurtosis = [f for f in features_all if f not in _DROPPED_FEATURES]
print(f"Фичей без kurtosis_g и kurtosis_r: {len(features_all_no_kurtosis)}")

# Reuse the already NaN-filtered matrix (same rows as valid_indices) and just
# drop the two columns, instead of re-stacking every column of the full table
# and filtering it again.
kept_columns = [j for j, f in enumerate(features_all) if f not in _DROPPED_FEATURES]
data_no_kurtosis_valid = data_valid[:, kept_columns]

# Per-column standardisation, recomputed for the reduced feature set
data_no_kurtosis_normalized = (
    (data_no_kurtosis_valid - np.mean(data_no_kurtosis_valid, axis=0))
    / (np.std(data_no_kurtosis_valid, axis=0) + 1e-10)
)

tree_no_kurtosis = KDTree(data_no_kurtosis_normalized, leaf_size=40)

# Neighbour search; one extra neighbour is requested because each query point
# is itself a row of the tree.
sample_data_no_kurtosis = data_no_kurtosis_normalized[sample_indices]
distances_no_kurtosis, neighbor_indices_no_kurtosis = tree_no_kurtosis.query(
    sample_data_no_kurtosis, k=N_NEIGHBORS + 1
)

neighbors_no_kurtosis = {}
for idx, row in zip(sample_indices, neighbor_indices_no_kurtosis):
    # Remove the query object by index, not by position: with duplicated feature
    # vectors it is not guaranteed to be the first returned neighbour.
    other_rows = [ni for ni in row if ni != idx][:N_NEIGHBORS]
    neighbors_no_kurtosis[object_ids_valid[idx]] = [object_ids_valid[ni] for ni in other_rows]

# Save to JSON
out_path_no_kurtosis = 'neighbors_no_kurtosis_all.json'
with open(out_path_no_kurtosis, 'w') as f:
    json.dump(neighbors_no_kurtosis, f, indent=2)

print(f"Сохранено в {out_path_no_kurtosis}")

# ==================== COMPARISON ====================
print("\n" + "="*50)
print("Сравнение результатов (все фичи)...")

# For every sampled object: percentage of its full-feature neighbours that are
# also neighbours when the kurtosis columns are removed.
intersection_percentages = []

for obj_id in sample_object_ids:
    reference_neighbors = neighbors_full[obj_id]
    set_full = set(reference_neighbors)
    set_no_kurtosis = set(neighbors_no_kurtosis[obj_id])

    intersection = len(set_full & set_no_kurtosis)
    # Denominator comes from the stored neighbour list rather than a literal 10,
    # so the percentage stays correct if the neighbour count is changed.
    percentage = 100.0 * intersection / len(reference_neighbors)
    intersection_percentages.append(percentage)

mean_intersection = np.mean(intersection_percentages)
std_intersection = np.std(intersection_percentages)
min_intersection = np.min(intersection_percentages)
max_intersection = np.max(intersection_percentages)

print("\nСтатистика пересечения соседей:")
print(f"  Среднее:    {mean_intersection:.2f}%")
print(f"  Std:        {std_intersection:.2f}%")
print(f"  Минимум:    {min_intersection:.2f}%")
print(f"  Максимум:   {max_intersection:.2f}%")

# Cumulative distribution: how many objects reach at least each overlap level.
# The share uses the real number of compared objects instead of assuming 500.
n_compared = len(intersection_percentages)
print("\nРаспределение:")
for threshold in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    count = sum(1 for p in intersection_percentages if p >= threshold)
    print(f"  >= {threshold:3d}%: {count:3d} объектов ({100.0 * count / n_compared:.1f}%)")

Читаем файл...
Всего объектов: 3922520
Размерность данных: (3922520, 58)
Количество фичей: 58
Объектов без NaN: 279763
Объектов с nalerthist > 3: 279763

Выбрано 500 случайных объектов с nalerthist > 3

Строим KD-tree с полным набором фичей (all)...
Сохранено в neighbors_full_features_all.json

Строим KD-tree без kurtosis_g и kurtosis_r...
Фичей без kurtosis_g и kurtosis_r: 56
Сохранено в neighbors_no_kurtosis_all.json

Сравнение результатов (все фичи)...

Статистика пересечения соседей:
  Среднее:    88.04%
  Std:        9.85%
  Минимум:    40.00%
  Максимум:   100.00%

Распределение:
  >=   0%: 500 объектов (100.0%)
  >=  10%: 500 объектов (100.0%)
  >=  20%: 500 объектов (100.0%)
  >=  30%: 500 объектов (100.0%)
  >=  40%: 500 объектов (100.0%)
  >=  50%: 499 объектов (99.8%)
  >=  60%: 498 объектов (99.6%)
  >=  70%: 489 объектов (97.8%)
  >=  80%: 452 объектов (90.4%)
  >=  90%: 341 объектов (68.2%)
  >= 100%: 123 объектов (24.6%)


In [3]:
import pyarrow.parquet as pq
import numpy as np
from sklearn.neighbors import KDTree
import json



print("=" * 60)
print("ФИЧИ _r")
print("=" * 60)

# Analysis parameters (previously repeated inline as bare numbers).
MIN_NALERTHIST = 3   # an object is eligible for sampling when nalerthist_r > MIN_NALERTHIST
SAMPLE_SIZE = 500    # number of query objects drawn for the neighbour comparison
RANDOM_SEED = 42     # seed for the global NumPy RNG, so the sample is repeatable

# Features ending in _r (nalerthist_r is left out: it is used for filtering)
features_r = [
    "mean_r", "weighted_mean_r", "standard_deviation_r", "median_r",
    "amplitude_r", "beyond_1_std_r", "cusum_r", "inter_percentile_range_10_r", "kurtosis_r",
    "linear_trend_r", "linear_trend_sigma_r", "linear_trend_noise_r", "linear_fit_slope_r",
    "linear_fit_slope_sigma_r", "linear_fit_reduced_chi2_r", "magnitude_percentage_ratio_40_5_r",
    "magnitude_percentage_ratio_20_10_r", "maximum_slope_r", "median_absolute_deviation_r",
    "median_buffer_range_percentage_10_r", "percent_amplitude_r", "mean_variance_r",
    "anderson_darling_normal_r", "chi2_r", "skew_r", "stetson_K_r"
]

# Read the data
print("Читаем файл...")
table = pq.read_table('ad_features_all_gt1_14_08.parquet')

# Extract objectId, nalerthist_r and the requested feature columns
object_ids = table.column('objectId').to_pylist()
nalerthist_r = table.column('nalerthist_r').to_numpy()
data_full_r = np.column_stack([table.column(col).to_numpy() for col in features_r])

print(f"Всего объектов: {len(object_ids)}")
print(f"Размерность данных: {data_full_r.shape}")

# Rows that have no NaN in any _r feature column
mask_no_nan_r = ~np.any(np.isnan(data_full_r), axis=1)
valid_indices_r = np.where(mask_no_nan_r)[0]
print(f"Объектов без NaN: {len(valid_indices_r)}")

# NaN-free subset used to build the tree; the three containers below stay
# aligned row by row (same order as valid_indices_r).
data_valid_r = data_full_r[valid_indices_r]
object_ids_valid_r = [object_ids[i] for i in valid_indices_r]
nalerthist_r_valid = nalerthist_r[valid_indices_r]

# Per-column standardisation for the KD-tree; the small epsilon avoids a
# division by zero for a constant column.
data_normalized_r = (data_valid_r - np.mean(data_valid_r, axis=0)) / (np.std(data_valid_r, axis=0) + 1e-10)

# Positions (within the NaN-free subset) of objects with nalerthist_r > MIN_NALERTHIST
mask_nalerthist_r_gt3 = nalerthist_r_valid > MIN_NALERTHIST
indices_r_gt3 = np.where(mask_nalerthist_r_gt3)[0]
print(f"Объектов с nalerthist_r > {MIN_NALERTHIST}: {len(indices_r_gt3)}")

# Fail early with an explicit message instead of numpy's generic
# "larger sample than population" error.
if len(indices_r_gt3) < SAMPLE_SIZE:
    raise ValueError(
        f"Cannot draw {SAMPLE_SIZE} objects without replacement: only "
        f"{len(indices_r_gt3)} NaN-free objects have nalerthist_r > {MIN_NALERTHIST}"
    )

# Draw SAMPLE_SIZE random eligible objects (without replacement)
np.random.seed(RANDOM_SEED)
sample_indices_r = np.random.choice(indices_r_gt3, size=SAMPLE_SIZE, replace=False)
sample_object_ids_r = [object_ids_valid_r[i] for i in sample_indices_r]

print(f"\nВыбрано {SAMPLE_SIZE} случайных объектов с nalerthist_r > {MIN_NALERTHIST}")

# ==================== FULL _r FEATURE SET ====================
print("\n" + "-"*50)
print("Строим KD-tree с полным набором фичей (_r)...")

N_NEIGHBORS = 10  # neighbours stored per sampled object

tree_full_r = KDTree(data_normalized_r, leaf_size=40)

# Query one extra neighbour because every query point is itself a row of the tree.
sample_data_r = data_normalized_r[sample_indices_r]
distances_full_r, neighbor_indices_full_r = tree_full_r.query(sample_data_r, k=N_NEIGHBORS + 1)

neighbors_full_r = {}
for idx, row in zip(sample_indices_r, neighbor_indices_full_r):
    # Remove the query object by index, not by position: when several rows have
    # identical feature vectors the query object is not guaranteed to come first,
    # and slicing off element 0 would keep the object in its own neighbour list.
    other_rows = [ni for ni in row if ni != idx][:N_NEIGHBORS]
    neighbors_full_r[object_ids_valid_r[idx]] = [object_ids_valid_r[ni] for ni in other_rows]

out_path_full_r = 'neighbors_full_features_r.json'
with open(out_path_full_r, 'w') as f:
    json.dump(neighbors_full_r, f, indent=2)

print(f"Сохранено в {out_path_full_r}")

# ==================== WITHOUT kurtosis_r ====================
print("\n" + "-"*50)
print("Строим KD-tree без kurtosis_r...")

N_NEIGHBORS = 10  # neighbours stored per sampled object

features_r_no_kurtosis = [f for f in features_r if f != "kurtosis_r"]
print(f"Фичей без kurtosis_r: {len(features_r_no_kurtosis)}")

# Reuse the already NaN-filtered matrix (same rows as valid_indices_r) and just
# drop the kurtosis column, instead of re-stacking every column of the full
# table and filtering it again.
kept_columns_r = [j for j, f in enumerate(features_r) if f != "kurtosis_r"]
data_no_kurtosis_r_valid = data_valid_r[:, kept_columns_r]

# Per-column standardisation, recomputed for the reduced feature set
data_no_kurtosis_r_normalized = (
    (data_no_kurtosis_r_valid - np.mean(data_no_kurtosis_r_valid, axis=0))
    / (np.std(data_no_kurtosis_r_valid, axis=0) + 1e-10)
)

tree_no_kurtosis_r = KDTree(data_no_kurtosis_r_normalized, leaf_size=40)

# One extra neighbour is requested because each query point is itself a row of the tree.
sample_data_no_kurtosis_r = data_no_kurtosis_r_normalized[sample_indices_r]
distances_no_kurtosis_r, neighbor_indices_no_kurtosis_r = tree_no_kurtosis_r.query(
    sample_data_no_kurtosis_r, k=N_NEIGHBORS + 1
)

neighbors_no_kurtosis_r = {}
for idx, row in zip(sample_indices_r, neighbor_indices_no_kurtosis_r):
    # Remove the query object by index, not by position: with duplicated feature
    # vectors it is not guaranteed to be the first returned neighbour.
    other_rows = [ni for ni in row if ni != idx][:N_NEIGHBORS]
    neighbors_no_kurtosis_r[object_ids_valid_r[idx]] = [object_ids_valid_r[ni] for ni in other_rows]

out_path_no_kurtosis_r = 'neighbors_no_kurtosis_r.json'
with open(out_path_no_kurtosis_r, 'w') as f:
    json.dump(neighbors_no_kurtosis_r, f, indent=2)

print(f"Сохранено в {out_path_no_kurtosis_r}")

# ==================== COMPARISON _r ====================
print("\n" + "-"*50)
print("Сравнение результатов (фичи _r)...")

# For every sampled object: percentage of its full-feature neighbours that are
# also neighbours when kurtosis_r is removed.
intersection_percentages_r = []

for obj_id in sample_object_ids_r:
    reference_neighbors = neighbors_full_r[obj_id]
    set_full = set(reference_neighbors)
    set_no_kurtosis = set(neighbors_no_kurtosis_r[obj_id])

    intersection = len(set_full & set_no_kurtosis)
    # Denominator comes from the stored neighbour list rather than a literal 10,
    # so the percentage stays correct if the neighbour count is changed.
    percentage = 100.0 * intersection / len(reference_neighbors)
    intersection_percentages_r.append(percentage)

mean_intersection_r = np.mean(intersection_percentages_r)
std_intersection_r = np.std(intersection_percentages_r)
min_intersection_r = np.min(intersection_percentages_r)
max_intersection_r = np.max(intersection_percentages_r)

print("\nСтатистика пересечения соседей (_r):")
print(f"  Среднее:    {mean_intersection_r:.2f}%")
print(f"  Std:        {std_intersection_r:.2f}%")
print(f"  Минимум:    {min_intersection_r:.2f}%")
print(f"  Максимум:   {max_intersection_r:.2f}%")

# Cumulative distribution: how many objects reach at least each overlap level.
# The share uses the real number of compared objects instead of assuming 500.
n_compared_r = len(intersection_percentages_r)
print("\nРаспределение (_r):")
for threshold in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    count = sum(1 for p in intersection_percentages_r if p >= threshold)
    print(f"  >= {threshold:3d}%: {count:3d} объектов ({100.0 * count / n_compared_r:.1f}%)")


# FEATURES ENDING IN _g

print("\n" + "=" * 60)
print("ФИЧИ _g")
print("=" * 60)

# Analysis parameters (previously repeated inline as bare numbers).
MIN_NALERTHIST = 3   # an object is eligible for sampling when nalerthist_g > MIN_NALERTHIST
SAMPLE_SIZE = 500    # number of query objects drawn for the neighbour comparison
RANDOM_SEED = 42     # seed for the global NumPy RNG, so the sample is repeatable

# Features ending in _g (nalerthist_g is left out: it is used for filtering)
features_g = [
    "mean_g", "weighted_mean_g", "standard_deviation_g", "median_g",
    "amplitude_g", "beyond_1_std_g", "cusum_g", "inter_percentile_range_10_g", "kurtosis_g",
    "linear_trend_g", "linear_trend_sigma_g", "linear_trend_noise_g", "linear_fit_slope_g",
    "linear_fit_slope_sigma_g", "linear_fit_reduced_chi2_g", "magnitude_percentage_ratio_40_5_g",
    "magnitude_percentage_ratio_20_10_g", "maximum_slope_g", "median_absolute_deviation_g",
    "median_buffer_range_percentage_10_g", "percent_amplitude_g", "mean_variance_g",
    "anderson_darling_normal_g", "chi2_g", "skew_g", "stetson_K_g"
]

# Extract nalerthist_g and the requested feature columns.
# `table` and `object_ids` are the ones loaded earlier in this cell for the _r run.
nalerthist_g = table.column('nalerthist_g').to_numpy()
data_full_g = np.column_stack([table.column(col).to_numpy() for col in features_g])

print(f"Размерность данных: {data_full_g.shape}")

# Rows that have no NaN in any _g feature column
mask_no_nan_g = ~np.any(np.isnan(data_full_g), axis=1)
valid_indices_g = np.where(mask_no_nan_g)[0]
print(f"Объектов без NaN: {len(valid_indices_g)}")

# NaN-free subset used to build the tree; the three containers below stay
# aligned row by row (same order as valid_indices_g).
data_valid_g = data_full_g[valid_indices_g]
object_ids_valid_g = [object_ids[i] for i in valid_indices_g]
nalerthist_g_valid = nalerthist_g[valid_indices_g]

# Per-column standardisation for the KD-tree; the small epsilon avoids a
# division by zero for a constant column.
data_normalized_g = (data_valid_g - np.mean(data_valid_g, axis=0)) / (np.std(data_valid_g, axis=0) + 1e-10)

# Positions (within the NaN-free subset) of objects with nalerthist_g > MIN_NALERTHIST
mask_nalerthist_g_gt3 = nalerthist_g_valid > MIN_NALERTHIST
indices_g_gt3 = np.where(mask_nalerthist_g_gt3)[0]
print(f"Объектов с nalerthist_g > {MIN_NALERTHIST}: {len(indices_g_gt3)}")

# Fail early with an explicit message instead of numpy's generic
# "larger sample than population" error.
if len(indices_g_gt3) < SAMPLE_SIZE:
    raise ValueError(
        f"Cannot draw {SAMPLE_SIZE} objects without replacement: only "
        f"{len(indices_g_gt3)} NaN-free objects have nalerthist_g > {MIN_NALERTHIST}"
    )

# Draw SAMPLE_SIZE random eligible objects (without replacement)
np.random.seed(RANDOM_SEED)
sample_indices_g = np.random.choice(indices_g_gt3, size=SAMPLE_SIZE, replace=False)
sample_object_ids_g = [object_ids_valid_g[i] for i in sample_indices_g]

print(f"\nВыбрано {SAMPLE_SIZE} случайных объектов с nalerthist_g > {MIN_NALERTHIST}")


# ==================== FULL _g FEATURE SET ====================
print("\n" + "-"*50)
print("Строим KD-tree с полным набором фичей (_g)...")

N_NEIGHBORS = 10  # neighbours stored per sampled object

tree_full_g = KDTree(data_normalized_g, leaf_size=40)

# Query one extra neighbour because every query point is itself a row of the tree.
sample_data_g = data_normalized_g[sample_indices_g]
distances_full_g, neighbor_indices_full_g = tree_full_g.query(sample_data_g, k=N_NEIGHBORS + 1)

neighbors_full_g = {}
for idx, row in zip(sample_indices_g, neighbor_indices_full_g):
    # Remove the query object by index, not by position: when several rows have
    # identical feature vectors the query object is not guaranteed to come first,
    # and slicing off element 0 would keep the object in its own neighbour list.
    other_rows = [ni for ni in row if ni != idx][:N_NEIGHBORS]
    neighbors_full_g[object_ids_valid_g[idx]] = [object_ids_valid_g[ni] for ni in other_rows]

out_path_full_g = 'neighbors_full_features_g.json'
with open(out_path_full_g, 'w') as f:
    json.dump(neighbors_full_g, f, indent=2)

print(f"Сохранено в {out_path_full_g}")

# ==================== WITHOUT kurtosis_g ====================
print("\n" + "-"*50)
print("Строим KD-tree без kurtosis_g...")

N_NEIGHBORS = 10  # neighbours stored per sampled object

features_g_no_kurtosis = [f for f in features_g if f != "kurtosis_g"]
print(f"Фичей без kurtosis_g: {len(features_g_no_kurtosis)}")

# Reuse the already NaN-filtered matrix (same rows as valid_indices_g) and just
# drop the kurtosis column, instead of re-stacking every column of the full
# table and filtering it again.
kept_columns_g = [j for j, f in enumerate(features_g) if f != "kurtosis_g"]
data_no_kurtosis_g_valid = data_valid_g[:, kept_columns_g]

# Per-column standardisation, recomputed for the reduced feature set
data_no_kurtosis_g_normalized = (
    (data_no_kurtosis_g_valid - np.mean(data_no_kurtosis_g_valid, axis=0))
    / (np.std(data_no_kurtosis_g_valid, axis=0) + 1e-10)
)

tree_no_kurtosis_g = KDTree(data_no_kurtosis_g_normalized, leaf_size=40)

# One extra neighbour is requested because each query point is itself a row of the tree.
sample_data_no_kurtosis_g = data_no_kurtosis_g_normalized[sample_indices_g]
distances_no_kurtosis_g, neighbor_indices_no_kurtosis_g = tree_no_kurtosis_g.query(
    sample_data_no_kurtosis_g, k=N_NEIGHBORS + 1
)

neighbors_no_kurtosis_g = {}
for idx, row in zip(sample_indices_g, neighbor_indices_no_kurtosis_g):
    # Remove the query object by index, not by position: with duplicated feature
    # vectors it is not guaranteed to be the first returned neighbour.
    other_rows = [ni for ni in row if ni != idx][:N_NEIGHBORS]
    neighbors_no_kurtosis_g[object_ids_valid_g[idx]] = [object_ids_valid_g[ni] for ni in other_rows]

out_path_no_kurtosis_g = 'neighbors_no_kurtosis_g.json'
with open(out_path_no_kurtosis_g, 'w') as f:
    json.dump(neighbors_no_kurtosis_g, f, indent=2)

print(f"Сохранено в {out_path_no_kurtosis_g}")

# ==================== COMPARISON _g ====================
print("\n" + "-"*50)
print("Сравнение результатов (фичи _g)...")

# For every sampled object: percentage of its full-feature neighbours that are
# also neighbours when kurtosis_g is removed.
intersection_percentages_g = []

for obj_id in sample_object_ids_g:
    reference_neighbors = neighbors_full_g[obj_id]
    set_full = set(reference_neighbors)
    set_no_kurtosis = set(neighbors_no_kurtosis_g[obj_id])

    intersection = len(set_full & set_no_kurtosis)
    # Denominator comes from the stored neighbour list rather than a literal 10,
    # so the percentage stays correct if the neighbour count is changed.
    percentage = 100.0 * intersection / len(reference_neighbors)
    intersection_percentages_g.append(percentage)

mean_intersection_g = np.mean(intersection_percentages_g)
std_intersection_g = np.std(intersection_percentages_g)
min_intersection_g = np.min(intersection_percentages_g)
max_intersection_g = np.max(intersection_percentages_g)

print("\nСтатистика пересечения соседей (_g):")
print(f"  Среднее:    {mean_intersection_g:.2f}%")
print(f"  Std:        {std_intersection_g:.2f}%")
print(f"  Минимум:    {min_intersection_g:.2f}%")
print(f"  Максимум:   {max_intersection_g:.2f}%")

# Cumulative distribution: how many objects reach at least each overlap level.
# The share uses the real number of compared objects instead of assuming 500.
n_compared_g = len(intersection_percentages_g)
print("\nРаспределение (_g):")
for threshold in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    count = sum(1 for p in intersection_percentages_g if p >= threshold)
    print(f"  >= {threshold:3d}%: {count:3d} объектов ({100.0 * count / n_compared_g:.1f}%)")




# Final side-by-side summary of the two per-band comparisons.
summary_rule = "=" * 60
print("\n" + summary_rule)
print("ИТОГОВОЕ СРАВНЕНИЕ")
print(summary_rule)

# One row per band: (label, mean overlap, std of overlap). The first label
# carries the leading newline that separates the rows from the banner.
summary_rows = (
    ("\nФичи _r", mean_intersection_r, std_intersection_r),
    ("Фичи _g", mean_intersection_g, std_intersection_g),
)
for row_label, row_mean, row_std in summary_rows:
    print(f"{row_label}: среднее пересечение = {row_mean:.2f}% (std = {row_std:.2f}%)")

ФИЧИ _r
Читаем файл...
Всего объектов: 3922520
Размерность данных: (3922520, 26)
Объектов без NaN: 700901
Объектов с nalerthist_r > 3: 700901

Выбрано 500 случайных объектов с nalerthist_r > 3

--------------------------------------------------
Строим KD-tree с полным набором фичей (_r)...
Сохранено в neighbors_full_features_r.json

--------------------------------------------------
Строим KD-tree без kurtosis_r...
Фичей без kurtosis_r: 25
Сохранено в neighbors_no_kurtosis_r.json

--------------------------------------------------
Сравнение результатов (фичи _r)...

Статистика пересечения соседей (_r):
  Среднее:    90.66%
  Std:        10.61%
  Минимум:    40.00%
  Максимум:   100.00%

Распределение (_r):
  >=   0%: 500 объектов (100.0%)
  >=  10%: 500 объектов (100.0%)
  >=  20%: 500 объектов (100.0%)
  >=  30%: 500 объектов (100.0%)
  >=  40%: 500 объектов (100.0%)
  >=  50%: 497 объектов (99.4%)
  >=  60%: 490 объектов (98.0%)
  >=  70%: 484 объектов (96.8%)
  >=  80%: 470 объектов