In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from itertools import combinations
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [3]:
# Load the synthetic dataset (path is relative to the notebook location) and preview it.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an index that
# was written to the CSV — confirm whether it should be kept as a data column.
df = pd.read_csv(filepath_or_buffer='../data/synthetic_dataset.csv')
df.head(n=5)

Unnamed: 0,X_1,Y_1,Z_1,t_1,X_2,Y_2,Z_2,t_2,X_3,Y_3,...,X_35,Y_35,Z_35,t_35,X_36,Y_36,Z_36,t_36,theta,phi
0,-25.359,5.885,-6.684,78.875263,-37.609,5.885,-6.684,74.805427,-37.609,-7.315,...,-37.255,-39.545,-7.339,162.310989,-25.359,-39.545,-7.279,166.099271,35,100
1,-25.359,5.885,-6.684,78.359649,-37.609,5.885,-6.684,73.887611,-37.609,-7.315,...,-37.255,-39.545,-7.339,161.128345,-25.359,-39.545,-7.279,165.307207,35,101
2,-25.359,5.885,-6.684,77.810918,-37.609,5.885,-6.684,72.938039,-37.609,-7.315,...,-37.255,-39.545,-7.339,159.887916,-25.359,-39.545,-7.279,164.456034,35,102
3,-25.359,5.885,-6.684,77.229236,-37.609,5.885,-6.684,71.957001,-37.609,-7.315,...,-37.255,-39.545,-7.339,158.59008,-25.359,-39.545,-7.279,163.546014,35,103
4,-25.359,5.885,-6.684,76.614781,-37.609,5.885,-6.684,70.944796,-37.609,-7.315,...,-37.255,-39.545,-7.339,157.235232,-25.359,-39.545,-7.279,162.577422,35,104


In [13]:
# Number of rows in the source dataset before expansion.
df.shape[0]

7242

In [None]:

def get_clusters(row, num_clusters=9, cluster_size=4):
    """
    Split one data row into a list of per-cluster dictionaries.

    Stations are numbered from 1, and cluster ``i`` (0-based) owns the stations
    ``i * cluster_size + 1 .. (i + 1) * cluster_size``. Each returned dictionary
    holds the ``X_<k>``, ``Y_<k>``, ``Z_<k>`` and ``t_<k>`` entries of the
    stations in that cluster, grouped by field (all X first, then Y, Z, t).

    Args:
        row: Object indexable by column name (e.g. a dict or a pandas Series).
        num_clusters: Number of clusters to extract.
        cluster_size: Number of stations per cluster.

    Returns:
        A list of ``num_clusters`` dictionaries mapping column name to value.

    Raises:
        KeyError: If ``row`` is a dict (or similar) lacking an expected column.
    """
    fields = ("X", "Y", "Z", "t")

    def _collect(first_station):
        # Station numbers belonging to the cluster that starts at first_station.
        stations = range(first_station, first_station + cluster_size)
        return {
            f"{field}_{station}": row[f"{field}_{station}"]
            for field in fields
            for station in stations
        }

    return [
        _collect(cluster_no * cluster_size + 1)
        for cluster_no in range(num_clusters)
    ]


def generate_rows_for_row(row, num_clusters=9, cluster_size=4):
    """
    Generate every possible triggered-cluster configuration for one data row.

    For each non-empty subset of the ``num_clusters`` clusters a copy of ``row``
    is produced in which:

    * every station of an active cluster gets ``A_<k> = 1`` and keeps its
      ``X_<k>``, ``Y_<k>``, ``Z_<k>``, ``t_<k>`` values untouched;
    * every station of an inactive cluster gets ``A_<k> = -1`` and its
      ``X_<k>``, ``Y_<k>``, ``Z_<k>``, ``t_<k>`` values are overwritten with -1.

    Stations are numbered from 1; cluster ``i`` (0-based) owns stations
    ``i * cluster_size + 1 .. (i + 1) * cluster_size``. All other entries of
    ``row`` are copied unchanged, and ``row`` itself is not modified.

    Args:
        row: Record supporting ``.copy()`` and item assignment by column name
            (e.g. a dict or a pandas Series).
        num_clusters: Number of clusters in the row.
        cluster_size: Number of stations per cluster.

    Returns:
        A list of ``2 ** num_clusters - 1`` rows, ordered by subset size and,
        within one size, in ``itertools.combinations`` order.

    Note:
        The presence of the coordinate/time columns in ``row`` is not checked
        here; the columns of inactive stations are simply assigned.
    """
    masked_fields = ("X", "Y", "Z", "t")

    # Station numbers per cluster do not depend on the combination, so they are
    # computed once instead of inside the innermost loop.
    stations_by_cluster = [
        range(cluster_no * cluster_size + 1, (cluster_no + 1) * cluster_size + 1)
        for cluster_no in range(num_clusters)
    ]

    new_rows = []
    for subset_size in range(1, num_clusters + 1):
        for active_clusters in combinations(range(num_clusters), subset_size):
            new_row = row.copy()

            for cluster_no, stations in enumerate(stations_by_cluster):
                is_active = cluster_no in active_clusters
                for station_index in stations:
                    if is_active:
                        new_row[f"A_{station_index}"] = 1
                    else:
                        # Inactive station: flag it and blank out its readings
                        # with the -1 sentinel.
                        new_row[f"A_{station_index}"] = -1
                        for field in masked_fields:
                            new_row[f"{field}_{station_index}"] = -1

            new_rows.append(new_row)

    return new_rows


def generate_rows_parallel(df, num_clusters=9, cluster_size=4, chunk_size=100):
    """
    Expand every row of ``df`` into all of its triggered-cluster configurations.

    The frame is walked in slices of ``chunk_size`` rows; each slice is converted
    to a list of record dicts and the records are expanded with
    ``generate_rows_for_row`` on a thread pool. Results keep the original row
    order, and a progress bar advances once per slice.

    Args:
        df: Source DataFrame, one record per row.
        num_clusters: Number of clusters per row, forwarded to
            ``generate_rows_for_row``.
        cluster_size: Number of stations per cluster, forwarded to
            ``generate_rows_for_row``.
        chunk_size: Number of source rows submitted to the pool at a time.

    Returns:
        A DataFrame built from all generated rows.

    Note:
        All generated rows are accumulated in a single list before the
        DataFrame is built, so chunking bounds the amount of work in flight,
        not the peak memory of the result.
        NOTE(review): the per-row expansion is pure-Python work, so threads
        probably give little speed-up under the GIL — measure before relying
        on the parallelism.
    """
    def expand_record(record):
        # executor.map passes only the record, so the cluster layout has to be
        # bound here; otherwise the defaults of generate_rows_for_row would
        # silently override the values given to this function.
        return generate_rows_for_row(record, num_clusters, cluster_size)

    all_new_rows = []

    with ThreadPoolExecutor() as executor:
        for start in tqdm(range(0, len(df), chunk_size), desc="Processing chunks"):
            chunk = df.iloc[start:start + chunk_size]
            records = chunk.to_dict(orient="records")
            # map() yields results in input order, so row order is preserved.
            for rows_for_record in executor.map(expand_record, records):
                all_new_rows.extend(rows_for_record)

    expanded_df = pd.DataFrame(all_new_rows)

    return expanded_df

In [30]:
# Expand the source dataset into every triggered-cluster configuration and
# persist the result next to the input file.
expanded_df = generate_rows_parallel(df)

expanded_df.to_csv('../data/expanded_synthetic_dataset.csv', index=False)

# Each f-string stays on one line: a line break inside a replacement field of a
# single-quoted f-string is a SyntaxError on Python versions before 3.12.
print(
    f"Размер нового датасета: {expanded_df.shape[0]} строк "
    f"и {expanded_df.shape[1]} столбцов"
)

Processing chunks: 100%|██████████| 73/73 [02:43<00:00,  2.24s/it]


Размер нового датасета: 3700662 строк и 182 столбцов


In [12]:
# Preview the first generated rows of the expanded dataset.
expanded_df.head(n=5)

Unnamed: 0,X_1,Y_1,Z_1,t_1,X_2,Y_2,Z_2,t_2,X_3,Y_3,...,A_27,A_28,A_29,A_30,A_31,A_32,A_33,A_34,A_35,A_36
0,-25.359,5.885,-6.684,78.875263,-37.609,5.885,-6.684,74.805427,-37.609,-7.315,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
