In [1]:
import pandas as pd
import numpy as np
import random
from IPython.display import display

import matplotlib.pyplot as plt
import itertools
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from torch.utils.data import DataLoader, TensorDataset

In [7]:
# Load the synthetic dataset from the data directory (path is relative to the
# notebook's working directory).
synthetic_df = pd.read_csv('../data/synthetic_dataset.csv')
# Preview the first three rows as this cell's displayed output.
synthetic_df.head(3)

Unnamed: 0,X_1,Y_1,Z_1,t_1,X_2,Y_2,Z_2,t_2,X_3,Y_3,...,X_35,Y_35,Z_35,t_35,X_36,Y_36,Z_36,t_36,theta,phi
0,-25.359,5.885,-6.684,78.875263,-37.609,5.885,-6.684,74.805427,-37.609,-7.315,...,-37.255,-39.545,-7.339,162.310989,-25.359,-39.545,-7.279,166.099271,35,100
1,-25.359,5.885,-6.684,78.359649,-37.609,5.885,-6.684,73.887611,-37.609,-7.315,...,-37.255,-39.545,-7.339,161.128345,-25.359,-39.545,-7.279,165.307207,35,101
2,-25.359,5.885,-6.684,77.810918,-37.609,5.885,-6.684,72.938039,-37.609,-7.315,...,-37.255,-39.545,-7.339,159.887916,-25.359,-39.545,-7.279,164.456034,35,102


In [None]:
def create_missing_data(data, num_clusters=9, cluster_size=4):
    """Simulate missing detector readings by masking random stations.

    Stations are numbered ``1 .. num_clusters * cluster_size`` and grouped
    into consecutive clusters of ``cluster_size`` stations each. For every
    input row a random number of clusters (1-7, drawn with ``cluster_probs``)
    stays active; each active cluster additionally loses one randomly chosen
    station with probability 0.3. All remaining stations are masked.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``X_i``, ``Y_i``, ``Z_i`` and ``t_i`` for every station
        ``i`` plus ``theta`` and ``phi``. Rows are read by position, so any
        index is accepted. The caller's frame is not modified.
    num_clusters : int, default 9
        Number of station clusters. Up to 7 distinct clusters can be drawn
        for a row, so values below 7 may make ``np.random.choice`` raise
        ``ValueError`` (cannot sample more clusters than exist).
    cluster_size : int, default 4
        Number of stations per cluster.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed ``0 .. n-1``, with float64 columns
        ``A_i, X_i, Y_i, Z_i, t_i`` for each station followed by ``theta``
        and ``phi``. For an active station ``A_i`` is 1 and the other four
        hold the input values; for a masked station all five are -1.

    Notes
    -----
    NumPy's global RNG is reseeded with 42 on every call: repeated calls
    yield the same masks, and later draws from ``np.random`` are affected.
    """
    np.random.seed(42)  # For reproducibility (reseeds the global RNG)

    # Re-index positionally so `.at[idx, ...]` below is valid even when the
    # caller's frame has a non-default index (e.g. after filtering/sampling).
    # reset_index returns a new frame; the caller's object is left untouched.
    data = data.reset_index(drop=True)
    num_samples = len(data)
    num_stations = num_clusters * cluster_size

    # Distribution of how many clusters are active in a single row.
    cluster_probs = [0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1]
    cluster_counts = [1, 2, 3, 4, 5, 6, 7]

    new_columns = []
    for i in range(1, num_stations + 1):
        new_columns.extend([f"A_{i}", f"X_{i}", f"Y_{i}", f"Z_{i}", f"t_{i}"])
    new_columns.extend(["theta", "phi"])

    # Sentinel block for a masked station: A, X, Y, Z, t are all -1.
    masked_block = [-1] * 5

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; enlarging a DataFrame with `.loc[idx] = row` re-allocates the
    # whole frame on every row.
    rows = []
    for idx in range(num_samples):
        # NOTE: the order of the random draws is kept exactly as before so the
        # generated masks stay identical for a given input.
        num_active_clusters = np.random.choice(cluster_counts, p=cluster_probs)
        active_clusters = np.random.choice(
            range(1, num_clusters + 1), size=num_active_clusters, replace=False
        )

        active_stations = set()  # set -> O(1) membership checks below
        for cluster in active_clusters:
            stations = list(
                range((cluster - 1) * cluster_size + 1,
                      cluster * cluster_size + 1)
            )

            # With probability 0.3 one station of an active cluster is lost.
            if np.random.rand() < 0.3:
                missed_station = np.random.choice(stations)
                stations.remove(missed_station)

            active_stations.update(stations)

        row = []
        for station in range(1, num_stations + 1):
            if station in active_stations:
                row.append(1)  # A_station: reading is present
                row.append(data.at[idx, f"X_{station}"])
                row.append(data.at[idx, f"Y_{station}"])
                row.append(data.at[idx, f"Z_{station}"])
                row.append(data.at[idx, f"t_{station}"])
            else:
                row.extend(masked_block)

        row.append(data.at[idx, 'theta'])
        row.append(data.at[idx, 'phi'])

        rows.append(row)

    return pd.DataFrame(rows, columns=new_columns, dtype=float)

In [9]:
# Build the masked dataset for 9 clusters of 4 stations each. A copy of
# synthetic_df is passed in rather than the loaded frame itself.
synthetic_with_missing = create_missing_data(
    synthetic_df.copy(), num_clusters=9, cluster_size=4)

# Save the result next to the source data (index column omitted), then
# preview the first rows as this cell's displayed output.
synthetic_with_missing.to_csv(
    '../data/synthetic_with_missing.csv', index=False)
synthetic_with_missing.head()

Unnamed: 0,A_1,X_1,Y_1,Z_1,t_1,A_2,X_2,Y_2,Z_2,t_2,...,Y_35,Z_35,t_35,A_36,X_36,Y_36,Z_36,t_36,theta,phi
0,1.0,-25.359,5.885,-6.684,78.875263,1.0,-37.609,5.885,-6.684,74.805427,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,35.0,100.0
1,1.0,-25.359,5.885,-6.684,78.359649,1.0,-37.609,5.885,-6.684,73.887611,...,-1.0,-1.0,-1.0,1.0,-25.359,-39.545,-7.279,165.307207,35.0,101.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,35.0,102.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,35.0,103.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,35.0,104.0
