In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.neighbors import KDTree
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler,RobustScaler
from torch.utils.data import DataLoader, TensorDataset
import copy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
import os
import plotly.graph_objs as go

Matplotlib created a temporary cache directory at /localscratch-ssd/301700/matplotlib-0kidihi0 because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:

import itertools

def get_periodic_coordinates(coord, size):
    """
    Replicate a set of particles into the 27 periodic images of a cubic domain.

    Every point is shifted by each combination of (-1, 0, 1) * size along the
    x, y and z axes. The images are stacked in the order produced by
    itertools.product([-1, 0, 1], repeat=3), so the unshifted copy is image
    number 13 (0-based) of the 27.

    Parameters:
        coord (pandas dataframe): One row per particle; must contain the columns
            "x", "y" and "z". All other columns are carried along unchanged.
        size (int or float): The size of the cubic domain along each axis.
    Returns:
        tuple: (coordinates, stacked_df) where
            coordinates (numpy array): shape (27 * len(coord), 3), the shifted
                x, y, z of every periodic image.
            stacked_df (pandas dataframe): 27 stacked copies of coord with a
                fresh 0..N-1 index whose "x", "y", "z" columns hold the same
                shifted coordinates, row for row.
    """
    ### Keep copy of original dataframe and copy for each periodic bc shift ###
    n_images = 27
    stacked_df = pd.concat([coord.copy() for _ in range(n_images)], axis=0)
    stacked_df = stacked_df.reset_index(drop=True)

    ### Get coordinates ###
    if isinstance(coord, pd.DataFrame):
        coord = coord[["x","y","z"]].values

    # Only the first three columns are treated as x, y, z
    points = np.asarray(coord).reshape(-1, np.shape(coord)[-1])[:, :3]

    # Generate all combinations of displacements (-1, 0, 1) along each axis
    displacements = np.array(list(itertools.product([-1, 0, 1], repeat=3)))

    # Broadcast (27, 1, 3) shifts against (1, N, 3) points, then flatten image by image.
    # This replaces the nested Python loops and also works for an empty frame.
    tp_coordinates = (points[None, :, :] + displacements[:, None, :] * size).reshape(-1, 3)

    stacked_df[["x","y","z"]] = tp_coordinates

    return tp_coordinates, stacked_df

In [3]:
def group_time_series_data(time_series_data):
    """
    Split the time-series table into one dataframe per (case_ID, time) pair.

    Parameters:
       time_series_data (pandas dataframe) : must contain the columns
           'case_ID' and 'time'
    Returns:
        list: the sub-dataframes, in the order pandas groupby yields them
              (sorted by case_ID, then time)
    """
    grouped = time_series_data.groupby(['case_ID', 'time'])

    # The group key is not needed, only the rows belonging to it
    return [frame for _, frame in grouped]

In [4]:
def generate_nearest_neighbor_data(time_series_data, box_size=5, num_neighbors=16):

    """
    Build the nearest-neighbour feature table for every particle at every (case_ID, time).

    For each (case_ID, time) group the particles are replicated into their periodic
    images, a KD-tree is built on the images, and the num_neighbors closest points are
    looked up for every particle of the group. Each query particle is itself in the
    tree, so it is expected to be its own first neighbour.

    Parameters:
       time_series_data (pandas dataframe) : must contain case_ID, time, x, y, z,
           local_Re, vpx, vpy, vpz, Density_ratio, glb_phi, glb_Re and Drag
       box_size (int or float) : edge length of the periodic cubic domain
       num_neighbors (int) : number of points returned per particle (query point included)
    Returns:
        numpy array: one row per particle, groups in groupby order. Each row holds
            (x, y, z, local_Re, vpx, vpy, vpz) for each of the num_neighbors points,
            followed by Density_ratio, glb_phi, glb_Re and Drag of the query particle.
    """

    pd_list = group_time_series_data(time_series_data)

    nearest_neighbor_data = list()
    nearest_neighbor_data_extra = list()
    scalar_data = list()

    ### Loop over different groups ###

    for i, group in enumerate(pd_list):

        print("Currently on case_time subgroup : ",str(i+1))
        tp_particles,stacked_df = get_periodic_coordinates(group,box_size)
        tree = cKDTree(tp_particles)

        ### One batched query for all particles of the group (row j <-> particle j) ###
        _, idx = tree.query(group[["x","y","z"]].values, num_neighbors)
        neighbor_coordinates = tp_particles[idx]
        nearest_neighbor_data.append(neighbor_coordinates)

        ### merging nodal data to the coordinates ###
        merged = merge_columns_to_pandas_list(neighbor_coordinates
                                              ,["local_Re","vpx","vpy","vpz"],stacked_df)
        nearest_neighbor_data_extra.append(np.stack(merged))

        ### Getting the scalar data ###
        scalar_data.append( group[["Density_ratio","glb_phi","glb_Re","Drag"]].values )
        clear_output(wait=True)

    ### Populate graph and scalar lists ###
    # Concatenating along the particle axis (instead of stacking groups and reshaping)
    # gives the same row order and also works when groups hold different particle counts.
    nearest_neighbor_data = np.concatenate(nearest_neighbor_data, axis=0)
    nearest_neighbor_data = nearest_neighbor_data.reshape(nearest_neighbor_data.shape[0], -1)

    nearest_neighbor_data_extra = np.concatenate(nearest_neighbor_data_extra, axis=0)
    nearest_neighbor_data_extra = nearest_neighbor_data_extra.reshape(nearest_neighbor_data_extra.shape[0], -1)

    scalar_data = np.concatenate(scalar_data, axis=0)

    ### change code if you want to return nearest_neighbor_data or extra ###
    return np.concatenate( (nearest_neighbor_data_extra,scalar_data) ,axis=1)

In [5]:
def merge_columns_to_pandas_list(nearest_neighbor_data,variable_list,master_dataframe):

    """
    Attach extra columns to every block of neighbour coordinates.

    Each entry of nearest_neighbor_data (rows of x, y, z) is turned into a dataframe
    and inner-merged with master_dataframe on the exact (x, y, z) values; the columns
    named in variable_list are then placed next to the coordinates.

    Returns:
        list: one dataframe per entry with the columns x, y, z + variable_list
    """

    coordinate_columns = ["x","y","z"]
    joined = []

    for neighbor_block in nearest_neighbor_data:

        coordinates = pd.DataFrame(neighbor_block, columns=coordinate_columns)
        looked_up = pd.merge(coordinates, master_dataframe, how="inner", on=['x','y','z'], sort=False)
        extra_columns = looked_up[variable_list]

        # Side-by-side concat: aligned on the 0..k-1 row labels of both frames
        joined.append(pd.concat([coordinates, extra_columns], axis=1))

    return joined

In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def scale_xyz_and_other_columns(train_df, test_df, coord_min=-5, coord_max=10):
    """
    Scale a train/test pair of feature tables; all statistics come from the train table.

    Three groups of columns are handled differently:
      * columns starting with 'x_', 'y_', 'z_' are mapped with the fixed range
        (value - coord_min) / (coord_max - coord_min);
      * columns starting with 'local_Re_', 'vpx_', 'vpy_', 'vpz_' are min-max scaled
        with one global min/max per prefix, taken over all train columns of that prefix;
      * every remaining column is scaled with a RobustScaler fitted on the train column.

    Parameters:
        train_df (pd.DataFrame): training table (string column names).
        test_df (pd.DataFrame): test table with the same columns.
        coord_min, coord_max (float): fixed range used for the coordinate columns.

    Returns:
        tuple: (scaled train copy, scaled test copy, scaler). The inputs are not
        modified. The same RobustScaler object is re-fitted for each remaining column,
        so the returned scaler carries the fit of the LAST such column only.
    """
    scaler = RobustScaler()
    train_df_copy, test_df_copy = train_df.copy(), test_df.copy()

    coordinate_prefixes = ('x_', 'y_', 'z_')
    field_prefixes = ('local_Re_', 'vpx_', 'vpy_', 'vpz_')

    # Fixed-range scaling for x_, y_, and z_ columns
    coord_span = coord_max - coord_min
    for col in train_df.columns:
        if col.startswith(coordinate_prefixes):
            train_df_copy[col] = (train_df_copy[col] - coord_min) / coord_span
            test_df_copy[col] = (test_df_copy[col] - coord_min) / coord_span

    # Global (per-prefix) min-max scaling for local_Re_, vpx_, vpy_, and vpz_ columns
    for prefix in field_prefixes:
        prefix_columns = [col for col in train_df.columns if col.startswith(prefix)]
        if not prefix_columns:
            continue

        global_min = train_df_copy[prefix_columns].min().min()
        global_max = train_df_copy[prefix_columns].max().max()
        global_span = global_max - global_min

        for col in prefix_columns:
            train_df_copy[col] = (train_df_copy[col] - global_min) / global_span
            test_df_copy[col] = (test_df_copy[col] - global_min) / global_span

    # Scaling for all other columns using the RobustScaler (fit on train, applied to test)
    handled_prefixes = coordinate_prefixes + field_prefixes
    for col in train_df.columns:
        if not col.startswith(handled_prefixes):
            train_df_copy[col] = scaler.fit_transform(train_df_copy[[col]])
            test_df_copy[col] = scaler.transform(test_df_copy[[col]])

    return train_df_copy, test_df_copy,scaler


In [7]:
def bin_and_sample(df, time_col, n_bins, sample_frac=0.5):
    """
    Bins the rows into n_bins equal-width groups based on the time column, and randomly
    selects a fraction of the rows from each bin.

    The input DataFrame is left untouched (no helper column is added to it).

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    time_col (str): The name of the time column (assumed to be integers).
    n_bins (int): The number of bins to divide the time into.
    sample_frac (float): The fraction of rows to sample from each bin (default is 0.5).
                         int(len(bin) * sample_frac) rows are drawn from every bin.

    Returns:
    pd.DataFrame: A DataFrame with randomly selected rows from each bin, bins in
                  ascending order, with a fresh 0..N-1 index and the input's columns.
    """

    # Nothing to bin; pd.cut cannot handle an empty column
    if df.empty:
        return df.copy()

    # Bin label of every row, kept outside of df so the caller's frame is not modified.
    # A plain array groups by row position, independent of the index labels.
    time_bin = pd.cut(df[time_col], bins=n_bins, labels=False).to_numpy()

    # Fixed random_state keeps the selection reproducible
    sampled_groups = [
        group.sample(n=int(len(group) * sample_frac), random_state=3, replace=False)
        for _, group in df.groupby(time_bin)
    ]

    if not sampled_groups:
        return df.iloc[0:0].copy()

    return pd.concat(sampled_groups).reset_index(drop=True)

In [8]:
def remove_outliers(df, column, sigma):
    """
    Keep only the rows whose value in `column` lies within sigma standard
    deviations of the column mean.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    column (str): The column to check for outliers.
    sigma (float): The number of standard deviations to use as the threshold.

    Returns:
    pd.DataFrame: The rows of df that pass the check (original index labels kept).
    """
    values = df[column]
    distance_from_mean = abs(values - values.mean())
    within_threshold = distance_from_mean <= sigma * values.std()

    return df[within_threshold]

In [9]:
def remove_outliers_average_based(df, column, magnitude):
    """
    Keep only the rows whose absolute value in `column` is at most
    `magnitude` times the absolute value of the column mean.

    The mean is printed for reference. Its absolute value is used for the
    threshold so that a column with a negative mean is filtered the same way
    as one with a positive mean, instead of every row being rejected.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    column (str): The column to check for outliers.
    magnitude (float): Allowed multiple of the mean magnitude.

    Returns:
    pd.DataFrame: The rows of df that pass the check (original index labels kept).
    """

    mean = df[column].mean()

    print("Mean Value = ",mean)

    threshold = abs(mean) * magnitude

    # Filter the DataFrame
    filtered_df = df[abs(df[column]) <= threshold]

    return filtered_df

In [10]:
def random_rotation_matrix_x_axis():
    """
    Draw a random angle in [0, 2*pi) from numpy's global random state and
    build the 3x3 matrix of a rotation about the x-axis by that angle.

    Returns:
        tuple: (rotation matrix as a (3, 3) numpy array, angle in radians)
    """
    angle = np.random.uniform(0, 2 * np.pi)
    cos_angle, sin_angle = np.cos(angle), np.sin(angle)

    # The x component is left untouched; y and z are mixed by the rotation
    matrix = np.array([
        [1, 0, 0],
        [0, cos_angle, -sin_angle],
        [0, sin_angle, cos_angle],
    ])
    return matrix, angle


def rotate_and_append(df, n,num_neighbors = 15):
    """
    Randomly sample n rows from the dataframe, rotate the 3D coordinates
    using a random rotation matrix around the x-axis, and append the rotated rows back to the dataframe.

    Every sampled row gets its own random rotation (one call to
    random_rotation_matrix_x_axis per row); all points of that row share it. Each point is
    treated as a row vector and multiplied by the matrix from the left (v @ R). All other
    columns of the sampled rows are copied unchanged.

    Parameters:
    df (pandas.DataFrame): DataFrame containing 3D coordinates with columns x_poi, y_poi, z_poi and
                           x_1, y_1, z_1, ..., x_k, y_k, z_k with k = num_neighbors
    n (int): Number of rows to sample from the DataFrame (without replacement)
    num_neighbors (int): Number of neighbour points stored per row (the POI is extra)

    Returns:
    pandas.DataFrame: a copy of df followed by the n rotated rows. The rotated rows keep the
                      index labels of the rows they were sampled from.
    """
    # Randomly sample n rows without replacement
    sampled_df = df.sample(n=n, replace=False).copy()
    df_copy = df.copy()
    point_columns = ["x_poi","y_poi","z_poi"]

    # Column names are derived from the function argument, not from a notebook global
    for i in range(1, num_neighbors + 1):
        point_columns.extend([f'x_{i}', f'y_{i}', f'z_{i}'])

    # Work on a private positional array so that repeated index labels cannot make one
    # assignment hit several rows
    coordinates = sampled_df[point_columns].to_numpy(dtype=float, copy=True)

    for row_number in range(len(coordinates)):

        # Get the rotation matrix
        rotation_matrix,_ = random_rotation_matrix_x_axis()

        # Perform the rotation on all (num_neighbors + 1) points of the row at once
        points = coordinates[row_number].reshape(num_neighbors+1,3)
        coordinates[row_number] = np.dot(points, rotation_matrix).ravel()

    sampled_df[point_columns] = coordinates

    return pd.concat([df_copy,sampled_df])

In [11]:
def plot_3d_vectors(row,num_neighbors=5):
    """
    Show an interactive plotly figure of one row: the point of interest (red marker),
    its first num_neighbors neighbour points (blue markers) and a black line from the
    point of interest to each neighbour.

    Parameters:
    row: mapping / pandas Series with the entries x_poi, y_poi, z_poi and
         x_i, y_i, z_i for i = 1..num_neighbors
    num_neighbors (int): how many neighbour points to draw
    """
    # Origin (POI) and neighbour positions
    origin_x, origin_y, origin_z = row['x_poi'], row['y_poi'], row['z_poi']
    neighbor_ids = range(1, num_neighbors+1)
    neighbor_x = [row[f'x_{k}'] for k in neighbor_ids]
    neighbor_y = [row[f'y_{k}'] for k in neighbor_ids]
    neighbor_z = [row[f'z_{k}'] for k in neighbor_ids]

    fig = go.Figure()

    # Marker for the origin
    origin_trace = go.Scatter3d(
        x=[origin_x], y=[origin_y], z=[origin_z],
        mode='markers',
        marker=dict(size=6, color='red'),
        name='Origin'
    )
    fig.add_trace(origin_trace)

    # Markers for the neighbour points
    neighbor_trace = go.Scatter3d(
        x=neighbor_x, y=neighbor_y, z=neighbor_z,
        mode='markers',
        marker=dict(size=4, color='blue'),
        name='Vectors'
    )
    fig.add_trace(neighbor_trace)

    # One line segment from the origin to every neighbour point
    for tip_x, tip_y, tip_z in zip(neighbor_x, neighbor_y, neighbor_z):
        segment = go.Scatter3d(
            x=[origin_x, tip_x],
            y=[origin_y, tip_y],
            z=[origin_z, tip_z],
            mode='lines',
            line=dict(color='black', width=2),
            showlegend=False
        )
        fig.add_trace(segment)

    # Axis titles, figure title and legend
    axis_titles = dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z')
    fig.update_layout(scene=axis_titles,
                      title='3D Vectors from Origin',
                      showlegend=True)

    # Show the figure
    fig.show()

# Outlier filtering

In [12]:
# experiment_list = [
#                    "rho2_10percent_Re10","rho2_10percent_Re50","rho2_10percent_Re100","rho2_10percent_Re200",
#                    "rho2_10percent_Re300",
#                    "rho2_20percent_Re10","rho2_20percent_Re50","rho2_20percent_Re100","rho2_20percent_Re200",
#                    "rho2_20percent_Re300",
#                    "rho2_30percent_Re10","rho2_30percent_Re50","rho2_30percent_Re100","rho2_30percent_Re200",
#                    "rho2_30percent_Re300",
#                    "rho2_40percent_Re10","rho2_40percent_Re50","rho2_40percent_Re100","rho2_40percent_Re200",
#                    "rho2_40percent_Re300"]

In [13]:

# for i in range(len(experiment_list)):
    
#     experiment = experiment_list[i]
#     time_series_data = pd.read_csv("../ze_time_series_data_raw/data_with_xyz_velocities/"+experiment+".dat",index_col=False)

# #     reduced_time_series = remove_outliers(df=time_series_data, column="Drag", sigma=4.0)
#     reduced_time_series = remove_outliers_average_based(df=time_series_data, column="Drag", magnitude=3.0)
    
#     sns.kdeplot(reduced_time_series["Drag"].values)
#     plt.show()
#     print(experiment)
#     print("reduction in the dataset size %", (time_series_data.shape[0] - reduced_time_series.shape[0])/time_series_data.shape[0])
#     print("size of orig and filtered data", time_series_data.shape[0] , reduced_time_series.shape[0])
#     print("Mean drag",time_series_data["Drag"].mean())
#     print("\n")
    

# Generate Data

In [46]:
### Read data for one experiment ###
experiment = "rho2_40percent_Re300"
raw_data_path = "../ze_time_series_data_raw/data_with_xyz_velocities/"+experiment+".dat"
time_series_data = pd.read_csv(raw_data_path,index_col=False)

### One dataframe per (case_ID, time), then the nearest-neighbour feature array ###
pd_list = group_time_series_data(time_series_data)
nearest_neighbor_data = generate_nearest_neighbor_data(time_series_data)

Currently on case_time subgroup :  367


In [47]:
### Define the particle id ###
# Particles are numbered 1..N inside a case and the numbering is repeated once per
# time step (the last 'time' value of a case is used as its number of time steps).
case_1_rows = time_series_data[time_series_data["case_ID"]==1]
case_2_rows = time_series_data[time_series_data["case_ID"]==2]

num_particle_1 = case_1_rows[case_1_rows["time"]==1].shape[0]
total_time_1 = case_1_rows.iloc[-1]["time"]

num_particle_2 = case_2_rows[case_2_rows["time"]==1].shape[0]
total_time_2 = case_2_rows.iloc[-1]["time"]

# Column vectors (N, 1) tiled along the rows, case 1 first, then case 2
particle_id_case_1 = np.tile(np.arange(1, num_particle_1+1)[:,None], (int(total_time_1), 1))
particle_id_case_2 = np.tile(np.arange(1, num_particle_2+1)[:,None], (int(total_time_2), 1))

particle_id = np.concatenate((particle_id_case_1, particle_id_case_2), axis=0)

In [48]:
### Renaming the columns ###
num_points = 15

# Names for the first 7 columns
initial_columns = ['x_poi', 'y_poi', 'z_poi', 'local_Re_poi', 'vpx_poi', 'vpy_poi', 'vpz_poi']

# Names for each set of 7 columns for each point
point_columns = []
for i in range(1, num_points + 1):
    point_columns.extend([f'x_{i}', f'y_{i}', f'z_{i}', f'local_Re_{i}', f'vpx_{i}', f'vpy_{i}', f'vpz_{i}'])

# Names for the last 4 columns
final_columns = ['Density_ratio', 'glb_phi', 'glb_Re', 'Drag']

# Combine all column names
column_names = initial_columns + point_columns + final_columns

### make a dataframe version of the nearest_neighbor_data numpy array ###
nearest_neighbor_data_pd = pd.DataFrame(nearest_neighbor_data, columns=column_names)

### adding time and case and particle ID columns ###
# The Series are aligned on the row labels, so this relies on the rows of
# nearest_neighbor_data being in the same order as the rows of time_series_data.
nearest_neighbor_data_pd.insert(0, 'case_ID', time_series_data["case_ID"])
nearest_neighbor_data_pd.insert(1, 'particle_ID', particle_id)
nearest_neighbor_data_pd.insert(2, 'time', time_series_data["time"])

print("Total number of time steps for case 1 and 2 respectively" , 
      time_series_data[time_series_data["case_ID"]==1]["time"].values[-1],',',
      time_series_data[time_series_data["case_ID"]==2]["time"].values[-1])

print("80% percent length for case 1 and 2 respectively" , 
      int(time_series_data[time_series_data["case_ID"]==1]["time"].values[-1]*0.80),',',
      int(time_series_data[time_series_data["case_ID"]==2]["time"].values[-1]*0.80) )

### Keeping only non-negative Drag Forces (zero is kept) ###
nearest_neighbor_data_pd = nearest_neighbor_data_pd.loc[nearest_neighbor_data_pd['Drag'] >= 0]

### Remove outliers of very high/ low drags ###
print("Shape before outlier rejection :  ",nearest_neighbor_data_pd.shape[0])
nearest_neighbor_data_pd = remove_outliers_average_based(df=nearest_neighbor_data_pd, column="Drag", magnitude=3.0)
print("Shape after outlier rejection :  ",nearest_neighbor_data_pd.shape[0])
# Label matches the order of the printed values: max first, then min, then mean
print("Max,Min and Mean drag forces : ",nearest_neighbor_data_pd["Drag"].values.max(),
                                        nearest_neighbor_data_pd["Drag"].values.min(),
                                        nearest_neighbor_data_pd["Drag"].values.mean())

Total number of time steps for case 1 and 2 respectively 222 , 145
80% percent length for case 1 and 2 respectively 177 , 116
Shape before outlier rejection :   34618
Mean Value =  57.18778606161534
Shape after outlier rejection :   34452
Min,Max and Mean drag forces :  171.28322 0.044876 56.44343501811216


# Extrapolation Data Split

In [49]:
# # Time-based (80/20) split: the first 80% of the time steps of each case go to train ###
start_time = 0
last_time_1 = time_series_data[time_series_data["case_ID"]==1]["time"].values[-1]
last_time_2 = time_series_data[time_series_data["case_ID"]==2]["time"].values[-1]
interval_length_1 = int(last_time_1*0.80)
interval_length_2 = int(last_time_2*0.80)

case_column = nearest_neighbor_data_pd['case_ID']
time_column = nearest_neighbor_data_pd['time']

# between() is inclusive on both ends: start_time <= time <= start_time + interval_length
in_train_window_1 = time_column.between(start_time, start_time+interval_length_1)
in_train_window_2 = time_column.between(start_time, start_time+interval_length_2)

train_case_1 = nearest_neighbor_data_pd[(case_column == 1) & in_train_window_1]
train_case_2 = nearest_neighbor_data_pd[(case_column == 2) & in_train_window_2]

# # Combine the selected rows for the train dataset
train_dataset = pd.concat([train_case_1, train_case_2])

# # Everything that is not a train row becomes the test dataset
test_dataset = nearest_neighbor_data_pd.drop(train_dataset.index)

print("shape of train and test dataset : ",train_dataset.shape,test_dataset.shape )

shape of train and test dataset :  (27537, 119) (6915, 119)


# For POI based coordinates 

In [50]:
### for POI based coordinates ###
# Express every neighbour position relative to the point of interest of its row:
# x_i -= x_poi, y_i -= y_poi, z_i -= z_poi (the *_poi columns themselves are skipped).
for frame in (train_dataset, test_dataset):

    for col in frame.columns:

        if col.endswith("_poi"):
            continue

        for axis in ("x", "y", "z"):
            if col.startswith(axis + "_"):
                frame[col] -= frame[axis + "_poi"]


### Set poi to zeros ###
### un-scaled data ###
for frame in (train_dataset, test_dataset):
    frame[["x_poi","y_poi","z_poi"]] = 0

# Rotational Invariance

In [51]:
### Rotational invariance soft constraint: append randomly rotated copies of train rows ###
num_synthetic = 500
train_dataset = rotate_and_append(df=train_dataset, n=num_synthetic)

In [59]:
    ### Rotation unit tests ###
    # A rotation must leave the dot product between two position vectors unchanged.
    # One synthetic (rotated) row is compared with the row it was generated from.

    num_neighbors=2

    # rotate_and_append puts the rotated copies at the end of the frame and keeps the
    # index label of the row each copy came from. Picking the label of one of the last
    # num_synthetic rows therefore selects the original row followed by its rotated copy.
    pair_label = train_dataset.index[-np.random.randint(1, num_synthetic + 1)]
    temp = train_dataset.loc[[pair_label]]
    assert len(temp) >= 2, "expected an original row and its rotated copy"

    dot_1 = np.dot(temp.iloc[0][["x_1","y_1","z_1"]],temp.iloc[0][["x_2","y_2","z_2"]])
    dot_2 = np.dot(temp.iloc[1][["x_1","y_1","z_1"]],temp.iloc[1][["x_2","y_2","z_2"]])

    print(dot_1,dot_2)
    assert np.isclose(float(dot_1), float(dot_2)), "rotation changed the dot product"

    # Both rows are drawn into the same figure; only the colour of the lines differs
    # (black for the first row, yellow for the second).
    fig = go.Figure()

    for row, line_color in ((temp.iloc[0], 'black'), (temp.iloc[1], 'yellow')):

        # Extract the origin (POI)
        x_origin, y_origin, z_origin = row['x_poi'], row['y_poi'], row['z_poi']

        # Extract the position vectors (x_1, y_1, z_1, ... up to num_neighbors)
        x_vectors = [row[f'x_{i}'] for i in range(1, num_neighbors+1)]
        y_vectors = [row[f'y_{i}'] for i in range(1, num_neighbors+1)]
        z_vectors = [row[f'z_{i}'] for i in range(1, num_neighbors+1)]

        # Plot the origin
        fig.add_trace(go.Scatter3d(
            x=[x_origin], y=[y_origin], z=[z_origin],
            mode='markers',
            marker=dict(size=6, color='red'),
            name='Origin'
        ))

        # Plot the vectors
        fig.add_trace(go.Scatter3d(
            x=x_vectors, y=y_vectors, z=z_vectors,
            mode='markers',
            marker=dict(size=4, color='blue'),
            name='Vectors'
        ))

        # Draw lines from the origin to each vector
        for i in range(num_neighbors):
            fig.add_trace(go.Scatter3d(
                x=[x_origin, x_vectors[i]], 
                y=[y_origin, y_vectors[i]], 
                z=[z_origin, z_vectors[i]],
                mode='lines',
                line=dict(color=line_color, width=2),
                showlegend=False
            ))

    # Update layout for better visualization
    fig.update_layout(scene=dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z'
    ),
    title='3D Vectors from Origin',
    showlegend=True)

    # Show the figure
    fig.show()

-0.2781056443200007 -0.2781056443200006


# Min-Max Scaling

In [53]:
# Scale both splits with statistics taken from the train split
scaled_outputs = scale_xyz_and_other_columns(train_df=train_dataset, test_df=test_dataset)
train_dataset_scaled, test_dataset_scaled, scaler = scaled_outputs

### scaled data ###
# The POI sits at the origin of its own frame, so its coordinates are reset to 0 after scaling
for scaled_frame in (train_dataset_scaled, test_dataset_scaled):
    scaled_frame[["x_poi","y_poi","z_poi"]] = 0

# Save the Data 

In [54]:
import joblib
scaler_filename = "extrapolation/"+experiment+"/80_20_time_split/" + "scaler.save"

# Make sure the output folder exists; joblib.dump does not create missing directories
os.makedirs(os.path.dirname(scaler_filename), exist_ok=True)
joblib.dump(scaler, scaler_filename) 

# And now to load...
# scaler = joblib.load(scaler_filename) 

['extrapolation/rho2_40percent_Re300/80_20_time_split/scaler.save']

In [55]:
### saving the case, particle and time identifiers of the test rows ###
identifier_path = "extrapolation/"+experiment+"/80_20_time_split/cpt_identifier"
test_identifiers = test_dataset[["case_ID","particle_ID","time"]]
test_identifiers.to_csv(identifier_path)

In [56]:
### Saving the unscaled data ###
# Per split: columns 2..-5 (inputs), columns [0,-4,-3,-2] (scalar inputs), last column (output)
unscaled_dir = "extrapolation/"+experiment+"/80_20_time_split/"

for split_name, split_frame in (("train", train_dataset), ("test", test_dataset)):
    split_frame.iloc[:, 2:-4].to_csv(unscaled_dir + split_name + "_input_unscaled")
    split_frame.iloc[:, [0,-4,-3,-2]].to_csv(unscaled_dir + split_name + "_input_scalar_unscaled")
    split_frame.iloc[:, -1:].to_csv(unscaled_dir + split_name + "_output_unscaled")

In [57]:
### Saving the scaled data ###
# Same column selection as for the unscaled files, file names without the "_unscaled" suffix
scaled_dir = "extrapolation/"+experiment+"/80_20_time_split/"

for split_name, split_frame in (("train", train_dataset_scaled), ("test", test_dataset_scaled)):
    split_frame.iloc[:, 2:-4].to_csv(scaled_dir + split_name + "_input")
    split_frame.iloc[:, [0,-4,-3,-2]].to_csv(scaled_dir + split_name + "_input_scalar")
    split_frame.iloc[:, -1:].to_csv(scaled_dir + split_name + "_output")

# Combining all Data 

In [None]:
# Experiment names follow the pattern rho<density>_<fraction>percent_Re<Reynolds>;
# the loops reproduce the order: density, then volume fraction, then Reynolds number.
experiment_list = [
    f"rho{density}_{fraction}percent_Re{reynolds}"
    for density in (2, 10, 100)
    for fraction in (10, 20, 30, 40)
    for reynolds in (10, 50, 100, 200, 300)
]

### one list of per-experiment dataframes for every (split, kind) of saved file ###
file_kinds = ("input", "input_scalar", "output")
collected = {(split, kind): list() for split in ("train", "test") for kind in file_kinds}

for experiment_name in experiment_list:

    loc = "extrapolation/"+experiment_name+"/120_150_simple/"

    for (split, kind), frames in collected.items():

        loaded = pd.read_csv(loc+split+"_"+kind+"_unscaled",index_col=False)

        ### Removing first column (the index written by to_csv) ###
        frames.append(loaded.drop('Unnamed: 0', axis=1))

### Converting to a dataframe ###
train_input_list = pd.concat(collected[("train", "input")])
train_input_scalar_list = pd.concat(collected[("train", "input_scalar")])
train_output_list = pd.concat(collected[("train", "output")])

test_input_list = pd.concat(collected[("test", "input")])
test_input_scalar_list = pd.concat(collected[("test", "input_scalar")])
test_output_list = pd.concat(collected[("test", "output")])

### combine all dataframes to form a common dataframe for train and test ###
train_dataset = pd.concat([train_input_list,train_input_scalar_list,train_output_list], axis=1)
test_dataset = pd.concat([test_input_list,test_input_scalar_list,test_output_list], axis=1)

In [None]:
### for POI based coordinates ###
# Express every neighbour position relative to the point of interest of its row:
# x_i -= x_poi, y_i -= y_poi, z_i -= z_poi (the *_poi columns themselves are skipped).
for frame in (train_dataset, test_dataset):

    for col in frame.columns:

        if col.endswith("_poi"):
            continue

        for axis in ("x", "y", "z"):
            if col.startswith(axis + "_"):
                frame[col] -= frame[axis + "_poi"]

In [None]:
### Combined dataset scaling ###
# scale_xyz_and_other_columns returns three values (train, test, scaler); the scaler gets
# its own name here so the per-experiment `scaler` is not overwritten.
train_dataset_scaled,test_dataset_scaled,combined_scaler = scale_xyz_and_other_columns(train_dataset,test_dataset)

In [None]:
### Saving the unscaled data ###
# Per split: all but the last 4 columns (inputs), the 3 columns before the last
# (scalar inputs), and the last column (output)
combined_unscaled_dir = "extrapolation/combined_data/120_150_simple/"

for split_name, split_frame in (("train", train_dataset), ("test", test_dataset)):
    split_frame.iloc[:, 0:-4].to_csv(combined_unscaled_dir + split_name + "_input_unscaled")
    split_frame.iloc[:, -4:-1].to_csv(combined_unscaled_dir + split_name + "_input_scalar_unscaled")
    split_frame.iloc[:, -1:].to_csv(combined_unscaled_dir + split_name + "_output_unscaled")

In [None]:
### Saving the scaled data ###
# Same column selection as for the unscaled files, file names without the "_unscaled" suffix
combined_scaled_dir = "extrapolation/combined_data/120_150_simple/"

for split_name, split_frame in (("train", train_dataset_scaled), ("test", test_dataset_scaled)):
    split_frame.iloc[:, 0:-4].to_csv(combined_scaled_dir + split_name + "_input")
    split_frame.iloc[:, -4:-1].to_csv(combined_scaled_dir + split_name + "_input_scalar")
    split_frame.iloc[:, -1:].to_csv(combined_scaled_dir + split_name + "_output")

# Verify

In [None]:
def dist_euclidean(vec_1,vec_2):
    """
    Euclidean distance between two points in 3-D.

    Parameters:
        vec_1, vec_2: indexable sequences; only components 0, 1 and 2 are read.
    Returns:
        The distance as computed by np.sqrt.
    """
    dx, dy, dz = (vec_1[axis] - vec_2[axis] for axis in range(3))

    return np.sqrt( dx**2 + dy**2 + dz**2 )

def brute_search(query,tree,n_nearest=16):
    """
    Brute-force nearest-neighbour search.

    Computes the distance from `query` to every row of `tree` in a single
    vectorised pass (previously one Python-level helper call per row) and
    returns the row indices of the closest points.

    Parameters:
        query: array-like; its first three components are the query position.
        tree: array-like of shape (n_points, >=3) holding the candidate points
            (a plain coordinate array, not a KD-tree object).
        n_nearest (int): maximum number of indices to return.
    Returns:
        np.ndarray: indices into `tree`, ordered by increasing distance. Holds
        fewer than `n_nearest` entries when `tree` has fewer rows, and is empty
        when `tree` is empty.
    """
    points = np.asarray(tree, dtype=float)
    if points.size == 0:
        # Nothing to rank; return an empty index array rather than failing.
        return np.array([], dtype=np.intp)

    target = np.asarray(query, dtype=float)

    # Only the first three components take part, matching dist_euclidean.
    delta = points[:, :3] - target[:3]
    dist = np.sqrt(np.square(delta).sum(axis=1))

    return np.argsort(dist)[0:n_nearest]

In [None]:
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho100_10percent_Re100.dat")
# NOTE(review): the groups below are built from `time_series_data` (defined in an
# earlier cell), not from the `raw_data` frame loaded on the line above, which is
# unused in this cell - confirm which source is intended.
pd_list = group_time_series_data(time_series_data)
nn_list = list()

for i in range(len(pd_list)):

    _,stacked_df = get_periodic_coordinates(pd_list[i],5)
    print("Case_time subset number : ",str(i+1))

    # Coordinate arrays are the same for every particle of this group, so they
    # are extracted once here instead of once per inner-loop iteration.
    poi_coords = pd_list[i][["x","y","z"]].values
    image_coords = stacked_df[["x","y","z"]].values

    for j in range(len(pd_list[i])):

        ### getting nearest neighbors ###
        poi = poi_coords[j]
        idx = brute_search(poi,image_coords)
        # Full rows of the periodic frame are kept, not only the xyz columns
        nn_list.append(stacked_df.iloc[idx])

    clear_output(wait=True)

### Defining the arrays to compare ###
nn_list = np.stack(nn_list)
# The first 64 values of each stored sample are viewed as 16 rows of 4 values and
# the first three columns of each row are kept (presumably x, y, z - TODO confirm)
previous = np.stack([ nearest_neighbor_data[i][0:64].reshape(16,4)[:,0:3] for i in range(len(nearest_neighbor_data))])

# Verify 2

In [None]:
# Load the selected experiment's table (read with pandas' CSV parser) and split
# it with group_time_series_data
time_series_data = pd.read_csv(
    "../ze_time_series_data_raw/data_with_xyz_velocities/" + experiment + ".dat",
    index_col=False,
)
pd_list = group_time_series_data(time_series_data)

def create_periodic_images(coords, Lx, Ly, Lz):
    """
    Replicate a set of points into the 27 periodic images of a box.

    Parameters:
        coords: array of point coordinates, one row per point (x, y, z).
        Lx, Ly, Lz: box lengths along x, y and z.
    Returns:
        np.ndarray: the 27 shifted copies stacked along axis 0. Copies are
        ordered with the x shift varying slowest and the z shift fastest, each
        shift running over -1, 0, 1; the unshifted copy is therefore block 13.
    """
    offsets = (-1, 0, 1)

    images = [
        coords + np.array([nx * Lx, ny * Ly, nz * Lz])
        for nx in offsets
        for ny in offsets
        for nz in offsets
    ]

    return np.concatenate(images, axis=0)

def find_nearest_neighbors(coords, extended_coords, Lx, Ly, Lz, num_neighbors=15):
    """
    Look up the nearest periodic-image points of every query point with a KD-tree.

    Parameters:
        coords: query points, one row per point.
        extended_coords: candidate points the tree is built from.
        Lx, Ly, Lz: accepted for interface compatibility; not used here.
        num_neighbors (int): number of neighbours wanted besides the query itself.
    Returns:
        np.ndarray: indices into `extended_coords`, num_neighbors + 1 per query
        point. The extra slot is there because a query point that is also in
        `extended_coords` matches itself at distance zero; that match is NOT
        removed from the result.
    """
    _, neighbor_indices = cKDTree(extended_coords).query(coords, k=num_neighbors + 1)

    return neighbor_indices


# One coordinate array per group is collected here: for every particle, the
# positions of the points returned by the KD-tree query (itself included).
nearest_neighbor_data_verify_2 = []

# Define the domain size (replace with actual domain sizes)
Lx, Ly, Lz = 5, 5, 5

for i, df in enumerate(pd_list):
    print("case_time subgroup number : ",str(i+1))

    # Particle positions of this group as a plain array
    coordinates = df[['x', 'y', 'z']].values

    # Candidate set: the positions replicated into all periodic images
    extended_coords = create_periodic_images(coordinates, Lx, Ly, Lz)

    # Indices of the closest candidates for every particle
    nearest_neighbors = find_nearest_neighbors(coordinates, extended_coords, Lx, Ly, Lz)

    # Fancy indexing turns the index table into a table of coordinates
    nearest_neighbor_data_verify_2.append(extended_coords[nearest_neighbors])
    clear_output(wait=True)

# Stack all groups, then flatten each particle's neighbour coordinates into one row
nearest_neighbor_data_verify_2 = np.vstack(nearest_neighbor_data_verify_2)
nearest_neighbor_data_verify_2 = nearest_neighbor_data_verify_2.reshape(len(nearest_neighbor_data_verify_2),-1)

In [None]:
# Keep only the columns whose names start with x_, y_ or z_ and compare them,
# element by element, with the coordinates recomputed through the KD-tree.
filtered_df = nearest_neighbor_data_pd.filter(regex=r'^(x_|y_|z_)')
decision = np.array_equal(nearest_neighbor_data_verify_2,filtered_df.values)
if decision:
    print("Its correct !")
else:
    # A failed check used to print nothing, which was indistinguishable from the
    # cell never having run; report the mismatch explicitly instead.
    print("Verification FAILED : recomputed shape ",nearest_neighbor_data_verify_2.shape,
          " vs stored shape ",filtered_df.values.shape)

# Ze data comparison unit testing

In [None]:
### Load ze training data ###
# Reference array used by the row-matching check in the next cell
ze_data = np.load(
    "../ze_time_series_data_raw/TrainTest_shared_by_ze/train_random_all.npy"
)
# ze_data = ze_data[:, np.concatenate([np.arange(45), [-2]])]

In [None]:
# Coordinate column names of the numbered points, ordered x_1, y_1, z_1, x_2, ...
point_columns = [
    f'{axis}_{point_id}'
    for point_id in range(1, num_points + 1)
    for axis in ('x', 'y', 'z')
]

# Only the first 45 columns of the reference array take part in the comparison
ze_coords = ze_data[:,0:45]

# for i in range(len(train_dataset)):
for i in range(100):
    # Row i of the training frame, cast to float32 and given a leading axis so it
    # broadcasts against every reference row; a match requires all columns to agree
    row_coords = train_dataset.iloc[i][point_columns].values[None,:].astype('float32')
    matching_index = np.where((ze_coords == row_coords).all(axis=1))[0]
    print("Index in the train_dataset Data Frame and the index in ze_dat : ",i,matching_index)
    print("\n")

# Verify Scaling

In [None]:
### check scaling ###
# Drop the first two and the last four columns of the training frame
temp = np.array(train_dataset.values[:,2:-4])

# View every sample as 16 nodes x 7 features, then put all nodes of all samples
# underneath each other so the scaler sees one row per node
temp_nodal = temp.reshape(temp.shape[0],16,7).reshape(temp.shape[0]*16,7)

# Min-max scaling per feature column, i.e. with statistics pooled over all nodes
scaler = MinMaxScaler()
temp_nodal_scaled = scaler.fit_transform(temp_nodal)

# Back to one row per sample
temp = temp_nodal_scaled.reshape(temp.shape)

In [None]:
# Residual between the stored scaled data and the independently re-scaled copy
res = train_dataset_scaled.values[:,2:-4] - temp

# Column names follow the per-node layout: the point of interest first, then the
# points numbered 1 to 15, each contributing the same seven features in order
_node_features = ["x","y","z","local_Re","vpx","vpy","vpz"]
_node_labels = ["poi"] + [str(node) for node in range(1, 16)]
_res_columns = [f"{feature}_{label}" for label in _node_labels for feature in _node_features]

res_pd = pd.DataFrame(res,columns=_res_columns)

# Remove the coordinate columns (names starting with x_, y_ or z_)
columns_to_drop = res_pd.filter(regex='^(x_|y_|z_)').columns
res_pd = res_pd.drop(columns=columns_to_drop)

# Statistical Analysis of the Raw Data

Raw_data

In [None]:
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho100_40percent_Re300.dat")
pd_list = group_time_series_data(raw_data)

# Split the table by case_ID so the two cases can be compared
raw_data_case_1 = raw_data.loc[raw_data["case_ID"]==1]
raw_data_case_2 = raw_data.loc[raw_data["case_ID"]==2]

# Overlay the kernel density estimate of local_Re for both cases
for case_frame in (raw_data_case_1, raw_data_case_2):
    sns.kdeplot(case_frame["local_Re"],fill=True)
plt.legend(["Case 1","Case 2"])

In [None]:
# Density of all coordinate values (x, y and z pooled into one sample) per case
sns.kdeplot(raw_data_case_1[["x","y","z"]].to_numpy().ravel(),fill=True)
sns.kdeplot(raw_data_case_2[["x","y","z"]].to_numpy().ravel(),fill=True)

Scaled Train/Test Data

In [None]:
scaled_data_train = np.load("/home/neilashwinraj/gnns/simple_connections/simple_connections_data/time_split/rho100_10percent_Re100/train_inputs.npy")
scaled_data_test = np.load("/home/neilashwinraj/gnns/simple_connections/simple_connections_data/time_split/rho100_10percent_Re100/test_inputs.npy")

# Compare the distribution of index 0 along the last axis between the two splits
for scaled_split in (scaled_data_train, scaled_data_test):
    sns.kdeplot(scaled_split[:,:,0].flatten())
plt.legend(["Train","Test"])

In [None]:
def histogram_distances_default(distance_list):
    """Histogram the distances with np.histogram's default binning.

    Returns:
        (hist, bin_edges) exactly as produced by np.histogram.
    """
    counts, edges = np.histogram(distance_list)
    return counts, edges

def histogram_distances(distance_list, max_dist, bin_size):
    """Histogram the distances on a regular grid of bins.

    Parameters:
        distance_list: values to bin.
        max_dist: upper end of the range to cover.
        bin_size: spacing between consecutive bin edges.
    Returns:
        (hist, bin_edges) from np.histogram, with edges
        np.arange(0, max_dist + bin_size, bin_size).
    """
    # Edges start at 0; the stop value is pushed one bin past max_dist because
    # np.arange excludes its stop value
    edges = np.arange(0, max_dist+bin_size, bin_size)
    counts, edges_out = np.histogram(distance_list, bins=edges)
    return counts, edges_out

def plot_histogram(hist,bin_edges):
    """Plot histogram counts against bin centres on the current matplotlib axes.

    N bins come with N+1 edges; each centre is the mean of the two edges that
    bound the bin.
    """
    left_edges, right_edges = bin_edges[:-1], bin_edges[1:]
    midpoints = (left_edges + right_edges)/2.0
    plt.plot(midpoints,hist,marker='o')
    plt.ylabel("N(r)")
    plt.xlabel("$r$")
    
def compute_distances_minimum_image( configuration, box_size ):
    """
    Pairwise particle distances under the minimum-image convention.

    Parameters:
        configuration: array-like of shape (num_particles, n_dims) with one
            position per row (a 1-D input is treated as num_particles scalar
            positions).
        box_size: periodic box length; a scalar, or anything that broadcasts
            against one position row.
    Returns:
        np.ndarray: 1-D array of length num_particles * (num_particles - 1).
        Every ordered pair (i, j) with i != j contributes one entry, so each
        physical pair appears twice. Entries are ordered by i, then by j.
        Empty when there are fewer than two particles.
    """
    positions = np.asarray(configuration, dtype=float)
    if positions.ndim == 1:
        positions = positions[:, None]

    num_particles = positions.shape[0]
    if num_particles < 2:
        return np.array([])

    per_particle = []
    for i in range(num_particles):
        # Displacements from particle i to every particle, in one vectorised step
        dr = positions - positions[i]
        # Minimum image: subtract the whole number of box lengths nearest to
        # each component, leaving it within half a box length of zero
        dr = dr - box_size*np.floor(dr/box_size+0.5)

        dist = np.sqrt( (dr*dr).sum(axis=1) )
        # Drop the zero self-distance (entry i) before storing the row
        per_particle.append(np.delete(dist, i))

    return np.concatenate(per_particle)

def plot_rdf(gofr,bin_centers):
    """Plot g(r) against the bin centres on the current matplotlib axes."""
    plt.plot(bin_centers, gofr, marker='o')
    plt.xlabel("$r$")
    plt.ylabel("g(r)")
    
def get_gofr(hist,bin_edges,num_particles, box_size):
    """
    Normalise a distance histogram into a radial distribution function.

    Parameters:
        hist: counts per distance bin.
        bin_edges: edges of those bins; the width of the first bin is used for
            every bin.
        num_particles: number of particles in the configuration.
        box_size: edge length of the cubic box (the density uses box_size**3).
    Returns:
        (gofr, bin_centers): hist divided by 4*pi*r^2*dr*rho*num_particles
        evaluated at the bin centres, and the bin centres themselves.
    """
    number_density = num_particles/(box_size**3)
    bin_width = bin_edges[1]-bin_edges[0]
    shell_centers = (bin_edges[1:]+bin_edges[:-1])/2.0

    # Count expected in each spherical shell at uniform density
    expected_counts = 4.*np.pi*(shell_centers**2)*bin_width*number_density*( num_particles )

    return hist/expected_counts, shell_centers

In [None]:
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho100_40percent_Re300.dat")
pd_list = group_time_series_data(raw_data)

# Bin centres and g(r) values, one entry per group, reused by the following cells
x_list = []
y_list = []

### Applying functions to calculate the RDF ###
for i, snapshot in enumerate(pd_list):

    # Pairwise minimum-image distances in a box of edge 5, binned in steps of 0.1
    # up to (5/2)*sqrt(3)
    distance_list = compute_distances_minimum_image( snapshot[["x","y","z"]].values, 5 )
    hist, bin_edges = histogram_distances(distance_list=distance_list, max_dist=(5/2)*np.sqrt(3),bin_size=0.1)

    gofr, bin_centers = get_gofr(hist,bin_edges,num_particles = len(snapshot), box_size=5)
    x_list.append(bin_centers)
    y_list.append(gofr)

    # Show the curve of the current group; the output is replaced on the next pass
    plt.plot( bin_centers,gofr)
    plt.ylim([-0.05,3.25])
    plt.show()
    print("Currently at case : ",str(np.unique(snapshot["case_ID"].values)[0])
          ,"and time step : ",np.unique(snapshot["time"].values)[0])
    clear_output(wait=True)
    

In [None]:
# Largest time value of case 1, used as the number of curves belonging to it
# NOTE(review): assumes the case-1 groups come first in x_list / y_list - confirm
case_1_times = raw_data.loc[raw_data["case_ID"]==1, "time"]
no_time_steps_case_1 = case_1_times.values.max()

for i in range(no_time_steps_case_1):
    plt.plot(x_list[i], y_list[i], c="b", alpha=0.8)

In [None]:
# Largest time value of case 2; its curves are read from the entries that follow
# the case-1 block in x_list / y_list
case_2_times = raw_data.loc[raw_data["case_ID"]==2, "time"]
no_time_steps_case_2 = case_2_times.values.max()

first_case_2 = no_time_steps_case_1
for i in range(first_case_2, first_case_2 + no_time_steps_case_2):
    plt.plot(x_list[i], y_list[i], c="r", alpha = 0.1)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation, FFMpegWriter

# Load the raw particle table and split it by case_ID
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho10_20percent_Re100.dat")
case_1_all_timsteps = raw_data[raw_data["case_ID"]==1]
case_2_all_timsteps = raw_data[raw_data["case_ID"]==2]

### for case 1
# num_particles = len(case_1_all_timsteps[case_1_all_timsteps["time"]==1])
# time_steps = case_1_all_timsteps["time"].values[-1]
# df = case_1_all_timsteps.copy()

### for case 2
num_particles = len(case_2_all_timsteps[case_2_all_timsteps["time"]==1])
time_steps = case_2_all_timsteps["time"].values[-1]
df = case_2_all_timsteps.copy()

# Animate exactly the time values present in the data. The previous
# frames=range(time_steps) started at 0 and stopped one short of the last time
# value, so the final step was never drawn (and frame 0 is empty if no row
# carries time 0).
frame_times = np.sort(df["time"].unique()).tolist()

# Initialize the figure and 3D axis
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Initialize a scatter plot with dummy data
scat = ax.scatter([], [], [], c='r')

# Set axis limits
ax.set_xlim([0, 5])
ax.set_ylim([0, 5])
ax.set_zlim([0, 5])

# Function to update the scatter plot for each frame
def update(frame):
    """Move the scatter points to the particle positions recorded at time `frame`."""
    # Filter the DataFrame for the current time step
    current_data = df[df['time'] == frame]
    # Update the scatter plot data (3-D scatters keep their positions in _offsets3d)
    scat._offsets3d = (current_data['x'], current_data['y'], current_data['z'])
    ax.set_title(f'Time step: {frame}')
    return scat,

# Create the animation
ani = FuncAnimation(fig, update, frames=frame_times, interval=200, blit=False)

# Save the animation as an MP4 file
writer = FFMpegWriter(fps=5, metadata=dict(artist='Me'), bitrate=1800)
ani.save('particle_movement.mp4', writer=writer)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

# Load the raw particle table and split it by case_ID
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho10_20percent_Re100.dat")
case_1_all_timsteps = raw_data.loc[raw_data["case_ID"]==1]
case_2_all_timsteps = raw_data.loc[raw_data["case_ID"]==2]

# The frames below are rendered from a private copy of case 1
df = case_1_all_timsteps.copy()

# Function to create frames
def create_frames(df, output_dir='frames', resolution=(1920, 1080)):
    """
    Render one 3-D scatter image per time value of `df`.

    Parameters:
        df (pandas dataframe): must contain the columns time, x, y and z.
        output_dir (str): directory receiving the images; created if missing.
        resolution (tuple): (width, height); each is divided by 100 to get the
            figure size in inches (so it maps to pixels at 100 dpi -
            NOTE(review): depends on the active matplotlib dpi settings).
    Returns:
        None. Files are written as <output_dir>/frame_<time, 4-digit zero padded>.png.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(output_dir, exist_ok=True)

    # Limits and figure size are identical for every frame, so compute them once
    x_limits = (df['x'].min(), df['x'].max())
    y_limits = (df['y'].min(), df['y'].max())
    z_limits = (df['z'].min(), df['z'].max())
    figsize = (resolution[0]/100, resolution[1]/100)

    for step in df['time'].unique():
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111, projection='3d')
        ax.set_xlim(*x_limits)
        ax.set_ylim(*y_limits)
        ax.set_zlim(*z_limits)
        ax.set_title(f'Time: {step}')

        subset = df[df['time'] == step]
        sc = ax.scatter(subset['x'], subset['y'], subset['z'], c=subset['z'], cmap='viridis', marker='o',s=50)
        fig.colorbar(sc, ax=ax, label='z')

        # Save and close this specific figure rather than whichever is "current"
        fig.savefig(f"{output_dir}/frame_{step:04d}.png")
        plt.close(fig)

# Function to create video
def create_video(output_file='output.mp4', frame_rate=10, resolution=(1920, 1080), frames_dir='frames'):
    """
    Assemble the .png files of a directory into an mp4 video.

    Parameters:
        output_file (str): path of the video to write.
        frame_rate (int): frames per second of the video.
        resolution (tuple): unused; kept so existing calls keep working. The
            video size is taken from the first frame instead.
        frames_dir (str): directory holding the frames. Defaults to 'frames',
            the default output_dir of create_frames.
    Raises:
        FileNotFoundError: if `frames_dir` contains no .png file.
        ValueError: if the first frame cannot be decoded.
    """
    # File names are sorted lexicographically, which matches time order for the
    # zero-padded names written by create_frames
    images = sorted(img for img in os.listdir(frames_dir) if img.endswith(".png"))
    if not images:
        raise FileNotFoundError(f"no .png frames found in '{frames_dir}'")

    first_frame = cv2.imread(os.path.join(frames_dir, images[0]))
    if first_frame is None:
        # cv2.imread signals an unreadable file by returning None
        raise ValueError(f"could not read frame '{images[0]}' from '{frames_dir}'")
    height, width = first_frame.shape[:2]

    video = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (width, height))

    try:
        for image in images:
            video.write(cv2.imread(os.path.join(frames_dir, image)))
    finally:
        # Always finalise the file, even if a frame fails part-way through.
        # No cv2.destroyAllWindows() here: this function opens no windows.
        video.release()

# Render one PNG per time value into ./frames ...
create_frames(df, output_dir='frames', resolution=(1920, 1080))

# ... then stitch the PNGs found in ./frames into output.mp4
create_video(output_file='output.mp4', frame_rate=10, resolution=(1920, 1080))


# Garbage

In [None]:
# # Define the time thresholds 80/20 split ###
# time_1 = int(time_series_data[time_series_data["case_ID"]==1]["time"].values[-1]*0.80)
# time_2 = int(time_series_data[time_series_data["case_ID"]==2]["time"].values[-1]*0.80)

# # Select rows for the train dataset based on the given conditions
# train_case_1 = nearest_neighbor_data_pd[(nearest_neighbor_data_pd['case_ID'] == 1) & (nearest_neighbor_data_pd['time'] <= time_1)]
# train_case_2 = nearest_neighbor_data_pd[(nearest_neighbor_data_pd['case_ID'] == 2) & (nearest_neighbor_data_pd['time'] <= time_2)]

# # Combine the selected rows for the train dataset
# train_dataset = pd.concat([train_case_1, train_case_2])

# # Select all other rows for the test dataset
# test_dataset = nearest_neighbor_data_pd.drop(train_dataset.index)

# print("shape of train and test dataset : ",train_dataset.shape,test_dataset.shape )

In [None]:
# ### Ze paper split with time interval definition ###
# start_time = 0
# interval_length = 150
# interval_length_val = 30

# # Select rows for the train dataset based on the given conditions ###
# train_case_1 = nearest_neighbor_data_pd[(nearest_neighbor_data_pd['case_ID'] == 1) & (nearest_neighbor_data_pd['time'] <= start_time+interval_length)
#                                                                    & (nearest_neighbor_data_pd['time'] >= start_time)]

# train_case_2 = nearest_neighbor_data_pd[(nearest_neighbor_data_pd['case_ID'] == 2) & (nearest_neighbor_data_pd['time'] <= start_time+interval_length)
#                                                                    & (nearest_neighbor_data_pd['time'] >= start_time)  ]

# # Combine the selected rows for the train dataset
# train_dataset = pd.concat([train_case_1, train_case_2])
# print("Size of the non sample train dataframe",train_dataset.shape)

# # Select all other rows for the test dataset
# test_dataset = nearest_neighbor_data_pd.drop(train_dataset.index)
# test_dataset = test_dataset[(test_dataset["time"] <= start_time+interval_length+interval_length_val)&
#                             (test_dataset["time"] >= start_time+interval_length)]

# ### Randomly sample after binning ###
# train_case_1_bin_sampled = bin_and_sample(df = train_dataset[train_dataset["case_ID"]==1], time_col="time", n_bins=10, sample_frac=0.5)
# train_case_2_bin_sampled = bin_and_sample(df = train_dataset[train_dataset["case_ID"]==2], time_col="time", n_bins=10, sample_frac=0.5)

# ### reconstruct train dataset ###
# train_dataset = pd.concat([train_case_1_bin_sampled,train_case_2_bin_sampled])

# print("shape of train (after sampling) and test dataset : ",train_dataset.shape,test_dataset.shape )

In [None]:
# ### Saving the unscaled data ###
# train_dataset.iloc[:, 2:-4].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global/train_input_unscaled")
# test_dataset.iloc[:, 2:-4].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global/test_input_unscaled")

# train_dataset.iloc[:, [0,-4,-3,-2]].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global/train_input_scalar_unscaled")
# test_dataset.iloc[:, [0,-4,-3,-2]].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global/test_input_scalar_unscaled")

# train_dataset.iloc[:, -1:].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global//train_output_unscaled")
# test_dataset.iloc[:, -1:].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global//test_output_unscaled")

In [None]:
# ### Saving the scaled data ###
# train_dataset_scaled.iloc[:, 2:-4].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global/train_input")
# test_dataset_scaled.iloc[:, 2:-4].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global//test_input")

# train_dataset_scaled.iloc[:, [0,-4,-3,-2]].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global//train_input_scalar")
# test_dataset_scaled.iloc[:, [0,-4,-3,-2]].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global//test_input_scalar")

# train_dataset_scaled.iloc[:, -1:].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global//train_output")
# test_dataset_scaled.iloc[:, -1:].to_csv("/home/neilashwinraj/gnns/temporal_gnns/ablation_study/datasets_dev/rho2_phi20_Re10_simple_global//test_output")