In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.neighbors import KDTree
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import copy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
import os

Matplotlib created a temporary cache directory at /localscratch-ssd/288438/matplotlib-w41__5fr because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Generate from Ze time series dataset 

In [2]:
import itertools

def get_periodic_coordinates(coord, size):
    """
    Replicate a set of particles into the 27 periodic images of a cubic box.

    Every combination of shifts (-1, 0, +1) * size along x, y and z is applied,
    in the order produced by itertools.product, so the unshifted copy is the
    14th block (index 13) of the output.

    Parameters:
        coord (pandas dataframe): One row per particle; must contain the
            columns "x", "y" and "z". Other columns are carried along unchanged.
        size (int or float): Edge length of the cubic domain.
    Returns:
        tuple: (coordinates, stacked_df) where
            coordinates is a numpy array of shape (27 * n_particles, 3) holding
            the shifted positions, and
            stacked_df is `coord` repeated 27 times (fresh 0..N-1 index) with
            its x, y, z columns overwritten by the shifted positions, so that
            row k of both outputs describes the same image particle.
    """
    ### One copy of every row (all columns) per periodic image ###
    stacked_df = pd.concat([coord] * 27, axis=0, ignore_index=True)

    ### Get coordinates ###
    if isinstance(coord, pd.DataFrame):
        coord = coord[["x","y","z"]].values

    # All combinations of displacements (-1, 0, 1) along each axis -> (27, 3)
    displacements = np.array(list(itertools.product([-1, 0, 1], repeat=3)))

    # Broadcast (1, n, 3) + (27, 1, 3) -> (27, n, 3), then flatten image-major
    # so the row order matches the 27 stacked copies in stacked_df.
    tp_coordinates = (coord[None, :, :3] + displacements[:, None, :] * size).reshape(-1, 3)

    stacked_df[["x","y","z"]] = tp_coordinates

    return tp_coordinates,stacked_df

In [3]:
def group_time_series_data(time_series_data):
    """
    Split the raw table into one dataframe per (case_ID, time) pair.

    Parameters:
       time_series_data (pandas dataframe) : raw time-series table; must contain
           the columns "case_ID" and "time"
    Returns:
        list: pandas dataframes, one per unique (case_ID, time) combination, in
        the order pandas groupby yields them
    """
    per_snapshot = time_series_data.groupby(['case_ID', 'time'])

    # The group keys are not needed, only the sub-frames.
    return [snapshot for _, snapshot in per_snapshot]

In [4]:
def generate_nearest_neighbor_data(time_series_data, box_size=5, n_neighbors=16):

    """
    Build the flat nearest-neighbor feature table for every particle snapshot.

    For each (case_ID, time) group the particles are replicated into their 27
    periodic images, a KD-tree is built on the images, and the `n_neighbors`
    closest image particles are looked up for every particle of the group.
    Because the query point itself is in the tree, it is normally its own first
    neighbor.

    Parameters:
       time_series_data (pandas dataframe) : raw time-series table with the
           columns case_ID, time, x, y, z, Density_ratio, glb_phi, glb_Re,
           local_Re and Drag
       box_size (int or float) : edge length of the periodic cubic box
           (default 5, the value previously hard-coded)
       n_neighbors (int) : number of neighbors per particle (default 16, the
           value previously hard-coded)
    Returns:
        numpy array of shape (n_groups * n_particles, n_neighbors*4 + 5):
        per row, (x, y, z, local_Re) of each neighbor, followed by the
        particle's Density_ratio, glb_phi, glb_Re, local_Re and Drag.

    Note: np.stack is used across groups, so every group must contain the same
    number of particles.
    """

    pd_list = group_time_series_data(time_series_data)

    nearest_neighbor_data = list()
    nearest_neighbor_data_extra = list()
    scalar_data = list()

    ### Loop over different groups ###

    for group_number, group in enumerate(pd_list, start=1):

        print("Currently on case_time subgroup : ",str(group_number))
        tp_particles,stacked_df = get_periodic_coordinates(group,box_size)
        tree = cKDTree(tp_particles)

        ### One batched query for all particles of the group (instead of one query per particle) ###
        query_points = group[["x","y","z"]].values
        # reshape keeps idx 2-D even when n_neighbors == 1 (query then returns a 1-D array)
        idx = tree.query(query_points,n_neighbors)[1].reshape(len(query_points),-1)
        nearest_neighbor_data.append(tp_particles[idx])

        ### merging nodal data to the coordinates ###
        nearest_neighbor_data_extra.append(merge_columns_to_pandas_list(tp_particles[idx],"local_Re",stacked_df))

        ### Getting the scalar data ###
        scalar_data.append( group[["Density_ratio","glb_phi","glb_Re","local_Re","Drag"]] )
        clear_output(wait=True)

    ### Populate graph and scalar lists ###
    nearest_neighbor_data = np.stack(nearest_neighbor_data)
    nearest_neighbor_data_extra = np.stack(nearest_neighbor_data_extra)

    # (groups, particles, neighbors, features) -> (groups*particles, neighbors*features)
    nearest_neighbor_data = nearest_neighbor_data.reshape(nearest_neighbor_data.shape[0]*nearest_neighbor_data.shape[1]
                                           ,nearest_neighbor_data.shape[2]*nearest_neighbor_data.shape[3])

    nearest_neighbor_data_extra = nearest_neighbor_data_extra.reshape(nearest_neighbor_data_extra.shape[0]*nearest_neighbor_data_extra.shape[1]
                                           ,nearest_neighbor_data_extra.shape[2]*nearest_neighbor_data_extra.shape[3])

    scalar_data = np.stack(scalar_data)
    scalar_data = scalar_data.reshape(scalar_data.shape[0]*scalar_data.shape[1],scalar_data.shape[2])

    ### change code if you want to return nearest_neighbor_data (coordinates only) or extra ###
    return np.concatenate( (nearest_neighbor_data_extra,scalar_data) ,axis=1)

In [5]:
def merge_columns_to_pandas_list(nearest_neighbor_data,variable_list,master_dataframe):

    """ Attach columns of the master dataframe to every block of neighbor coordinates.

        nearest_neighbor_data is iterated along its first axis; each item is turned
        into a dataframe with the columns x, y, z. That frame is inner-merged with
        master_dataframe on (x, y, z), the column(s) named by variable_list are
        taken from the merge result and placed next to the coordinates.

        Returns a list with one dataframe per item of nearest_neighbor_data.
    """

    merged_frames = []

    for neighbor_block in nearest_neighbor_data:

        coords_df = pd.DataFrame(neighbor_block,columns=["x","y","z"])

        # Exact match on the coordinate triplet; sort=False keeps the merge from re-sorting the keys.
        looked_up = pd.merge(coords_df,master_dataframe,how="inner",on=['x','y','z'],sort=False)[variable_list]

        # Side-by-side concatenation aligns on the row index of both pieces.
        merged_frames.append(pd.concat([coords_df,looked_up], axis=1))

    return merged_frames

In [6]:
def modify_nearest_neighbor_data(nearest_neighbor_data,pd_list):

    """
    Prefix every nearest-neighbor row with three identifier columns.

    Column 0 is the case_ID and column 2 the time of the group the row came from;
    column 1 is a 1-based particle number, i.e. the row's position inside its
    group. The remaining columns are nearest_neighbor_data unchanged.

    The per-group arrays are combined with np.stack, so all groups in pd_list
    must have the same number of rows.
    """
    case_ids, particle_ids, time_stamps = [], [], []

    for group in pd_list:
        case_ids.append(group["case_ID"].values)
        particle_ids.append(np.arange(group.shape[0])+1)
        time_stamps.append(group["time"].values)

    ### Flatten each identifier to one long column vector ###
    id_columns = [np.stack(values).flatten().reshape(-1,1) for values in (case_ids, particle_ids, time_stamps)]

    ### Combining columns with nearest_neighbor_data ###
    return np.concatenate( (*id_columns, nearest_neighbor_data), axis=1 )

In [8]:
# def generate_temporally_related_datasets(nearest_neighbor_data_modified,history_length=3):
    
    

In [19]:
# ### Read data ###
# Name of the experiment; used for the input file name here and for the output
# directories in the save cells.
experiment = "rho2_40percent_Re100"
time_series_data = pd.read_csv("../ze_time_series_data_raw/"+experiment+".dat")

# One dataframe per (case_ID, time) snapshot.
pd_list = group_time_series_data(time_series_data)
# Flat table: neighbor features followed by five scalar columns per particle.
nearest_neighbor_data = generate_nearest_neighbor_data(time_series_data)
# Same table prefixed with case / particle_ID / time identifier columns.
nearest_neighbor_data_modified = modify_nearest_neighbor_data(nearest_neighbor_data,pd_list)
# Switch read by the `if save==True:` cells that write .npy files.
save=True

Currently on case_time subgroup :  323


In [20]:
# Names for the three identifier columns that modify_nearest_neighbor_data prepends.
new_column_names = {0: 'case', 1: 'particle_ID', 2: 'time'}

# Rebuild the identifier-prefixed table, label the id columns and group the rows
# by particle so that each group is the history of one particle of one case.
nearest_neighbor_data_modified = (
    pd.DataFrame(modify_nearest_neighbor_data(nearest_neighbor_data,pd_list))
    .rename(columns=new_column_names)
    .groupby(["case","particle_ID"])
)

# Collect groups into a list of DataFrames
grouped_dfs = [particle_history for _, particle_history in nearest_neighbor_data_modified]

In [21]:
# Display the last (case, particle_ID) group: one row per time step of that particle.
grouped_dfs[-1]

Unnamed: 0,case,particle_ID,time,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
14495,2.0,96.0,1.0,0.360318,2.13072,2.23686,74.979743,0.412010,3.168920,2.38407,...,106.186847,-0.44449,0.984937,1.319230,86.832430,2.0,0.4,100.0,74.979743,45.680066
14591,2.0,96.0,2.0,0.373662,2.13099,2.21697,75.333743,0.418750,3.166730,2.38719,...,97.942070,-0.44401,0.987527,1.326270,86.077305,2.0,0.4,100.0,75.333743,16.453703
14687,2.0,96.0,3.0,0.386057,2.13072,2.19560,74.576114,0.425720,3.162310,2.38721,...,96.010325,-0.44217,0.990373,1.334160,86.117707,2.0,0.4,100.0,74.576114,34.332095
14783,2.0,96.0,4.0,0.402369,2.12824,2.17720,69.492254,-0.283976,2.462690,1.45661,...,96.642000,-0.44110,0.994344,1.342030,85.961561,2.0,0.4,100.0,69.492254,15.028256
14879,2.0,96.0,5.0,0.421006,2.12508,2.16109,69.324984,0.436720,3.150820,2.38535,...,84.235200,-1.02766,2.074730,2.992060,96.771278,2.0,0.4,100.0,69.324984,29.294079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30623,2.0,96.0,169.0,4.388670,1.07292,1.04713,85.061506,3.550200,0.446929,1.13419,...,71.202699,4.56455,2.278590,-0.165850,103.224933,2.0,0.4,100.0,85.061506,15.587544
30719,2.0,96.0,170.0,4.390916,1.06134,1.06683,87.025160,4.721977,1.847300,1.67932,...,72.671017,4.54928,2.281980,-0.163383,103.322283,2.0,0.4,100.0,87.025160,23.345079
30815,2.0,96.0,171.0,4.391879,1.04840,1.08909,87.939140,3.526940,0.455561,1.14792,...,72.589558,4.53278,2.284030,-0.160131,105.174133,2.0,0.4,100.0,87.939140,19.300434
30911,2.0,96.0,172.0,4.391562,1.03443,1.11374,89.613672,3.513070,0.460734,1.15405,...,75.877944,4.51665,2.286310,-0.157473,103.386776,2.0,0.4,100.0,89.613672,20.887419


In [91]:
### For splitting across case ###
# One case id per (case_ID, time) group, in group order; np.unique of a group's
# case_ID column has a single entry, so the stacked result has one row per group.
case_list = np.stack ( [np.unique( pd_list[i]["case_ID"] ) for i in range(len(pd_list))] )
# index[k] is the position in case_list of the first group of the k-th distinct case id.
_,index = np.unique(case_list,return_index=True)
# Row offset in nearest_neighbor_data where the second case starts:
# (rows per group) * (number of groups before the second case).
# NOTE(review): index[1] requires at least two distinct case ids, and the product
# assumes every group has as many rows as pd_list[0] - confirm for each dataset.
change_point = (pd_list[0].shape[0])*index[1]
# Row offset of the middle of the groups whose case id equals 2, counted in whole groups.
change_point_2 = change_point + (len(case_list[case_list==2])//2)*pd_list[-1].shape[0]

In [None]:
### Splitting the data as test and train (Random) ###
# Columns 0:64 are the neighbor features (reshaped to 16 x 4 when saved);
# columns 64: are the scalar columns appended by generate_nearest_neighbor_data.
X_train, X_test, y_train, y_test = train_test_split(nearest_neighbor_data[:,0:64],nearest_neighbor_data[:,64:], test_size=0.2, random_state=1)

### Scaling the data ###
### Inputs ###
# Fit on the training rows only, then apply the same transform to the test rows.
scaler = MinMaxScaler()
scaler.fit(X_train)

train_input_scaled = scaler.transform(X_train)
test_input_scaled = scaler.transform(X_test)

### Scalar Inputs and outputs (both are under outputs and thus needs to be separated during saving) ###
# NOTE(review): `scaler` is rebound here, so the fitted input scaler is no longer
# reachable afterwards (e.g. for inverse_transform).
scaler = MinMaxScaler()
scaler.fit(y_train)

train_output_scaled = scaler.transform(y_train)
test_output_scaled = scaler.transform(y_test)

In [None]:
if save:

    ### Create directory ###
    directory = "simple_connections_data/random_split/"+experiment

    # exist_ok=True removes the check-then-create race; os.makedirs raises on
    # failure, so no separate "failed to create" branch is needed.
    directory_existed = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)

    if directory_existed:
        print(f"Directory '{directory}' already exists.")
    else:
        print(f"Directory '{directory}' created successfully.")

    # All files go below `directory` so the folder that was created and the
    # folder that is written to cannot drift apart.
    # Neighbor features: 64 flat columns -> 16 neighbors x 4 features.
    np.save(os.path.join(directory,"train_inputs"),train_input_scaled.reshape(train_input_scaled.shape[0],16,4))
    np.save(os.path.join(directory,"test_inputs"),test_input_scaled.reshape(test_input_scaled.shape[0],16,4))

    # First four scaled scalar columns are saved as scalar inputs ...
    np.save(os.path.join(directory,"train_input_scalar"),train_output_scaled[:,0:4])
    np.save(os.path.join(directory,"test_input_scalar"),test_output_scaled[:,0:4])

    # ... and the remaining column(s) as the regression target.
    np.save(os.path.join(directory,"train_output"),train_output_scaled[:,4:])
    np.save(os.path.join(directory,"test_output"),test_output_scaled[:,4:])

In [None]:
### Splitting the data as test and train (Case-wise) ###
# Rows before change_point form the training set, rows from change_point on the
# test set; change_point is the row offset computed in the case-splitting cell.
# Columns 0:64 are the neighbor features, columns 64: the scalar columns.
X_train, X_test = nearest_neighbor_data[0:change_point,0:64],nearest_neighbor_data[change_point:,0:64]
y_train, y_test = nearest_neighbor_data[0:change_point,64:],nearest_neighbor_data[change_point:,64:]

### Scaling the data ###
### Inputs ###
# Fit on the training rows only, then apply the same transform to the test rows.
scaler = MinMaxScaler()
scaler.fit(X_train)

train_input_scaled = scaler.transform(X_train)
test_input_scaled = scaler.transform(X_test)

### Scalar Inputs and outputs (both are under outputs and thus needs to be separated during saving) ###
# NOTE(review): `scaler` is rebound here, so the fitted input scaler is no longer
# reachable afterwards (e.g. for inverse_transform).
scaler = MinMaxScaler()
scaler.fit(y_train)

train_output_scaled = scaler.transform(y_train)
test_output_scaled = scaler.transform(y_test)

In [None]:
if save:

    ### Create directory ###
    directory = "simple_connections_data/case_split/"+experiment

    # exist_ok=True removes the check-then-create race; os.makedirs raises on
    # failure, so no separate "failed to create" branch is needed.
    directory_existed = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)

    if directory_existed:
        print(f"Directory '{directory}' already exists.")
    else:
        print(f"Directory '{directory}' created successfully.")

    # All files go below `directory` so the folder that was created and the
    # folder that is written to cannot drift apart.
    # Neighbor features: 64 flat columns -> 16 neighbors x 4 features.
    np.save(os.path.join(directory,"train_inputs"),train_input_scaled.reshape(train_input_scaled.shape[0],16,4))
    np.save(os.path.join(directory,"test_inputs"),test_input_scaled.reshape(test_input_scaled.shape[0],16,4))

    # First four scaled scalar columns are saved as scalar inputs ...
    np.save(os.path.join(directory,"train_input_scalar"),train_output_scaled[:,0:4])
    np.save(os.path.join(directory,"test_input_scalar"),test_output_scaled[:,0:4])

    # ... and the remaining column(s) as the regression target.
    np.save(os.path.join(directory,"train_output"),train_output_scaled[:,4:])
    np.save(os.path.join(directory,"test_output"),test_output_scaled[:,4:])

In [None]:
# ### mid-time splitting ###
# ### Splitting the data as test and train (Splitting each case into two halves) ###

# Middle of the first case, rounded down to a whole time step. Plain
# `change_point//2` lands inside a time step whenever the first case has an odd
# number of steps, which would put particles of one snapshot into both train and
# test. change_point_2 (second case) is already aligned to whole steps.
particles_per_step = pd_list[0].shape[0]
mid_point_1 = ((change_point//particles_per_step)//2)*particles_per_step

# ### Features Train and Test ###
# Train = first half of each case, test = second half of each case.
X_train_1,X_train_2 = nearest_neighbor_data[0:mid_point_1,0:64],nearest_neighbor_data[change_point:change_point_2,0:64]
X_train = np.concatenate((X_train_1,X_train_2),axis=0)

X_test_1,X_test_2 = nearest_neighbor_data[mid_point_1:change_point,0:64],nearest_neighbor_data[change_point_2:,0:64]
X_test = np.concatenate((X_test_1,X_test_2),axis=0)

### Labels Train and Test ###
y_train_1,y_train_2 = nearest_neighbor_data[0:mid_point_1,64:],nearest_neighbor_data[change_point:change_point_2,64:]
y_train = np.concatenate((y_train_1,y_train_2),axis=0)

y_test_1,y_test_2 = nearest_neighbor_data[mid_point_1:change_point,64:],nearest_neighbor_data[change_point_2:,64:]
y_test = np.concatenate((y_test_1,y_test_2),axis=0)

### Scaling the data ###
### Inputs ###
# Fit on the training rows only, then apply the same transform to the test rows.
scaler = MinMaxScaler()
scaler.fit(X_train)

train_input_scaled = scaler.transform(X_train)
test_input_scaled = scaler.transform(X_test)

### Scalar Inputs and outputs (both are under outputs and thus needs to be separated during saving) ###
scaler = MinMaxScaler()
scaler.fit(y_train)

train_output_scaled = scaler.transform(y_train)
test_output_scaled = scaler.transform(y_test)

In [None]:
if save:

    ### Create directory ###
    directory = "simple_connections_data/time_split/"+experiment

    # exist_ok=True removes the check-then-create race; os.makedirs raises on
    # failure, so no separate "failed to create" branch is needed.
    directory_existed = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)

    if directory_existed:
        print(f"Directory '{directory}' already exists.")
    else:
        print(f"Directory '{directory}' created successfully.")

    # All files go below `directory` so the folder that was created and the
    # folder that is written to cannot drift apart.
    # Neighbor features: 64 flat columns -> 16 neighbors x 4 features.
    np.save(os.path.join(directory,"train_inputs"),train_input_scaled.reshape(train_input_scaled.shape[0],16,4))
    np.save(os.path.join(directory,"test_inputs"),test_input_scaled.reshape(test_input_scaled.shape[0],16,4))

    # First four scaled scalar columns are saved as scalar inputs ...
    np.save(os.path.join(directory,"train_input_scalar"),train_output_scaled[:,0:4])
    np.save(os.path.join(directory,"test_input_scalar"),test_output_scaled[:,0:4])

    # ... and the remaining column(s) as the regression target.
    np.save(os.path.join(directory,"train_output"),train_output_scaled[:,4:])
    np.save(os.path.join(directory,"test_output"),test_output_scaled[:,4:])

# Combine all cells from above into per-experiment loops (messy; clean up later)

In [6]:
# Experiment names follow the pattern rho<A>_<B>percent_Re<C>. This cell covers
# A in (10, 100) only; the names are generated A-major, then B, then C.
all_exp_list = [
    f"rho{rho_tag}_{percent_tag}percent_Re{re_tag}"
    for rho_tag in (10, 100)
    for percent_tag in (10, 20, 30, 40)
    for re_tag in (10, 50, 100, 200, 300)
]

In [None]:
### Read data ###
# For every experiment: build the nearest-neighbor table, split it randomly
# 80/20, min-max scale with statistics of the training part and save as .npy.

for i, experiment in enumerate(all_exp_list):

    time_series_data = pd.read_csv("../ze_time_series_data_raw/"+experiment+".dat")
    pd_list = group_time_series_data(time_series_data)
    nearest_neighbor_data = generate_nearest_neighbor_data(time_series_data)
    save=True

    ### For splitting across case ###
    # Not used by the random split below; kept so the notebook-level names
    # change_point / change_point_2 stay defined exactly as before.
    case_list = np.stack ( [np.unique( group["case_ID"] ) for group in pd_list] )
    _,index = np.unique(case_list,return_index=True)
    change_point = (pd_list[0].shape[0])*index[1]
    change_point_2 = change_point + (len(case_list[case_list==2])//2)*pd_list[-1].shape[0]

    ### Splitting the data as test and train (Random) ###
    # Columns 0:64 are the neighbor features, columns 64: the scalar columns.
    X_train, X_test, y_train, y_test = train_test_split(nearest_neighbor_data[:,0:64],nearest_neighbor_data[:,64:], test_size=0.2, random_state=1)

    ### Scaling the data ###
    ### Inputs ###
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    train_input_scaled = scaler.transform(X_train)
    test_input_scaled = scaler.transform(X_test)

    ### Scalar Inputs and outputs (both are under outputs and thus needs to be separated during saving) ###
    scaler = MinMaxScaler()
    scaler.fit(y_train)

    train_output_scaled = scaler.transform(y_train)
    test_output_scaled = scaler.transform(y_test)

    if save:

        ### Create directory ###
        directory = "simple_connections_data/random_split/"+experiment

        # exist_ok=True removes the check-then-create race; os.makedirs raises
        # on failure, so no separate "failed to create" branch is needed.
        directory_existed = os.path.isdir(directory)
        os.makedirs(directory, exist_ok=True)

        if directory_existed:
            print(f"Directory '{directory}' already exists.")
        else:
            print(f"Directory '{directory}' created successfully.")

        # Neighbor features: 64 flat columns -> 16 neighbors x 4 features.
        np.save(os.path.join(directory,"train_inputs"),train_input_scaled.reshape(train_input_scaled.shape[0],16,4))
        np.save(os.path.join(directory,"test_inputs"),test_input_scaled.reshape(test_input_scaled.shape[0],16,4))

        # First four scaled scalar columns are saved as scalar inputs ...
        np.save(os.path.join(directory,"train_input_scalar"),train_output_scaled[:,0:4])
        np.save(os.path.join(directory,"test_input_scalar"),test_output_scaled[:,0:4])

        # ... and the remaining column(s) as the regression target.
        np.save(os.path.join(directory,"train_output"),train_output_scaled[:,4:])
        np.save(os.path.join(directory,"test_output"),test_output_scaled[:,4:])

Currently on case_time subgroup :  241


In [None]:
# Experiment names follow the pattern rho<A>_<B>percent_Re<C>. This cell covers
# the full grid A in (2, 10, 100); the names are generated A-major, then B, then C.
all_exp_list = [
    f"rho{rho_tag}_{percent_tag}percent_Re{re_tag}"
    for rho_tag in (2, 10, 100)
    for percent_tag in (10, 20, 30, 40)
    for re_tag in (10, 50, 100, 200, 300)
]

In [None]:
### Read data ###
# For every experiment: build the nearest-neighbor table, use the first half of
# each case for training and the second half for testing, min-max scale with
# statistics of the training part and save as .npy.

for i, experiment in enumerate(all_exp_list):

    time_series_data = pd.read_csv("../ze_time_series_data_raw/"+experiment+".dat")
    pd_list = group_time_series_data(time_series_data)
    nearest_neighbor_data = generate_nearest_neighbor_data(time_series_data)
    save=True

    ### For splitting across case ###
    # One case id per (case_ID, time) group; index[1] is the position of the
    # first group of the second case.
    case_list = np.stack ( [np.unique( group["case_ID"] ) for group in pd_list] )
    _,index = np.unique(case_list,return_index=True)
    change_point = (pd_list[0].shape[0])*index[1]
    change_point_2 = change_point + (len(case_list[case_list==2])//2)*pd_list[-1].shape[0]

    # Middle of the first case, rounded down to a whole time step. Plain
    # `change_point//2` lands inside a time step whenever the first case has an
    # odd number of steps, which would put particles of one snapshot into both
    # train and test. change_point_2 is already aligned to whole steps.
    mid_point_1 = (index[1]//2)*pd_list[0].shape[0]

    ### mid-time splitting ###
    ### Splitting the data as test and train (Splitting each case into two halves) ###

    ### Features Train and Test ###
    X_train_1,X_train_2 = nearest_neighbor_data[0:mid_point_1,0:64],nearest_neighbor_data[change_point:change_point_2,0:64]
    X_train = np.concatenate((X_train_1,X_train_2),axis=0)

    X_test_1,X_test_2 = nearest_neighbor_data[mid_point_1:change_point,0:64],nearest_neighbor_data[change_point_2:,0:64]
    X_test = np.concatenate((X_test_1,X_test_2),axis=0)

    ### Labels Train and Test ###
    y_train_1,y_train_2 = nearest_neighbor_data[0:mid_point_1,64:],nearest_neighbor_data[change_point:change_point_2,64:]
    y_train = np.concatenate((y_train_1,y_train_2),axis=0)

    y_test_1,y_test_2 = nearest_neighbor_data[mid_point_1:change_point,64:],nearest_neighbor_data[change_point_2:,64:]
    y_test = np.concatenate((y_test_1,y_test_2),axis=0)

    ### Scaling the data ###
    ### Inputs ###
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    train_input_scaled = scaler.transform(X_train)
    test_input_scaled = scaler.transform(X_test)

    ### Scalar Inputs and outputs (both are under outputs and thus needs to be separated during saving) ###
    scaler = MinMaxScaler()
    scaler.fit(y_train)

    train_output_scaled = scaler.transform(y_train)
    test_output_scaled = scaler.transform(y_test)


    if save:

        ### Create directory ###
        directory = "simple_connections_data/time_split/"+experiment

        # exist_ok=True removes the check-then-create race; os.makedirs raises
        # on failure, so no separate "failed to create" branch is needed.
        directory_existed = os.path.isdir(directory)
        os.makedirs(directory, exist_ok=True)

        if directory_existed:
            print(f"Directory '{directory}' already exists.")
        else:
            print(f"Directory '{directory}' created successfully.")

        # Neighbor features: 64 flat columns -> 16 neighbors x 4 features.
        np.save(os.path.join(directory,"train_inputs"),train_input_scaled.reshape(train_input_scaled.shape[0],16,4))
        np.save(os.path.join(directory,"test_inputs"),test_input_scaled.reshape(test_input_scaled.shape[0],16,4))

        # First four scaled scalar columns are saved as scalar inputs ...
        np.save(os.path.join(directory,"train_input_scalar"),train_output_scaled[:,0:4])
        np.save(os.path.join(directory,"test_input_scalar"),test_output_scaled[:,0:4])

        # ... and the remaining column(s) as the regression target.
        np.save(os.path.join(directory,"train_output"),train_output_scaled[:,4:])
        np.save(os.path.join(directory,"test_output"),test_output_scaled[:,4:])

# Verify the KD-tree neighbors against a brute-force search

In [None]:
def dist_euclidean(vec_1,vec_2):
    """Euclidean distance between two points, using only their first three components."""
    # Accumulate the squared component differences for x, y and z.
    squared_sum = sum( (vec_1[axis]-vec_2[axis])**2 for axis in range(3) )

    return np.sqrt(squared_sum)

def brute_search(query,tree,n_nearest=16):
    """
    Brute-force nearest-neighbor search used as a reference for the KD-tree.

    Parameters:
        query (array-like): point with at least three components (x, y, z).
        tree (numpy array): candidate points, shape (n_points, >=3); despite the
            name this is a plain array, not a tree object.
        n_nearest (int): number of neighbors to return.
    Returns:
        numpy array: row indices into `tree` of the `n_nearest` closest points,
        closest first (fewer if `tree` has fewer rows).
    """
    ### Only the first three columns (x, y, z) enter the distance ###
    points = np.asarray(tree, dtype=float)[:, :3]
    target = np.asarray(query, dtype=float)[:3]

    ### Distances to all candidates at once instead of a Python loop per point ###
    offsets = points - target
    dist = np.sqrt( (offsets**2).sum(axis=1) )

    return np.argsort(dist)[0:n_nearest]

In [None]:
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho100_10percent_Re100.dat")
# Group the file that was just loaded. The previous version grouped the global
# `time_series_data`, i.e. whatever experiment happened to be loaded last.
pd_list = group_time_series_data(raw_data)
nn_list = list()

for i in range(len(pd_list)):

    _,stacked_df = get_periodic_coordinates(pd_list[i],5)
    print("Case_time subset number : ",str(i+1))

    for j in range(len(pd_list[i])):

        ### getting nearest neighbors ###
        poi = np.array(pd_list[i].iloc[j][["x","y","z"]].values)
        idx = brute_search(poi,stacked_df[["x","y","z"]].values)
        # Rows of the 16 closest periodic images, all columns.
        nn_list.append(stacked_df.iloc[idx])

    clear_output(wait=True)

### Defining the arrays to compare ###
nn_list = np.stack(nn_list)

# KD-tree result for the same file, so both arrays describe the same particles
# in the same (group, particle) order. Kept under its own name so the global
# nearest_neighbor_data used by the splitting cells is not overwritten.
kdtree_reference = generate_nearest_neighbor_data(raw_data)
# Neighbor coordinates only: 64 flat columns -> 16 neighbors x (x, y, z, local_Re) -> drop local_Re.
previous = np.stack([ kdtree_reference[i][0:64].reshape(16,4)[:,0:3] for i in range(len(kdtree_reference))])

# Statistical analysis of the raw data

Raw_data

In [None]:
# Load one experiment and compare the local_Re distribution of the two cases.
# NOTE(review): absolute, user-specific path - will not resolve on another machine.
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho100_40percent_Re300.dat")
pd_list = group_time_series_data(raw_data)
raw_data_case_1 = raw_data[raw_data["case_ID"]==1]
raw_data_case_2 = raw_data[raw_data["case_ID"]==2]
# Both kernel density estimates are drawn on the current axes.
sns.kdeplot(raw_data_case_1["local_Re"],fill=True)
sns.kdeplot(raw_data_case_2["local_Re"],fill=True)
plt.legend(["Case 1","Case 2"])

In [None]:
# Distribution of all coordinate values (x, y and z pooled into one sample) per case.
# Relies on raw_data_case_1 / raw_data_case_2 from the previous cell.
sns.kdeplot(raw_data_case_1[["x","y","z"]].values.flatten(),fill=True)
sns.kdeplot(raw_data_case_2[["x","y","z"]].values.flatten(),fill=True)

Scaled Train/Test Data

In [None]:
# Load the saved, scaled inputs of the time split for one experiment.
# NOTE(review): absolute, user-specific paths - will not resolve on another machine.
scaled_data_train = np.load("/home/neilashwinraj/gnns/simple_connections/simple_connections_data/time_split/rho100_10percent_Re100/train_inputs.npy")
scaled_data_test = np.load("/home/neilashwinraj/gnns/simple_connections/simple_connections_data/time_split/rho100_10percent_Re100/test_inputs.npy")

# Compare train and test distributions of feature 0 over all samples and neighbors
# (presumably the scaled x coordinate, given the x, y, z, local_Re column order - confirm).
sns.kdeplot(scaled_data_train[:,:,0].flatten())
sns.kdeplot(scaled_data_test[:,:,0].flatten())
plt.legend(["Train","Test"])

In [None]:
def histogram_distances_default(distance_list):
    """Histogram the distances with np.histogram's default binning; returns (counts, bin_edges)."""
    return np.histogram( distance_list )

def histogram_distances(distance_list, max_dist, bin_size):
    """Histogram the distances on bin edges spaced `bin_size` apart, starting at 0.

    The edges are np.arange(0, max_dist+bin_size, bin_size). Returns (counts, bin_edges).
    """
    edges = np.arange(0, max_dist+bin_size, bin_size)
    return np.histogram( distance_list, bins=edges )

def plot_histogram(hist,bin_edges):
    """Plot histogram counts N(r) against the bin centers on the current axes."""
    # N bins have N+1 edges; each center is the mean of its left and right edge.
    left_edges, right_edges = bin_edges[:-1], bin_edges[1:]
    centers = (left_edges+right_edges)/2.0

    plt.plot(centers,hist,marker='o')
    plt.ylabel("N(r)")
    plt.xlabel("$r$")
    
def compute_distances_minimum_image( configuration, box_size ):
    """
    Distances between all ordered particle pairs under the minimum-image convention.

    Parameters:
        configuration (numpy array): particle positions, shape (n_particles, n_dims).
        box_size (float): edge length of the periodic box (same along every axis).
    Returns:
        numpy array of length n_particles*(n_particles-1): distance for every
        ordered pair (i, j) with i != j, ordered by i and then j. Each unordered
        pair therefore appears twice.
    """
    positions = np.asarray(configuration, dtype=float)
    num_particles = positions.shape[0]

    if num_particles == 0:
        return np.array([])

    # dr[i, j] = r_j - r_i for every ordered pair, shape (n, n, n_dims)
    dr = positions[None, :, :] - positions[:, None, :]

    # Minimum image: subtract the nearest whole number of box lengths so that
    # every component ends up in [-box_size/2, box_size/2).
    dr = dr - box_size*np.floor(dr/box_size+0.5)

    dist = np.sqrt( (dr*dr).sum(axis=-1) )

    # Drop the i == j entries; boolean indexing keeps row-major (i, then j) order.
    off_diagonal = ~np.eye(num_particles, dtype=bool)

    return dist[off_diagonal]

def plot_rdf(gofr,bin_centers):
    """Plot the radial distribution function g(r) against r on the current axes."""
    plt.plot(bin_centers,gofr,marker='o')
    plt.xlabel("$r$")
    plt.ylabel("g(r)")
    
def get_gofr(hist,bin_edges,num_particles, box_size):
    """
    Normalize a pair-distance histogram into a radial distribution function.

    Each bin count is divided by 4*pi*r^2*dr*rho*N, where r is the bin center,
    dr the width of the first bin (uniform bins are assumed), rho = N/box_size^3
    the number density and N = num_particles.

    Returns:
        tuple: (gofr, bin_centers)
    """
    number_density = num_particles/(box_size**3)
    shell_width = bin_edges[1]-bin_edges[0]
    centers = (bin_edges[1:]+bin_edges[:-1])/2.0

    # Expected count per bin for an ideal gas of the same density.
    ideal_counts = 4.*np.pi*(centers**2)*shell_width*number_density*( num_particles )

    return hist/ideal_counts, centers


In [None]:
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho100_40percent_Re300.dat")
pd_list = group_time_series_data(raw_data)

# Bin centers and g(r) of every (case_ID, time) snapshot, in pd_list order.
x_list = list()
y_list = list()

### Applying functions to calculate the RDF ###
for i in range(len(pd_list)):

    # Box size 5; the largest minimum-image distance in a cube is half its space diagonal.
    distance_list = compute_distances_minimum_image( pd_list[i][["x","y","z"]].values, 5 )
    hist, bin_edges = histogram_distances(distance_list=distance_list, max_dist=(5/2)*np.sqrt(3),bin_size=0.1)

    # get_gofr returns the bin centers as well, so they are not computed separately.
    gofr, bin_centers = get_gofr(hist,bin_edges,num_particles = len(pd_list[i]), box_size=5)
    x_list.append(bin_centers)
    y_list.append(gofr)

    plt.plot( bin_centers,gofr)

    plt.ylim([-0.05,3.25])
    plt.show()
    print("Currently at case : ",str(np.unique(pd_list[i]["case_ID"].values)[0])
          ,"and time step : ",np.unique(pd_list[i]["time"].values)[0])
    # Replace the previous snapshot's plot and message on the next iteration.
    clear_output(wait=True)
    

In [None]:
# Overlay the g(r) curves of case 1.
# NOTE(review): the largest "time" value is used as the number of snapshots, which
# assumes time runs 1..N without gaps and that case 1 comes first in x_list - confirm.
no_time_steps_case_1 = raw_data[raw_data["case_ID"]==1]["time"].values.max()
for i in range(no_time_steps_case_1):
    plt.plot(x_list[i],y_list[i],c="b",alpha=0.8)

In [None]:
# Overlay the g(r) curves of case 2; its entries in x_list / y_list are taken to
# start right after those of case 1, so this cell needs no_time_steps_case_1 from
# the previous cell.
no_time_steps_case_2 = raw_data[raw_data["case_ID"]==2]["time"].values.max()
for i in range(no_time_steps_case_1,no_time_steps_case_1+no_time_steps_case_2):
    plt.plot(x_list[i],y_list[i],c="r",alpha = 0.1)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation, FFMpegWriter

# Load one experiment and split it by case
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho10_20percent_Re100.dat")
case_1_all_timsteps = raw_data[raw_data["case_ID"]==1]
case_2_all_timsteps = raw_data[raw_data["case_ID"]==2]

### for case 1
# num_particles = len(case_1_all_timsteps[case_1_all_timsteps["time"]==1])
# time_steps = case_1_all_timsteps["time"].values[-1]
# df = case_1_all_timsteps.copy()

### for case 2
num_particles = len(case_2_all_timsteps[case_2_all_timsteps["time"]==1])
time_steps = case_2_all_timsteps["time"].values[-1]
df = case_2_all_timsteps.copy()

# Animate exactly the time stamps that occur in the data. range(time_steps)
# started at 0 and stopped at time_steps-1, so with 1-based time stamps the
# first frame was empty and the last time step was never drawn.
frame_times = sorted(df["time"].unique())

# Initialize the figure and 3D axis
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Initialize a scatter plot with dummy data
scat = ax.scatter([], [], [], c='r')

# Set axis limits
ax.set_xlim([0, 5])
ax.set_ylim([0, 5])
ax.set_zlim([0, 5])

# Function to update the scatter plot for each frame
def update(frame):
    """Show the particle positions of time stamp `frame`."""
    # Filter the DataFrame for the current time step
    current_data = df[df['time'] == frame]
    # Update the scatter plot data (3D scatter offsets are set through this private attribute)
    scat._offsets3d = (current_data['x'], current_data['y'], current_data['z'])
    ax.set_title(f'Time step: {frame}')
    return scat,

# Create the animation
ani = FuncAnimation(fig, update, frames=frame_times, interval=200, blit=False)

# Save the animation as an MP4 file
writer = FFMpegWriter(fps=5, metadata=dict(artist='Me'), bitrate=1800)
ani.save('particle_movement.mp4', writer=writer)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

# Load one experiment and split it by case.
# NOTE(review): absolute, user-specific path - will not resolve on another machine.
raw_data = pd.read_csv("/home/neilashwinraj/gnns/ze_time_series_data_raw/rho10_20percent_Re100.dat")
case_1_all_timsteps = raw_data[raw_data["case_ID"]==1]
case_2_all_timsteps = raw_data[raw_data["case_ID"]==2]

# Frames are rendered for case 1; the copy keeps raw_data untouched.
df = case_1_all_timsteps.copy()

# Function to create frames
def create_frames(df, output_dir='frames', resolution=(1920, 1080)):
    """
    Render one 3D scatter image per time stamp and write it to `output_dir`.

    Parameters:
        df (pandas dataframe): particle table with the columns time, x, y, z.
        output_dir (str): directory for the PNG files; created if missing.
        resolution (tuple): (width, height); divided by 100 to obtain the figure
            size in inches.

    Files are named frame_<time>.png with the time stamp zero-padded to four
    digits. The time stamp is cast to int for the file name, so this expects
    integer-valued time stamps (a float column such as 1.0, 2.0, ... works).
    """
    os.makedirs(output_dir, exist_ok=True)

    # The axis limits span the whole data set and do not change between frames.
    x_limits = (df['x'].min(), df['x'].max())
    y_limits = (df['y'].min(), df['y'].max())
    z_limits = (df['z'].min(), df['z'].max())

    for time_step in df['time'].unique():
        fig = plt.figure(figsize=(resolution[0]/100, resolution[1]/100))
        ax = fig.add_subplot(111, projection='3d')
        ax.set_xlim(*x_limits)
        ax.set_ylim(*y_limits)
        ax.set_zlim(*z_limits)
        ax.set_title(f'Time: {time_step}')

        subset = df[df['time'] == time_step]
        sc = ax.scatter(subset['x'], subset['y'], subset['z'], c=subset['z'], cmap='viridis', marker='o',s=50)
        fig.colorbar(sc, ax=ax, label='z')

        # int(...) because the 'd' format code rejects float time stamps.
        fig.savefig(f"{output_dir}/frame_{int(time_step):04d}.png")
        # Close this figure explicitly so figures do not pile up in memory.
        plt.close(fig)

# Function to create video
def create_video(output_file='output.mp4', frame_rate=10, resolution=(1920, 1080), frames_dir='frames'):
    """
    Assemble the PNG frames of `frames_dir` into an MP4 file.

    Parameters:
        output_file (str): path of the video to write.
        frame_rate (int): frames per second.
        resolution (tuple): currently unused; the video size is taken from the
            first frame. Kept for backward compatibility.
        frames_dir (str): directory containing the frames; defaults to 'frames',
            the directory that was previously hard-coded.
    Raises:
        FileNotFoundError: if `frames_dir` contains no .png file.

    Frames are added in lexicographic file-name order.
    """
    images = sorted(img for img in os.listdir(frames_dir) if img.endswith(".png"))

    if not images:
        raise FileNotFoundError(f"No .png frames found in '{frames_dir}'.")

    # The first frame determines the size of the video.
    frame = cv2.imread(os.path.join(frames_dir, images[0]))
    height, width, _ = frame.shape

    video = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (width, height))

    try:
        for image in images:
            video.write(cv2.imread(os.path.join(frames_dir, image)))
    finally:
        # Always finalize the file. No GUI windows are opened here, so the former
        # cv2.destroyAllWindows() call (which ran before release) is dropped.
        video.release()

# Generate frames: one PNG per time stamp of df, written to the default 'frames' directory
create_frames(df)

# Create video: reads the PNGs back from 'frames' and writes the default 'output.mp4'
create_video()
