In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.neighbors import KDTree
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import copy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
import os

Matplotlib created a temporary cache directory at /localscratch-ssd/288604/matplotlib-ju_vrbj2 because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Generate datasets from Ze's time-series data

In [2]:
import itertools

def get_periodic_coordinates(coord, size):
    """
    Replicate a set of particles into the 27 periodic images of a cubic domain.

    Every point is shifted by each combination of (-1, 0, 1) * size along x, y and z.
    The all-zero shift (the original points) is one of the 27 images.

    Parameters:
        coord (pandas dataframe): Must contain the columns "x", "y" and "z". Any other
            columns are carried along unchanged into the returned dataframe.
        size (int or float): Edge length of the cubic domain along each axis.
    Returns:
        tuple: (tp_coordinates, stacked_df)
            tp_coordinates (numpy array): shape (27 * len(coord), 3). Rows are ordered
                image by image, following itertools.product([-1, 0, 1], repeat=3), so the
                unshifted points occupy rows 13*len(coord) .. 14*len(coord)-1.
            stacked_df (pandas dataframe): 27 stacked copies of coord with a fresh
                0..N-1 index, whose x, y, z columns hold tp_coordinates row for row.
    """
    # One (dx, dy, dz) row per periodic image, in itertools.product order
    displacements = np.array(list(itertools.product([-1, 0, 1], repeat=3)))

    ### Keep one copy of the dataframe per periodic image ###
    stacked_df = pd.concat([coord.copy() for _ in range(len(displacements))], axis=0)
    stacked_df = stacked_df.reset_index(drop=True)

    ### Get coordinates ###
    if isinstance(coord, pd.DataFrame):
        coord = coord[["x","y","z"]].values
    xyz = np.asarray(coord)

    # Broadcast the (27, 1, 3) shifts against the (1, n, 3) points instead of looping
    # over every point in Python; flattening keeps the image-major row order.
    shifted = xyz[None, :, :] + displacements[:, None, :] * size
    tp_coordinates = shifted.reshape(-1, 3)

    stacked_df[["x","y","z"]] = tp_coordinates

    return tp_coordinates, stacked_df

In [3]:
def group_time_series_data(time_series_data):
    """
    Split the raw time-series table into one dataframe per (case_ID, time) pair.

    Parameters:
       time_series_data (pandas dataframe) : obtained from Ze's final data directory;
           must contain the columns case_ID and time
    Returns:
        list: A list of pandas dataframes, one per unique (case_ID, time) combination,
              in the key order produced by groupby
    """
    per_snapshot = time_series_data.groupby(['case_ID', 'time'])

    # The group keys themselves are not needed, only the sub-frames
    return [snapshot for _, snapshot in per_snapshot]

In [4]:
def generate_nearest_neighbor_data(time_series_data, domain_size=5, n_neighbors=16, include_local_re=True):

    """
    Build one flat feature row per particle and per (case_ID, time) snapshot.

    For every snapshot the particles are replicated into their periodic images, a KD-tree
    is built on all images, and the n_neighbors nearest images of each particle are looked
    up (cKDTree.query returns them sorted by distance, so the particle itself comes first).

    Parameters:
       time_series_data (pandas dataframe) : obtained from Ze's final data directory. Must
           contain case_ID, time, x, y, z, Density_ratio, glb_phi, glb_Re, local_Re and Drag.
       domain_size (int or float) : edge length of the periodic cubic domain (default 5).
       n_neighbors (int) : number of nearest neighbors kept per particle (default 16).
       include_local_re (bool) : if True (default) each neighbor contributes
           (x, y, z, local_Re); if False only (x, y, z).
    Returns:
        numpy array: shape (n_snapshots * n_particles, n_neighbors * F + 5) with F = 4 or 3.
            The last 5 columns are Density_ratio, glb_phi, glb_Re, local_Re and Drag of the
            particle itself. Rows are ordered snapshot by snapshot.
    Raises:
        ValueError: from np.stack if the snapshots do not all hold the same number of particles.
    """

    pd_list = group_time_series_data(time_series_data)

    neighbor_features = list()
    scalar_data = list()

    ### Loop over different groups ###

    for i, group in enumerate(pd_list):

        print("Currently on case_time subgroup : ",str(i+1))
        tp_particles,stacked_df = get_periodic_coordinates(group,domain_size)
        tree = cKDTree(tp_particles)

        ### One batched query for all particles of the group (instead of one query per row) ###
        query_points = group[["x","y","z"]].values
        idx = tree.query(query_points,n_neighbors)[1]
        # query() drops the neighbor axis when k == 1; restore it so the reshapes below hold
        idx = np.asarray(idx).reshape(len(query_points),n_neighbors)
        neighbor_coords = tp_particles[idx]

        if include_local_re:
            ### merging nodal data to the coordinates ###
            neighbor_features.append(merge_columns_to_pandas_list(neighbor_coords,"local_Re",stacked_df))
        else:
            neighbor_features.append(neighbor_coords)

        ### Getting the scalar data ###
        scalar_data.append( group[["Density_ratio","glb_phi","glb_Re","local_Re","Drag"]] )
        clear_output(wait=True)

    ### Populate graph and scalar lists ###
    # (snapshots, particles, neighbors, features) -> (snapshots*particles, neighbors*features)
    neighbor_features = np.stack(neighbor_features)
    neighbor_features = neighbor_features.reshape(neighbor_features.shape[0]*neighbor_features.shape[1]
                                           ,neighbor_features.shape[2]*neighbor_features.shape[3])

    # (snapshots, particles, 5) -> (snapshots*particles, 5)
    scalar_data = np.stack(scalar_data)
    scalar_data = scalar_data.reshape(scalar_data.shape[0]*scalar_data.shape[1],scalar_data.shape[2])

    return np.concatenate( (neighbor_features,scalar_data) ,axis=1)

In [5]:
def merge_columns_to_pandas_list(nearest_neighbor_data,variable_list,master_dataframe):

    """ Attach columns of the master dataframe to each block of neighbor coordinates.

        nearest_neighbor_data is iterated block by block; each block is turned into an
        (x, y, z) dataframe, inner-joined with master_dataframe on exactly matching x, y, z,
        and the selected variable_list column(s) are appended next to the coordinates.
        Returns a list with one dataframe per block.
    """

    coordinate_names = ["x","y","z"]

    # Build every coordinate frame first, then enrich them one by one
    coordinate_frames = [pd.DataFrame(block,columns=coordinate_names) for block in nearest_neighbor_data]

    merged_frames = list()

    for coordinate_frame in coordinate_frames:

        snapshot = copy.deepcopy(coordinate_frame)
        looked_up = pd.merge(snapshot,master_dataframe,how="inner",on=['x','y','z'],sort=False)
        merged_frames.append(pd.concat([snapshot,looked_up[variable_list]], axis=1))

    return merged_frames

In [6]:
def modify_nearest_neighbor_data(nearest_neighbor_data,pd_list):

    """
    Prefix every nearest-neighbor row with the case, particle ID and time it belongs to.

    Parameters:
        nearest_neighbor_data (numpy array) : 2D array with one row per particle and snapshot,
            ordered like the rows of the dataframes in pd_list taken one after another.
        pd_list (list) : list of pandas dataframes (one per snapshot), each with the
            columns case_ID and time.
    Returns:
        numpy array: columns are [case_ID, particle_ID, time, *nearest_neighbor_data columns].
            particle_ID is the 1-based position of the row inside its snapshot dataframe
            (NOTE(review): this identifies a physical particle across snapshots only if every
            snapshot lists the particles in the same order - confirm against the raw data).
    """
    # np.concatenate (rather than np.stack + flatten) also accepts snapshots of unequal size
    case_column = np.concatenate( [ group["case_ID"].values for group in pd_list ] )
    particle_id_column = np.concatenate( [ np.arange(group.shape[0])+1 for group in pd_list ] )
    time_column = np.concatenate( [ group["time"].values for group in pd_list ] )

    ### Combining columns with nearest_neighbor_data ###
    nearest_neighbor_data_modified = np.concatenate( (case_column[:,None],particle_id_column[:,None],time_column[:,None]
                ,nearest_neighbor_data),axis=1 )

    return nearest_neighbor_data_modified

In [66]:
   def generate_temporally_history_datasets_for_single_group(single_df,history_length,sampling_rate):
        
        """
        performs the data generation for a single group

        For every row position k >= history_length*sampling_rate, collects the rows
        k, k - sampling_rate, ..., k - history_length*sampling_rate (newest first,
        history_length + 1 rows in total) into one small dataframe.

        Returns a list of those dataframes; the list is empty when single_df has no
        more than history_length*sampling_rate rows.
        """
        start_index = history_length*sampling_rate
        
        extracted_sequences = list()
        
        # Single pass over the valid end positions (the window list is built exactly once)
        for k in range(start_index, len(single_df)):
            
            series_list = [single_df.iloc[k - j * sampling_rate] for j in range(history_length + 1)]
            
            # Each row comes back as a Series; stack them as columns and flip back to rows
            extracted_sequences.append(pd.concat(series_list, axis=1).T)
        
        return extracted_sequences

In [141]:
def generate_temporally_history_datasets(grouped_dfs,history_length=3,sampling_rate=2):
    
    """ 
    Turn per-particle trajectories into fixed-length history windows.

    grouped_dfs is a list of pandas dataframes where each element is the temporal
    trajectory of one particle (one row per time step, in time order). For every
    trajectory and every row position k >= history_length*sampling_rate, one sample is
    built from the rows k, k - sampling_rate, ..., k - history_length*sampling_rate, i.e.
    history_length + 1 rows ordered newest first. With the defaults (history_length=3,
    sampling_rate=2) the first sample holds the 0-based rows 6, 4, 2, 0, the second
    7, 5, 3, 1 and so on.

    Returns a list with one entry per trajectory; each entry is the list of sample
    dataframes of that trajectory (empty if the trajectory is too short).
    """
    
    def generate_temporally_history_datasets_for_single_group(single_df,history_length,sampling_rate):
        
        """
        performs the data generation for a single group
        """
        start_index = history_length*sampling_rate
        
        extracted_sequences = list()
        
        # Single pass over the valid end positions (the window list is built exactly once)
        for k in range(start_index, len(single_df)):
            
            series_list = [single_df.iloc[k - j * sampling_rate] for j in range(history_length + 1)]
            
            # Each row comes back as a Series; stack them as columns and flip back to rows
            extracted_sequences.append(pd.concat(series_list, axis=1).T)
        
        return extracted_sequences
    
    extracted_sequences = list()
    
    for i, single_df in enumerate(grouped_dfs):
        
        print("Currently on particle number : ",str(i+1))
        
        extracted_sequences.append( generate_temporally_history_datasets_for_single_group( single_df , history_length 
                                                                                , sampling_rate) )
        clear_output(wait=True)
    
    return extracted_sequences  

In [140]:
def concat_over_all_levels(data):
    """Collapse a list of lists of DataFrames into one DataFrame with a fresh 0..N-1 index."""

    # First level: walk the outer list and gather every inner DataFrame in order
    gathered_frames = list()
    for inner_frames in data:
        gathered_frames.extend(inner_frames)

    # Second level: stack all gathered frames row-wise, discarding their original indices
    return pd.concat(gathered_frames, ignore_index=True)

In [8]:
### Read data ###
# `experiment` names both the raw input file and the output sub-directory used by the save cells
experiment = "rho2_40percent_Re100"
time_series_data = pd.read_csv("../ze_time_series_data_raw/"+experiment+".dat")

# pd_list: one dataframe per (case_ID, time) snapshot. generate_nearest_neighbor_data
# regroups the same table internally, so both follow the same snapshot order.
pd_list = group_time_series_data(time_series_data)
nearest_neighbor_data = generate_nearest_neighbor_data(time_series_data)

### add particle id, case and time column to the dataset ###
nearest_neighbor_data_modified = modify_nearest_neighbor_data(nearest_neighbor_data,pd_list)
# Global switch read by the `if save==True:` cells further down
save=True

Currently on case_time subgroup :  323


In [11]:
### splitting the data such that each grouped df is the trajectory of a single particle within a single case ###
# Only the first seven columns get names (ids plus the first neighbor's x, y, z, local Re);
# the remaining neighbor and scalar columns keep their integer labels.
new_column_names = {0: 'case', 1: 'particle_ID', 2: 'time',3:"x",4:"y",5:"z",6:"local Re"}

# Work on a separately named frame instead of rebinding nearest_neighbor_data_modified
# (array -> DataFrame -> GroupBy), so this cell can be re-run without re-running the one above.
trajectory_frame = pd.DataFrame(nearest_neighbor_data_modified).rename(columns=new_column_names)

### each element of grouped_dfs is a particle and its trajectory ###
# groupby keeps the original row order inside each group, i.e. the snapshot order of the rows
grouped_dfs = [group for _, group in trajectory_frame.groupby(["case","particle_ID"])]

### IMPORTANT : generating the sequential data ###
extracted_sequences = generate_temporally_history_datasets(grouped_dfs)

In [144]:
### For splitting data as train and test ###

# Number of held-out trajectories and the seed that makes the split reproducible
N_TEST_PARTICLES = 35
SPLIT_SEED = 0

### define train/test indices ###
# Sample WITHOUT replacement over the full index range: the previous randint call could
# pick the same trajectory several times and could never pick the last one.
rng = np.random.default_rng(SPLIT_SEED)
test_particles_id = rng.choice(len(extracted_sequences), size=N_TEST_PARTICLES, replace=False)
train_particles_id = np.setdiff1d(np.arange(len(extracted_sequences)), test_particles_id)

### extracting the train and test datasets ###
train_data = concat_over_all_levels([extracted_sequences[i] for i in train_particles_id])
test_data = concat_over_all_levels([extracted_sequences[i] for i in test_particles_id])

In [149]:
### Scaling the data ###
### Inputs ###
# Per-column min/max are learned from the training rows only and then applied to both
# splits, so test values may fall outside [0, 1].
scaler = MinMaxScaler().fit(train_data.values)

train_input_scaled, test_input_scaled = [scaler.transform(split.values) for split in (train_data, test_data)]

In [153]:
# Sanity check: (rows, columns) of the scaled training matrix
train_input_scaled.shape

(99704, 72)

In [154]:
# Calculate the number of parts needed so that each part holds (at most) 4 consecutive rows
# NOTE(review): 4 presumably equals history_length + 1 (rows per history window) - confirm
num_parts = int(np.ceil(len(train_input_scaled) / 4))
# Split the array into num_parts consecutive row blocks
parts = np.array_split(train_input_scaled, num_parts)

# Number of row blocks obtained
len(parts)

24926

In [156]:
# Shape of the first row block returned by np.array_split
parts[0].shape

(4, 72)

In [None]:
if save==True:
    
    ### Create Directory ###
    directory = "simple_connections_data/random_split/"+experiment
    
    # exist_ok=True makes creation race-free; makedirs raises if the directory cannot be created
    directory_existed = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)
    
    if directory_existed:
        print(f"Directory '{directory}' already exists.")
    else:
        print(f"Directory '{directory}' created successfully.")

    ### Column layout of the scaled rows of the random split ###
    # [case, particle_ID, time | 16 neighbors x (x, y, z, local_Re) | 4 scalar inputs | Drag]
    n_id_columns = 3
    n_neighbor_columns = 16*4
    n_scalar_inputs = 4
    n_outputs = 1
    scalar_start = n_id_columns + n_neighbor_columns
    output_start = scalar_start + n_scalar_inputs
    expected_width = output_start + n_outputs

    for split_name, scaled in (("train", train_input_scaled), ("test", test_input_scaled)):

        # Guards against running this cell on arrays left over from another split cell
        if scaled.shape[1] != expected_width:
            raise ValueError(f"{split_name} matrix has {scaled.shape[1]} columns, expected {expected_width}")

        # Neighbor block -> (rows, 16 neighbors, 4 features); the id columns are not saved
        np.save(directory+"/"+split_name+"_inputs",scaled[:,n_id_columns:scalar_start].reshape(scaled.shape[0],16,4))
        np.save(directory+"/"+split_name+"_input_scalar",scaled[:,scalar_start:output_start])
        np.save(directory+"/"+split_name+"_output",scaled[:,output_start:])

In [None]:
### Splitting the data as test and train (Case-wise) ###
# NOTE(review): `change_point` is not defined in any visible cell - it must be set (presumably
# to the first row index of the held-out case) before this cell can run; confirm.
# Columns 0:64 are the 16 neighbors x (x, y, z, local_Re); columns 64: are the 5 scalars
# Density_ratio, glb_phi, glb_Re, local_Re, Drag appended by generate_nearest_neighbor_data.
X_train, X_test = nearest_neighbor_data[0:change_point,0:64],nearest_neighbor_data[change_point:,0:64]
y_train, y_test = nearest_neighbor_data[0:change_point,64:],nearest_neighbor_data[change_point:,64:]

### Scaling the data ###
### Inputs ###
# Fitted on the training rows only. Rebinds train_input_scaled / test_input_scaled, replacing
# any values an earlier cell stored under those names.
scaler = MinMaxScaler()
scaler.fit(X_train)

train_input_scaled = scaler.transform(X_train)
test_input_scaled = scaler.transform(X_test)

### Scalar Inputs and outputs (both are under outputs and thus needs to be separated during saving) ###
# `scaler` is rebound here, so the scaler fitted on the inputs above is no longer reachable
scaler = MinMaxScaler()
scaler.fit(y_train)

train_output_scaled = scaler.transform(y_train)
test_output_scaled = scaler.transform(y_test)

In [None]:
if save==True:
    
    ### Create Directory ###
    directory = "simple_connections_data/case_split/"+experiment
    
    # exist_ok=True makes creation race-free; makedirs raises if the directory cannot be created
    directory_existed = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)
    
    if directory_existed:
        print(f"Directory '{directory}' already exists.")
    else:
        print(f"Directory '{directory}' created successfully.")

    for split_name, inputs_scaled, outputs_scaled in (("train", train_input_scaled, train_output_scaled),
                                                      ("test", test_input_scaled, test_output_scaled)):

        # Neighbor features -> (rows, 16 neighbors, 4 features)
        np.save(directory+"/"+split_name+"_inputs",inputs_scaled.reshape(inputs_scaled.shape[0],16,4))
        # First 4 scaled scalar columns are model inputs, the remaining column(s) the target
        np.save(directory+"/"+split_name+"_input_scalar",outputs_scaled[:,0:4])
        np.save(directory+"/"+split_name+"_output",outputs_scaled[:,4:])

In [None]:
### mid-time splitting ###
### Splitting the data as test and train (Splitting each case into two halves) ###
# NOTE(review): `change_point` and `change_point_2` are not defined in any visible cell;
# presumably they are the row indices where one case ends and the next begins - confirm.
# Train rows: [0, change_point//2) and [change_point, change_point_2)
# Test rows : [change_point//2, change_point) and [change_point_2, end)

### Features Train and Test ###
# Columns 0:64 are the 16 neighbors x (x, y, z, local_Re)
X_train_1,X_train_2 = nearest_neighbor_data[0:change_point//2,0:64],nearest_neighbor_data[change_point:change_point_2,0:64]
X_train = np.concatenate((X_train_1,X_train_2),axis=0)

X_test_1,X_test_2 = nearest_neighbor_data[change_point//2:change_point,0:64],nearest_neighbor_data[change_point_2:,0:64]
X_test = np.concatenate((X_test_1,X_test_2),axis=0)

### Labels Train and Test ###
# Columns 64: are the 5 scalars Density_ratio, glb_phi, glb_Re, local_Re, Drag
y_train_1,y_train_2 = nearest_neighbor_data[0:change_point//2,64:],nearest_neighbor_data[change_point:change_point_2,64:]
y_train = np.concatenate((y_train_1,y_train_2),axis=0)

y_test_1,y_test_2 = nearest_neighbor_data[change_point//2:change_point,64:],nearest_neighbor_data[change_point_2:,64:]
y_test = np.concatenate((y_test_1,y_test_2),axis=0)

### Scaling the data ###
### Inputs ###
# Fitted on the training rows only. Rebinds train_input_scaled / test_input_scaled, replacing
# any values an earlier cell stored under those names.
scaler = MinMaxScaler()
scaler.fit(X_train)

train_input_scaled = scaler.transform(X_train)
test_input_scaled = scaler.transform(X_test)

### Scalar Inputs and outputs (both are under outputs and thus needs to be separated during saving) ###
# `scaler` is rebound here, so the scaler fitted on the inputs above is no longer reachable
scaler = MinMaxScaler()
scaler.fit(y_train)

train_output_scaled = scaler.transform(y_train)
test_output_scaled = scaler.transform(y_test)

In [None]:
if save==True:
    
    ### Create Directory ###
    directory = "simple_connections_data/time_split/"+experiment
    
    # exist_ok=True makes creation race-free; makedirs raises if the directory cannot be created
    directory_existed = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)
    
    if directory_existed:
        print(f"Directory '{directory}' already exists.")
    else:
        print(f"Directory '{directory}' created successfully.")

    for split_name, inputs_scaled, outputs_scaled in (("train", train_input_scaled, train_output_scaled),
                                                      ("test", test_input_scaled, test_output_scaled)):

        # Neighbor features -> (rows, 16 neighbors, 4 features)
        np.save(directory+"/"+split_name+"_inputs",inputs_scaled.reshape(inputs_scaled.shape[0],16,4))
        # First 4 scaled scalar columns are model inputs, the remaining column(s) the target
        np.save(directory+"/"+split_name+"_input_scalar",outputs_scaled[:,0:4])
        np.save(directory+"/"+split_name+"_output",outputs_scaled[:,4:])