In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Date and time handling
from datetime import datetime

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error

# Ignore warnings
# NOTE(review): called with only the "ignore" action and no category/module
# arguments, this filter matches every warning raised by later cells, so
# deprecation or data-quality warnings from the libraries imported above will
# not be shown. Consider narrowing it to specific categories once the noisy
# warnings are identified.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the reference dataset from the notebook's working directory into `df`.
df = pd.read_csv(filepath_or_buffer='REF.csv')
# Bare last expression: let the notebook render its (truncated) preview of the frame.
df

Unnamed: 0.1,Unnamed: 0,REF_U_1,REF_U_2,REF_U_3,REF_tau_11,REF_tau_12,REF_tau_13,REF_tau_22,REF_tau_23,REF_tau_33,...,REF_tauplus_22,REF_tauplus_23,REF_tauplus_31,REF_tauplus_32,REF_tauplus_33,REF_Uplus_1,REF_Uplus_2,REF_Uplus_3,REF_yplus,Case
0,0,0.183148,5.524527e-08,0.000000e+00,8.607882e-03,-7.599257e-06,0.000000e+00,1.508704e-07,0.000000e+00,3.648287e-03,...,1.648931e-08,0.0,0.0,0.0,0.000399,0.060548,1.826393e-08,0.0,0.060548,fp_1000
1,1,0.554677,5.522239e-07,0.000000e+00,5.727882e-02,-9.240743e-05,0.000000e+00,3.399372e-06,0.000000e+00,2.366991e-02,...,3.715329e-07,0.0,0.0,0.0,0.002587,0.183375,1.825637e-07,0.0,0.183376,fp_1000
2,2,0.936817,1.724616e-06,0.000000e+00,1.533795e-01,-3.501041e-04,0.000000e+00,1.554829e-05,0.000000e+00,6.197673e-02,...,1.699343e-06,0.0,0.0,0.0,0.006774,0.309709,5.701532e-07,0.0,0.309713,fp_1000
3,3,1.329845,3.472650e-06,0.000000e+00,3.129591e-01,-1.097912e-03,0.000000e+00,7.115068e-05,0.000000e+00,1.216620e-01,...,7.776382e-06,0.0,0.0,0.0,0.013297,0.439643,1.148049e-06,0.0,0.439660,fp_1000
4,4,1.734088,5.539989e-06,0.000000e+00,5.072728e-01,-2.106934e-03,0.000000e+00,1.497585e-04,0.000000e+00,1.931306e-01,...,1.636779e-05,0.0,0.0,0.0,0.021108,0.573285,1.831505e-06,0.0,0.573321,fp_1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902807,9211,0.005959,-8.866439e-06,1.956330e-06,3.380772e-06,2.847676e-08,2.475930e-09,1.864412e-07,-1.829451e-08,3.345494e-09,...,,,,,,,,,,squareDuctAve_Re_3500
902808,9212,0.004547,-2.971654e-06,9.090173e-07,2.616207e-06,1.654386e-08,1.419062e-09,6.365150e-08,-7.552760e-09,2.102089e-09,...,,,,,,,,,,squareDuctAve_Re_3500
902809,9213,0.003354,-6.777688e-07,3.514683e-07,2.089987e-06,9.503438e-09,9.837950e-10,1.963132e-08,-2.862823e-09,1.315723e-09,...,,,,,,,,,,squareDuctAve_Re_3500
902810,9214,0.002072,-4.948563e-08,5.171945e-08,1.351600e-06,3.031013e-09,6.699116e-10,3.164032e-09,-4.900372e-10,5.482198e-10,...,,,,,,,,,,squareDuctAve_Re_3500


In [4]:
# Summarise column names, dtypes and non-null counts of the raw frame.
# `DataFrame.info()` writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line after
# the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 902812 entries, 0 to 902811
Data columns (total 49 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      902812 non-null  int64  
 1   REF_U_1         902812 non-null  float64
 2   REF_U_2         902812 non-null  float64
 3   REF_U_3         902812 non-null  float64
 4   REF_tau_11      902812 non-null  float64
 5   REF_tau_12      902812 non-null  float64
 6   REF_tau_13      902812 non-null  float64
 7   REF_tau_22      902812 non-null  float64
 8   REF_tau_23      902812 non-null  float64
 9   REF_tau_33      902812 non-null  float64
 10  REF_k           902812 non-null  float64
 11  REF_a_11        902812 non-null  float64
 12  REF_a_12        902812 non-null  float64
 13  REF_a_13        902812 non-null  float64
 14  REF_a_22        902812 non-null  float64
 15  REF_a_23        902812 non-null  float64
 16  REF_a_33        902812 non-null  float64
 17  REF_b_11  

In [7]:
# Build `df_cleaned` from the rows of `df` that are fully populated: a row is
# discarded as soon as any single column in it is NaN. `df` itself is left
# untouched, so this cell can be re-run safely.
# NOTE(review): in the recorded run this kept only the rows up to index 1556
# (out of 902,812), all belonging to the `fp_*` cases, because the other cases
# have NaN in several columns. Confirm that restricting the data to those
# cases is intended, rather than dropping the sparsely populated columns.
df_cleaned = df.dropna(axis="index", how="any")
df_cleaned

Unnamed: 0.1,Unnamed: 0,REF_U_1,REF_U_2,REF_U_3,REF_tau_11,REF_tau_12,REF_tau_13,REF_tau_22,REF_tau_23,REF_tau_33,...,REF_tauplus_22,REF_tauplus_23,REF_tauplus_31,REF_tauplus_32,REF_tauplus_33,REF_Uplus_1,REF_Uplus_2,REF_Uplus_3,REF_yplus,Case
0,0,0.183148,5.524527e-08,0.0,0.008608,-0.000008,0.0,1.508704e-07,0.0,0.003648,...,1.648931e-08,0.0,0.0,0.0,0.000399,0.060548,1.826393e-08,0.0,0.060548,fp_1000
1,1,0.554677,5.522239e-07,0.0,0.057279,-0.000092,0.0,3.399372e-06,0.0,0.023670,...,3.715329e-07,0.0,0.0,0.0,0.002587,0.183375,1.825637e-07,0.0,0.183376,fp_1000
2,2,0.936817,1.724616e-06,0.0,0.153380,-0.000350,0.0,1.554829e-05,0.0,0.061977,...,1.699343e-06,0.0,0.0,0.0,0.006774,0.309709,5.701532e-07,0.0,0.309713,fp_1000
3,3,1.329845,3.472650e-06,0.0,0.312959,-0.001098,0.0,7.115068e-05,0.0,0.121662,...,7.776382e-06,0.0,0.0,0.0,0.013297,0.439643,1.148049e-06,0.0,0.439660,fp_1000
4,4,1.734088,5.539989e-06,0.0,0.507273,-0.002107,0.0,1.497585e-04,0.0,0.193131,...,1.636779e-05,0.0,0.0,0.0,0.021108,0.573285,1.831505e-06,0.0,0.573321,fp_1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,185,60.170067,5.664020e-02,0.0,14.968957,-4.315332,0.0,6.584633e+00,0.0,8.829160,...,9.371833e-01,0.0,0.0,0.0,1.256644,22.700058,2.136837e-02,0.0,687.725600,fp_4060
1554,186,60.553772,5.919544e-02,0.0,14.507311,-4.182678,0.0,6.389991e+00,0.0,8.544334,...,9.094802e-01,0.0,0.0,0.0,1.216105,22.844818,2.233237e-02,0.0,707.485000,fp_4060
1555,187,60.942444,6.188760e-02,0.0,14.014689,-4.042442,0.0,6.189920e+00,0.0,8.252177,...,8.810044e-01,0.0,0.0,0.0,1.174523,22.991450,2.334802e-02,0.0,727.809000,fp_4060
1556,188,61.335743,6.471685e-02,0.0,13.497219,-3.894517,0.0,5.986698e+00,0.0,7.937406,...,8.520800e-01,0.0,0.0,0.0,1.129722,23.139828,2.441540e-02,0.0,748.713900,fp_4060


In [9]:
# List the distinct values of the `Case` column that remain in the cleaned
# frame, in order of first appearance.
df_cleaned.loc[:, 'Case'].unique()

array(['fp_1000', 'fp_1410', 'fp_2000', 'fp_2540', 'fp_3030', 'fp_3270',
       'fp_3630', 'fp_3970', 'fp_4060'], dtype=object)

In [11]:
# Optional: persist the cleaned frame next to the notebook. The DataFrame's
# row index is not written (index=False); only the data columns are saved.
df_cleaned.to_csv(path_or_buf="cleaned_REF.csv", index=False)