In [1]:
import pandas as pd
import glob

### Merging P0_ files into two dataframes per Press

In [2]:
def get_df_P0(num):
    # Directory where the CSV files are located
    directory = f"clean_csv/P0{num}"

    # Pattern to match CSV files
    file_pattern = "*.csv"

    # Get the file names matching the pattern for P03
    file_names = glob.glob(directory + "/" + file_pattern)


    value_df = pd.DataFrame()
    other_df = pd.DataFrame()

    for indx, file_name in enumerate(file_names):
        # Read the CSV file
        df = pd.read_csv(file_name)
        df = df.drop(columns="Unnamed: 0")

        #keep only document name
        file_name=file_name.split("\\")[1]
        print(file_name)

        if "value" in file_name:
            if indx == 0:
                value_df = df
                continue

            if "TraceabilityCode" in df.columns:                #check for traceability code in the columns
                merge_columns = ["TraceabilityCode", "time", "time.value"]
                
                value_df = pd.merge(value_df,df,how="outer",on=merge_columns,suffixes=("",f"_{indx}"))
        else:
            if other_df.empty:
                other_df = df
                continue
            if "TraceabilityCode" in df.columns:                #check for traceability code in the columns
                merge_columns = ["TraceabilityCode", "time"]
                
                other_df = pd.merge(other_df,df,how="outer",on=merge_columns,suffixes=("",f"_{indx}"))
    
    value_df = value_df.drop(columns="time.value")
    value_df = value_df.groupby(["TraceabilityCode", "time"]).mean(numeric_only=True).reset_index() #grouping all ["TraceabilityCode", "time"] duplicates and keep the avg

    return value_df, other_df

In [3]:
def merge_DieReference(value, other):
    merged_df = pd.merge(value, other[["TraceabilityCode","DieReference"]], on="TraceabilityCode")
    return merged_df

In [4]:
def df_to_csv(df, id, type=0):
    print(f"{id}_{type}")
    if type == 0:
        df.to_csv(f"merged_csv/{id}.csv")
    else:
        df.to_csv(f"merged_csv/P0{id}/P0{id}_{type}.csv")

In [5]:
values = []
others = []
for i in range (1,7):                                   #looping through every P0_ section of files, excluding P01
    df_value, df_other = get_df_P0(i)                   #merging every p0_ dataset together into one df, separated by P0_

    df_value = merge_DieReference(df_value, df_other)   #merge DieReference column to df_value
    
    values.append(df_value)                             #saving the sizes of the Dataframes
    others.append(df_other)

P01-AmbientHumidity-values.csv
P01-AmbientTemperature-values.csv
P01-ClutchBrakeTemperatureUnitOil-values.csv
P01-ClutchBrakeWaterTemperature-values.csv
P01-CushionOilDegradation-values.csv
P01-CushionPumpMaxPower1-values.csv
P01-CushionPumpMaxPower2-values.csv
P01-CushionPumpMaxPower3-values.csv
P01-CushionPumpMeanPower1-values.csv
P01-CushionPumpMeanPower2-values.csv
P01-CushionPumpMeanPower3-values.csv
P01-CushionTemperatureUnitOil-values.csv
P01-CushionWaterFlow-values.csv
P01-CushionWaterTemperature-values.csv
P01-Cylinder1MaxForce-values.csv
P01-Cylinder1MaxParalelismErrorInBDC-values.csv
P01-Cylinder1MaxParallelismInCycle-values.csv
P01-Cylinder2MaxForce-values.csv
P01-Cylinder2MaxParalelismErrorInBDC-values.csv
P01-Cylinder2MaxParallelismInCycle-values.csv
P01-Cylinder3MaxForce-values.csv
P01-Cylinder3MaxParalelismErrorInBDC-values.csv
P01-Cylinder3MaxParallelismInCycle-values.csv
P01-Cylinder4MaxForce-values.csv
P01-Cylinder4MaxParalelismErrorInBDC-values.csv
P01-Cylinder4MaxP

In [6]:
print(values[0].shape)
print(values[1].shape)
print(values[2].shape)
print(values[3].shape)
print(values[4].shape)
print(values[5].shape)

(143638, 76)
(113400, 33)
(136984, 34)
(130535, 34)
(131930, 33)
(91023, 33)


### Cleaning Defects

In [7]:
defects = pd.read_csv("clean_csv/defects.csv")

defects = defects.drop(columns="Unnamed: 0")

defects["Defect"] = defects["Defect"].str.split(r'\(|\)').str.get(1)
defects["Defect"] = defects["Defect"].fillna("Other")

defects.to_csv("merged_csv/Defects.csv")

In [8]:
for i in range(0,len(values)):
    df_to_csv(values[i], i+1, "value")                #saving value dfs to csv
    if i != 0:
        df_to_csv(others[i], i+1, "other")            #saving other dfs to csv

1_value
2_value
2_other
3_value
3_other
4_value
4_other
5_value
5_other
6_value
6_other


### Check for duplicates and NaNs

Traceability codes with more than one value

In [9]:
dfs2 = get_df_P0(2)

P02-ClutchBrakeTemperatureUnitOil-values.csv
P02-ClutchBrakeWaterTemperature-values.csv
P02-DieReference.csv
P02-Gear1MaxTorque-values.csv
P02-Gear2MaxTorque-values.csv
P02-Gear3MaxTorque-values.csv
P02-HELMGaugesControl-values.csv
P02-LubricationTemperatureUnitOil-values.csv
P02-MainMotorMaxPower-values.csv
P02-MainMotorMeanPower-values.csv
P02-MaxForceFL-values.csv
P02-MaxForceFR-values.csv
P02-MaxForceRL-values.csv
P02-MaxForceRR-values.csv
P02-MaxOffCenteredLoadFR-values.csv
P02-MaxOffCenteredLoadLR-values.csv
P02-OverloadSystemPrechargePressure-values.csv
P02-PartEnergy-values.csv
P02-PressForceValueFL-values.csv
P02-PressForceValueFR-values.csv
P02-PressForceValueRL-values.csv
P02-PressForceValueRR-values.csv
P02-PressMode.csv
P02-PressSpeed-parameter.csv
P02-ProcessEnergy-values.csv
P02-RealDieChangePosition-parameter.csv
P02-RealUpperCushionPressure-parameter.csv
P02-SlideAccelerationDown-values.csv
P02-SlideAccelerationUp-values.csv
P02-SlideAdjustment-values.csv
P02-SlidePosi

In [10]:
dfs2[0].index.value_counts().head(5)

0        1
75595    1
75606    1
75605    1
75604    1
dtype: int64

In [11]:
dfs2[0].shape

(113400, 32)

Check for NaNs in Rows and Columns

In [12]:
def row_nans(df):                                       #return series of rows and their percentage of nans
    rows = df.isna().sum(axis=1)/df.shape[1]
    return rows.sort_values()

def col_nans(df):                                       #return series of columns with the percentage of nans
    cols = df.isna().sum()/df.shape[0]
    return cols.sort_values()

In [13]:
row_nans(dfs2[0])

10324     0.00000
14036     0.03125
104926    0.03125
62259     0.03125
83735     0.03125
           ...   
34454     0.50000
91430     0.50000
2550      0.50000
83418     0.53125
17208     0.59375
Length: 113400, dtype: float64

In [14]:
col_nans(dfs2[0])

TraceabilityCode                   0.000000
time                               0.000000
SlideAccelerationUp                0.002840
SlideAccelerationDown              0.003959
MainMotorMeanPower                 0.008157
OverloadSystemPrechargePressure    0.010035
SlideSpeedDown                     0.031596
MaxOffCenteredLoadFR               0.034056
MaxOffCenteredLoadLR               0.036023
MaxForceFR                         0.039409
HELMGaugesControl                  0.042866
SlidePositionDown                  0.048827
MaxForceRR                         0.053519
MainMotorMaxPower                  0.056023
ProcessEnergy                      0.058757
SlideSpeedUp                       0.063924
MaxForceFL                         0.086843
Gear2MaxTorque                     0.087584
MaxForceRL                         0.090300
PartEnergy                         0.096799
SlidePositionUp                    0.098034
LubricationTemperatureUnitOil      0.108377
Gear1MaxTorque                  