In [36]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [37]:
# Load the raw dataset from the CSV file next to this notebook.
raw_csv_path = 'dataset_raw.csv'
df = pd.read_csv(raw_csv_path)

In [38]:
# Print a summary of the raw frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [39]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


## Clean the dataset

In [40]:
# Remove rows that are exact duplicates of an earlier row (first occurrence is kept).
df = df.drop_duplicates(keep="first")

In [41]:
# Discard only rows in which every column is missing; partially filled rows are kept.
df = df.dropna(axis=0, how="all")

In [42]:
# Normalize column names: strip square brackets, then turn spaces into underscores
# (e.g. "Torque [Nm]" becomes "Torque_Nm").
bracket_free_names = df.columns.str.replace(r"[\[\]]", "", regex=True)
df.columns = bracket_free_names.str.replace(" ", "_")

In [43]:
# Drop the identifier columns; errors="ignore" makes this a no-op for any
# of them that is already absent.
id_columns = ["UDI", "Product_ID"]
df = df.drop(columns=id_columns, errors="ignore")

In [44]:
# Pull out the two label columns: 'Target' (integer flag) and
# 'Failure_Type' (string category).
y_binary, y_multi = df['Target'], df['Failure_Type']

In [45]:
# Remove both label columns so that `df` holds features only.
target_columns = ['Target', 'Failure_Type']
df = df.drop(columns=target_columns)

In [46]:
# Partition the feature columns by dtype; `num_cols` is reused later for scaling.
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Fill gaps in categorical columns with each column's most frequent value.
# NOTE(review): these fill statistics are computed on the full frame, before
# the train/test split below.
cat_fill_values = df[cat_cols].mode().iloc[0]
df[cat_cols] = df[cat_cols].fillna(cat_fill_values)

# Fill gaps in numeric columns with each column's median.
num_fill_values = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(num_fill_values)

In [47]:
# Split the features and BOTH targets in a single call so that every output
# shares one row partition. Two separate calls with different `stratify`
# vectors select different rows, which would leave the multi-class labels
# out of step with X_train / X_test.
#
# Stratifying on the binary target keeps the feature split and the binary
# labels identical to what a binary-only split with this seed produces.
# NOTE(review): multi-class proportions are therefore not explicitly
# stratified; switch `stratify` to y_multi if rare failure types need it.
(
    X_train, X_test,
    y_train_bin, y_test_bin,
    y_train_multi, y_test_multi,
) = train_test_split(
    df, y_binary, y_multi,
    test_size=0.2, random_state=42, stratify=y_binary,
)

# Kept so that any later cell referring to these names still works; they are
# independent copies of the (un-encoded, un-scaled) feature split.
X_train_m, X_test_m = X_train.copy(), X_test.copy()

In [48]:
# One-hot encoder returning a dense array; drop="first" omits the first
# category of each feature from the output.
encoder = OneHotEncoder(sparse_output=False, drop="first")

In [49]:
# Learn the 'Type' categories from the training rows only, then apply the
# fitted encoder to both splits.
type_train = X_train[['Type']]
type_test = X_test[['Type']]

encoder.fit(type_train)
encoded_train = encoder.transform(type_train)
encoded_test = encoder.transform(type_test)

In [50]:
# Replace the raw 'Type' column with its one-hot encoded columns.
X_train = X_train.drop(columns=['Type'])
X_test = X_test.drop(columns=['Type'])

# Give the encoded columns string names derived from the fitted encoder.
# Without explicit names pandas labels them 0, 1, ... (integers), and a frame
# whose column labels mix int and str is rejected by scikit-learn estimators.
type_feature_names = encoder.get_feature_names_out()

# The encoded arrays are positional, so the feature index is reset before the
# column-wise concat to line the rows up.
# NOTE: this gives the features a fresh 0..n-1 index, while the target Series
# keep their original row labels; rows still correspond by position.
X_train = pd.concat([X_train.reset_index(drop=True),
                     pd.DataFrame(encoded_train, columns=type_feature_names)], axis=1)

X_test = pd.concat([X_test.reset_index(drop=True),
                    pd.DataFrame(encoded_test, columns=type_feature_names)], axis=1)

In [51]:
# Standardize the numeric columns. The scaler is fitted on the training rows
# only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [52]:
# Persist the preprocessed splits as pickle files under dataset_preprocessed/.
output_dir = Path("dataset_preprocessed")

# to_pickle does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to save; each entry is written to "<stem>.pkl".
artifacts = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train_bin": y_train_bin,
    "y_test_bin": y_test_bin,
    "y_train_multi": y_train_multi,
    "y_test_multi": y_test_multi,
}

for artifact_name, artifact in artifacts.items():
    artifact.to_pickle(output_dir / f"{artifact_name}.pkl")