In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the raw CSV, then discard exact duplicate rows and rows in which
# every field is missing. The original row labels are kept (no index reset).
df = (
    pd.read_csv('dataset_raw.csv')
    .drop_duplicates()
    .dropna(how="all")
)

In [3]:
# Normalise the header: strip square brackets first, then swap spaces for
# underscores, e.g. "Some name [x]" becomes "Some_name_x".
without_brackets = df.columns.str.replace(r"[\[\]]", "", regex=True)
df.columns = without_brackets.str.replace(" ", "_")

In [4]:
# Split Product_ID into two features: its first character ("quality") and
# the remaining characters parsed as an integer ("serial").
# NOTE(review): the integer cast presumably fails if any Product_ID is missing
# or has a non-numeric tail — confirm the raw data never contains such rows.
df = df.assign(
    quality=df["Product_ID"].str.get(0),
    serial=df["Product_ID"].str.slice(start=1).astype(int),
)

In [5]:
# UDI and Product_ID are removed from the feature frame; Product_ID has
# already been decomposed into the "quality" and "serial" columns.
df = df.drop(["UDI", "Product_ID"], axis="columns")

In [6]:
# Pull out the two label series (binary target and multi-class failure type),
# then leave only feature columns in df.
y_binary, y_multi = df['Target'], df['Failure_Type']
df = df.drop(['Target', 'Failure_Type'], axis="columns")

In [7]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, float64/int64 columns as numeric.
cat_cols, num_cols = (
    df.select_dtypes(include=dtype_names).columns
    for dtype_names in (['object'], ['float64','int64'])
)

In [8]:
# Derive the fill values from the training rows only, so that test-set
# statistics never influence preprocessing (imputing with whole-dataset
# medians/modes before the split leaks test information into training).
# The partition is recomputed here with the same test_size, random_state and
# stratify argument as the real split in the next cell, which makes both calls
# select the same rows; keep the two sets of arguments in sync.
train_labels, _ = train_test_split(
    df.index.to_numpy(), test_size=0.2, random_state=42, stratify=y_binary
)
train_rows = df.loc[train_labels]

# Categorical gaps -> most frequent training value per column;
# numeric gaps -> training median per column.
df[cat_cols] = df[cat_cols].fillna(train_rows[cat_cols].mode().iloc[0])
df[num_cols] = df[num_cols].fillna(train_rows[num_cols].median())

In [9]:
# Stratified 80/20 hold-out split on the binary target. Passing both label
# series alongside the features makes train_test_split slice all three with
# the same row partition, so the labels stay aligned with X_train / X_test.
(
    X_train, X_test,
    y_train_bin, y_test_bin,
    y_train_multi, y_test_multi,
) = train_test_split(
    df, y_binary, y_multi,
    test_size=0.2, random_state=42, stratify=y_binary,
)

In [10]:
# One-hot encode the two categorical features. The encoder learns its
# categories from the training split only and is then applied to both splits;
# drop="first" omits the first level of each feature.
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoder.fit(X_train[['Type','quality']])
encoded_train, encoded_test = (
    encoder.transform(split[['Type','quality']])
    for split in (X_train, X_test)
)

In [11]:
# Wrap the encoded arrays in DataFrames that carry the generated feature
# names and the row labels of the split they came from, so they can be
# joined back onto the remaining features by index.
encoded_cols = encoder.get_feature_names_out(['Type','quality'])
encoded_train, encoded_test = (
    pd.DataFrame(matrix, index=split.index, columns=encoded_cols)
    for matrix, split in ((encoded_train, X_train), (encoded_test, X_test))
)

In [12]:
# Swap the raw categorical columns for their one-hot encoded counterparts
# in both splits (column-wise concatenation aligned on the row index).
X_train = pd.concat(
    [X_train.drop(['Type','quality'], axis="columns"), encoded_train],
    axis="columns",
)
X_test = pd.concat(
    [X_test.drop(['Type','quality'], axis="columns"), encoded_test],
    axis="columns",
)

In [13]:
# Recompute the numeric column list from the assembled training frame.
# NOTE(review): the one-hot columns appended in the previous step are
# presumably float64 as well, in which case they are selected here and get
# standardized together with the continuous features — confirm this is intended.
num_cols = (
    X_train
    .select_dtypes(include=['float64','int64'])
    .columns
)

In [14]:
# Standardize the numeric columns. The scaler's statistics are learned from
# the training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [15]:
# Persist every split as a pickle under dataset_preprocessed/.
# The directory is created first so the notebook also runs on a fresh
# checkout where it does not exist yet (to_pickle does not create it).
output_dir = Path("dataset_preprocessed")
output_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to store; each one is written to <stem>.pkl.
splits_to_save = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train_bin": y_train_bin,
    "y_test_bin": y_test_bin,
    "y_train_multi": y_train_multi,
    "y_test_multi": y_test_multi,
}
for stem, split in splits_to_save.items():
    split.to_pickle(output_dir / f"{stem}.pkl")