#### Pre-processing e class balancing

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 1. Lettura dei dati
X_train = pd.read_csv("Dataset/Train/X_train.csv")
y_train = pd.read_csv("Dataset/Train/y_train.csv")

# 2. Unire X_train e y_train per mantenere le etichette
df_train = pd.merge(X_train, y_train, on="id")

# Separare le feature e le etichette
X = df_train.drop(columns=["id", "faulty", "trq_margin"])  # Rimuoviamo id, faulty, trq_margin
y = df_train[["faulty", "trq_margin"]]  # Manteniamo "faulty" e "trq_margin"

target_size = 400000

undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)  # Undersampling della classe maggioritaria
oversampler = SMOTE(sampling_strategy=1.0, random_state=42)  # Oversampling della classe minoritaria

# Applicare l'undersampling
X_undersampled, y_undersampled = undersampler.fit_resample(X, y["faulty"])

# Se il numero di campioni è inferiore al target_size, applicare SMOTE per raggiungere 400.000 campioni
if len(X_undersampled) < target_size:
    X_resampled, y_resampled = oversampler.fit_resample(X_undersampled, y_undersampled)
else:
    X_resampled, y_resampled = X_undersampled, y_undersampled

# Se i dati finali sono superiori a 400.000, riduciamo a 400.000
if len(X_resampled) > target_size:
    X_resampled, _, y_resampled, _ = train_test_split(X_resampled, y_resampled, train_size=target_size, random_state=42)

# 4. Standardizzare i dati
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# 5. Creare i nuovi DataFrame per X_train e y_train
df_X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Convertire y_resampled in un formato 1D (array semplice)
y_resampled = y_resampled.values.flatten()

# Creare un DataFrame per y_train, che include anche trq_margin
y_train_final = pd.DataFrame({
    'id': df_train['id'][:len(df_X_scaled)],
    'faulty': y_resampled,  # y_resampled è ora 1D
    'trq_margin': df_train['trq_margin'][:len(df_X_scaled)]
})

# 6. Salvare i nuovi DataFrame nei file CSV
df_X_scaled.to_csv("Dataset/Train/X_train_resampled.csv", index=False)
y_train_final.to_csv("Dataset/Train/y_train_resampled.csv", index=False)

print("I file sono stati salvati correttamente come X_train_resampled.csv e y_train_resampled.csv.")

I file sono stati salvati correttamente come X_train_resampled.csv e y_train_resampled.csv.


In [2]:
df_X_scaled.describe()

Unnamed: 0,trq_measured,oat,mgt,pa,ias,np,ng
count,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0
mean,-1.611795e-15,-2.578204e-16,2.278533e-16,1.392664e-16,-2.952305e-16,-1.413341e-15,-3.161915e-17
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-3.667485,-3.761772,-3.061163,-1.312339,-2.203377,-3.174565,-1.505902
25%,-0.6714813,-0.6379957,-0.7650272,-0.6722853,-0.8183705,-0.506439,-0.9102593
50%,0.03465311,0.1647758,-0.070144,-0.3332354,0.2269506,0.581604,-0.1816337
75%,0.7633213,0.734275,0.7707653,0.3278751,0.895816,0.6468636,1.223179
max,2.925078,2.95599,3.152503,4.410292,1.568183,1.038936,1.590251


In [3]:
y_train_final.describe()

Unnamed: 0,id,faulty,trq_margin
count,400000.0,400000.0,400000.0
mean,199999.5,0.499727,-1.153898
std,115470.198175,0.500001,14.071843
min,0.0,0.0,-74.747111
25%,99999.75,0.0,-6.595603
50%,199999.5,0.0,1.689704
75%,299999.25,1.0,7.788254
max,399999.0,1.0,33.597754
