In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="ai4i2020_original.csv")


<h3 style = "background-color: #000033;
             padding: 15px;
             font: bold 32px arial;
             color: #ccebff;
             border: 2px solid #e6e6ff;
             border-radius: 8px">
Handling Imbalanced Data
</h3>

In [3]:
# SMOTE: Synthetic Minority Over-sampling Technique
# SMOTE is a method for handling imbalanced datasets in machine learning.
# Goal: To increase the number of instances in the minority class by creating synthetic samples.
# How it works: SMOTE generates new examples by interpolating between existing minority class instances and their nearest neighbors.
# This helps the model learn better about the minority class and improves its performance on imbalanced datasets.

In [4]:
# Names of the input (feature) columns and of the target label column.
feature_columns = [
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]
target_column = 'Machine failure'

# Split the frame into the feature matrix and the target vector.
X = df[feature_columns]
y = df[target_column]

# Put the selected features and the target back side by side. No resampling
# has happened yet, so this frame still has the original class distribution.
df_imbalanced = pd.concat([X, y], axis=1)

# Persist the original (imbalanced) subset, without the index column.
df_imbalanced.to_csv('ai4i2020_imbalanced.csv', index=False)

In [5]:
# Class distribution before SMOTE.
counts = Counter(y)
print("before resampling", counts)

# Oversample class 1 up to the size of the largest class. The target count is
# derived from the observed distribution instead of being hard-coded, so the
# cell keeps working if the input CSV changes. random_state is fixed so that
# the synthetic rows (and the CSV written below) are identical on every run.
# NOTE(review): SMOTE is applied to the full dataset here; if a train/test
# split is made later from the balanced file, synthetic points can leak into
# the test set -- confirm this is intended.
majority_count = max(counts.values())
smote = SMOTE(sampling_strategy={1: majority_count}, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class distribution after SMOTE.
counts = Counter(y_resampled)
print("after resampling",counts)

# Combine the resampled features and target variable into a new DataFrame.
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

# Save the balanced dataset to a new CSV file, without the index column.
df_resampled.to_csv('ai4i2020_balanced.csv', index=False)

before resampling Counter({0: 9661, 1: 339})
after resampling Counter({0: 9661, 1: 9661})
