In [35]:
import pandas as pd
# Read the raw product data from the CSV file in the working directory.
data_ori = pd.read_csv(filepath_or_buffer="ClassifyProducts.csv")


In [36]:
# Use the `id` column as the row index. The guard keeps the cell re-runnable:
# after the first run `id` is no longer a column, and an unguarded
# set_index('id') would raise KeyError.
if 'id' in data_ori.columns:
    data_ori = data_ori.set_index('id')

# Adjust the path to the CSV file if needed.
# NOTE(review): presumably this file lists pairs of highly correlated features
# produced by an earlier analysis step — confirm its origin.
high_corr_pairs = pd.read_csv('high_corr_feature_pairs.csv')

# Only the names listed in the `Feature_2` column are removed.
features_to_drop = high_corr_pairs['Feature_2'].unique()
# errors='ignore' skips names that are not present in the frame
# (for example because a previous run of this cell already dropped them).
data_ori = data_ori.drop(columns=features_to_drop, errors='ignore')

print(data_ori.shape)

(61878, 89)


In [37]:
# Convert the target column from string labels to integer codes with LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# Target column as it currently stands in the frame
y = data_ori['target']

# Fit the encoder on the labels, then map every label to its integer code
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Overwrite the target column; the +1 shifts the codes so they start at 1
data_ori['target'] = y_encoded + 1
print(data_ori.head())

    feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
id                                                                           
1        1       0       0       0       0       0       0       0       0   
2        0       0       0       0       0       0       0       1       0   
3        0       0       0       0       0       0       0       1       0   
4        1       0       0       1       6       1       5       0       0   
5        0       0       0       0       0       0       0       0       0   

    feat_10  ...  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  \
id           ...                                                         
1         0  ...        1        0        0        0        0        0   
2         0  ...        0        0        0        0        0        0   
3         0  ...        0        0        0        0        0        0   
4         1  ...        0        1        2        0        0        0   
5        

In [38]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Work on a copy so the frame from the previous cells stays untouched.
df = data_ori.copy()

# Split into label vector and feature matrix.
y = df['target']
X = df.drop(columns='target')

# Number of rows per class.
class_counts = y.value_counts()

# Common size every class is resampled to: the mean class size, truncated to int.
mean_class_size = int(class_counts.mean())

# Classes below the mean are oversampled up to the mean ...
oversampling_classes = dict.fromkeys(
    class_counts[class_counts < mean_class_size].index,
    mean_class_size,
)

# ... and classes above the mean are undersampled down to it.
# A class sitting exactly at the mean appears in neither mapping.
undersampling_strategy = dict.fromkeys(
    class_counts[class_counts > mean_class_size].index,
    mean_class_size,
)

# Both resamplers use a fixed seed so repeated runs give the same result.
smote = SMOTE(sampling_strategy=oversampling_classes, random_state=1)
rus = RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=1)

# Oversample first, then undersample.
pipeline = Pipeline(steps=[
    ('SMOTE', smote),
    ('RandomUnderSampler', rus),
])

# NOTE(review): resampling is applied to the complete dataset; if a train/test
# split is made later from this data, synthetic rows can end up in the
# evaluation set — confirm this is intended.
X_resampled, y_resampled = pipeline.fit_resample(X, y)

# Reassemble features and label into a single frame.
balanced_data = pd.DataFrame(X_resampled, columns=X.columns).assign(target=y_resampled)


In [39]:
# Check the class distribution after resampling.
target_distribution = balanced_data['target'].value_counts()
print(target_distribution)

target
1    6875
2    6875
3    6875
4    6875
5    6875
6    6875
7    6875
8    6875
9    6875
Name: count, dtype: int64


In [40]:
from sklearn.preprocessing import MinMaxScaler

# Every column except the label is rescaled.
cols_to_normalize = balanced_data.columns.drop('target')

# Fit the scaler on the feature columns and write the scaled values back
# into the same frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(balanced_data[cols_to_normalize])
balanced_data[cols_to_normalize] = scaled_values
print(balanced_data.head())

     feat_1  feat_2  feat_3    feat_4    feat_5    feat_6    feat_7    feat_8  \
0  0.017857     0.0     0.0  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.000000     0.0     0.0  0.000000  0.000000  0.000000  0.000000  0.013158   
2  0.000000     0.0     0.0  0.000000  0.000000  0.000000  0.000000  0.013158   
3  0.017857     0.0     0.0  0.014286  0.315789  0.166667  0.131579  0.000000   
4  0.000000     0.0     0.0  0.000000  0.000000  0.000000  0.000000  0.000000   

   feat_9   feat_10  ...   feat_85   feat_86   feat_87  feat_88  feat_89  \
0     0.0  0.000000  ...  0.018182  0.000000  0.000000      0.0      0.0   
1     0.0  0.000000  ...  0.000000  0.000000  0.000000      0.0      0.0   
2     0.0  0.000000  ...  0.000000  0.000000  0.000000      0.0      0.0   
3     0.0  0.033333  ...  0.000000  0.016129  0.040816      0.0      0.0   
4     0.0  0.000000  ...  0.018182  0.000000  0.000000      0.0      0.0   

    feat_90  feat_91  feat_92  feat_93  target  
0  0.00

In [41]:
# Write the balanced, scaled data to disk without the row index.
balanced_data.to_csv(path_or_buf='balanced_data.csv', index=False)