# Supervised preprocessing and visualization

Import the CSV data

In [6]:
import pandas as pd
# Read the diabetes CSV from the local data directory and preview the first rows.
csv_path = "data/diabetes.csv"
data = pd.read_csv(csv_path)
preview = data.head()
print(preview)

   preg  plas  pres  skin  insu  mass   pedi  age            class
0     6   148    72    35     0  33.6  0.627   50  tested_positive
1     1    85    66    29     0  26.6  0.351   31  tested_negative
2     8   183    64     0     0  23.3  0.672   32  tested_positive
3     1    89    66    23    94  28.1  0.167   21  tested_negative
4     0   137    40    35   168  43.1  2.288   33  tested_positive


## Outlier detection with the interquartile range (IQR)

In [8]:
# IQR-based outlier detection
def detect_outliers_iqr(df, column, factor=1.5):
    """Flag the values of one column that lie outside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to inspect.
    column : str
        Label of the column to check.
    factor : float, default 1.5
        Multiple of the interquartile range that is subtracted from Q1 and
        added to Q3 to form the lower and upper fence. The default keeps the
        previously hard-coded value of 1.5.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``df[column]``; True where the value is
        strictly below the lower fence or strictly above the upper fence.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Comparisons against NaN are False, so missing values are never flagged.
    return (values < lower_bound) | (values > upper_bound)

# Replace every IQR outlier in the float64/int64 columns with a missing value.
# NOTE: this rewrites `data` itself, so re-running the cell recomputes the
# fences on the already-masked values and may flag additional rows.
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    outliers = detect_outliers_iqr(data, col)
    # Series.mask puts NaN where the flag is True and upcasts an integer
    # column as a whole. Assigning the new column avoids writing a missing
    # value into an int64 column through .loc, which relies on silent
    # upcasting that newer pandas versions deprecate.
    data[col] = data[col].mask(outliers)

print(data.isnull().sum())  # number of missing values per column


preg      4
plas      5
pres     49
skin      1
insu     41
mass     19
pedi     41
age      16
class     0
dtype: int64


In [10]:
# Keep only the complete cases: drop every row that has at least one missing value.
data_cleaned = data.dropna(axis=0, how="any")

# Show the dimensions (rows, columns) of the cleaned dataset.
print(data_cleaned.shape)

(611, 9)


In [None]:
# Summary statistics of the features in the cleaned dataset
summary_stats = data_cleaned.describe()
print(summary_stats)


In [None]:
from sklearn.feature_selection import chi2
import numpy as np

# Split the cleaned frame into the label column and the feature matrix.
y = data_cleaned['class']
X = data_cleaned.drop(columns=['class'])

# Chi-squared statistic and p-value of every feature against the class label.
chi2_scores, p_values = chi2(X, y)

# Collect the per-feature results in one table, highest score first.
chi2_df = pd.DataFrame(
    {'Feature': X.columns, 'Chi2 Score': chi2_scores, 'p-value': p_values}
).sort_values(by='Chi2 Score', ascending=False)

print(chi2_df)