In [13]:
import pandas as pd

# Feature Engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import RareLabelEncoder, OneHotEncoder, CountFrequencyEncoder
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection
from feature_engine import transformation as vt
from feature_engine.wrappers import SklearnTransformerWrapper

# Scikit-Learn - Model selection and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px

## Einlesen der Daten

In [2]:
# Load the raw data set; the path is relative to the current working directory.
DATA_FILE = "Disease_symptom_and_patient_profile_dataset.csv"
df = pd.read_csv(DATA_FILE)

## Deskriptive Statistik

In [3]:
# Show the dimensions of the data set as a (rows, columns) tuple.
df.shape

(349, 10)

In [4]:
# Print a per-column summary of the data set (non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               349 non-null    object
 1   Fever                 349 non-null    object
 2   Cough                 349 non-null    object
 3   Fatigue               349 non-null    object
 4   Difficulty Breathing  349 non-null    object
 5   Age                   349 non-null    int64 
 6   Gender                349 non-null    object
 7   Blood Pressure        349 non-null    object
 8   Cholesterol Level     349 non-null    object
 9   Outcome Variable      349 non-null    object
dtypes: int64(1), object(9)
memory usage: 27.4+ KB


In [5]:
# Display the first 10 rows to get an impression of the raw values.
df.head(10)

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
5,Eczema,Yes,No,No,No,25,Female,Normal,Normal,Positive
6,Influenza,Yes,Yes,Yes,Yes,25,Female,Normal,Normal,Positive
7,Influenza,Yes,Yes,Yes,Yes,25,Female,Normal,Normal,Positive
8,Hyperthyroidism,No,Yes,No,No,28,Female,Normal,Normal,Negative
9,Hyperthyroidism,No,Yes,No,No,28,Female,Normal,Normal,Negative


In [6]:
# Histogram of the patients' age (normalised to probabilities) with a box plot
# drawn in the margin of the same figure.
fig = px.histogram(
    data_frame=df,
    x="Age",
    marginal="box",
    histnorm="probability",
    title="Boxplot und Histogram für das Alter der Patienten",
)

fig.show()

In [7]:
# Skewness of the Age column as computed by pandas' Series.skew().
age_skewness = df["Age"].skew()
print(f"Berechnung der Schiefe: {age_skewness}")

Berechnung der Schiefe: 0.5674050307810211


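Die Schiefe von rund 0,57 deutet auf eine leichte Rechtsschiefe hin; die Altersverteilung ist damit nur näherungsweise normalverteilt.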

In [8]:
# Location (mean) and spread (standard deviation) of the Age column.
age_mean = df["Age"].mean()
age_std = df["Age"].std()
print(f"Der Lokalisierungswert (Mittelwert) beträgt: {age_mean}")
print(f"Die Standardabweichung beträgt: {age_std}")


Der Lokalisierungswert (Mittelwert) beträgt: 46.32378223495702
Die Standardabweichung beträgt: 13.085089852889405


Alle anderen Variablen sind kategorisch. Shape und Spread können somit bei diesen Variablen nicht ermittelt werden. Die Location kann mit dem Modus ermittelt werden. Auf diese Fleissarbeit habe ich hier aber verzichtet, da dies in Kaggle schon ersichtlich ist.

## Datenaufbereitung

### Fehlende Werte

In [11]:
# Percentage of missing values per column, restricted to columns that actually
# contain missing values and sorted from the most to the least affected.
missing_values = (
    df.isnull()
    .mean()
    .mul(100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)

print(f"Liste der fehlenden Variablen: {missing_values}")
print(f"Anzahl Variablen mit fehlenden Werten: {len(missing_values)}")

Liste der fehlenden Variablen: Series([], dtype: float64)
Anzahl Variablen mit fehlenden Werten: 0


Es gibt keine fehlenden Werte im Datensatz.

### Train Test Split

In [12]:
# Separate the target column from the remaining (predictor) columns.
y = df['Outcome Variable']
x = df.drop(columns='Outcome Variable')
print(f"The shape of the data set with training varialbes is: {x.shape}")
print(f"The shape of the target variable is: {y.shape}")

The shape of the data set with training varialbes is: (349, 9)
The shape of the target variable is: (349,)


In [14]:
# Hold out 30% of the rows as the test sample (test_size=0.3); the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

print(f"The shape of the training sample is: {X_train.shape}")
print(f"The shape of the test sample is: {X_test.shape}")

The shape of the training sample is: (244, 9)
The shape of the test sample is: (105, 9)


## Rare Label Encoding

In [18]:
# Group infrequent 'Disease' labels into a single rare category.
# RareLabelEncoder comes from the notebook's import cell at the top, so no
# import is needed inside this cell.
rare_encoder = RareLabelEncoder(
    tol=0.05,        # minimum relative frequency for a label to be kept as-is
    n_categories=4,  # minimum number of categories a variable needs before grouping is applied
    variables=['Disease'],
)

# The frequent labels are learned from the training sample only; the same
# mapping is then applied to both samples.
rare_encoder.fit(X_train)
X_train = rare_encoder.transform(X_train)
X_test = rare_encoder.transform(X_test)

print(f"Kategorienverteilung nach RareLabelEncoding: {X_train['Disease'].value_counts()}!")


## One-Hot-Encoding mit k–1 Dummies für alle Variablen

In [20]:
# One-hot encode every categorical column into k-1 dummy columns.
# OneHotEncoder comes from the notebook's import cell at the top, so no
# import is needed inside this cell.
encoder = OneHotEncoder(
    variables=['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Gender', 'Blood Pressure', 'Cholesterol Level'],
    drop_last=True  # k-1 dummies per variable
)

# Fit the encoder on the training sample only.
encoder.fit(X_train)

# Apply the learned encoding to both samples.
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

print(f"Shape nach One-Hot-Encoding – Trainingsdaten: {X_train.shape}!")
print(f"Shape nach One-Hot-Encoding – Testdaten: {X_test.shape}!")


## Quasi-Konstanten Merkmale entfernen

In [19]:
# Remove quasi-constant features, using tol=0.95 as the threshold; the features
# to drop are determined on the training sample and removed from both samples.
# NOTE(review): this cell's execution count (19) is lower than that of the
# one-hot-encoding cell (20) that precedes it in the notebook; a top-to-bottom
# re-run applies this step to the one-hot-encoded columns - confirm that this
# is the intended order.
sel = DropConstantFeatures(tol=0.95, variables=None, missing_values='raise')
X_train = sel.fit_transform(X_train)
X_test = sel.transform(X_test)

print(f"Folgende quasi-konstanten Merkmale wurden entfernt: {sel.features_to_drop_}")


Ich habe den Train-Test-Split so gemacht wie im Notebook "Data_Preperation_General", sodass der Test-Datensatz 30% ausmacht. In den Folien "Introduction_to_Machine_Learning" steht aber, dass der Test-Datensatz 20% ausmachen sollte.

In [15]:
# Inspect the column dtypes of the training sample.
# NOTE(review): the output stored below still lists the original object
# columns, so it appears to have been produced before the encoding cells above
# were run (execution count 15 vs. 18-20) - re-run top to bottom to confirm.
X_train.dtypes

Disease                 object
Fever                   object
Cough                   object
Fatigue                 object
Difficulty Breathing    object
Age                      int64
Gender                  object
Blood Pressure          object
Cholesterol Level       object
dtype: object

In [16]:
# Names of all training columns stored with the object dtype, i.e. the
# categorical variables of the raw data set.
categorical_frame = X_train.select_dtypes(include="object")
object_columns = categorical_frame.columns

print(f"Liste von kategorischen Variablen: {object_columns}")

print(f"Lange der Liste von kategorischen Variablen: {len(object_columns)}")

Liste von kategorischen Variablen: Index(['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing',
       'Gender', 'Blood Pressure', 'Cholesterol Level'],
      dtype='object')
Lange der Liste von kategorischen Variablen: 8


In [17]:
# Print the cardinality (number of distinct non-null values) of every
# object-typed column of the training sample.
for col in X_train.select_dtypes(include=['object']):
    # nunique() counts the distinct values directly instead of measuring the
    # length of a value_counts() result.
    cardinality = X_train[col].nunique()
    # An f-string also works for column labels that are not strings.
    print(f"{col}: {cardinality}")

Disease: 97
Fever: 2
Cough: 2
Fatigue: 2
Difficulty Breathing: 2
Gender: 2
Blood Pressure: 3
Cholesterol Level: 3


Evtl. muss man die Variable Age noch in float umwandeln? 