# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [2]:
import os
import pandas as pd
import numpy as np

path_dir = "C:/Users/mrepi/PycharmProjects/IAU_part3/data"
files = [f for f in os.listdir(path_dir) if os.path.isfile(os.path.join(path_dir, f))]
df_observation = pd.read_csv(os.path.join(path_dir, files[0]), sep='\t')

In [3]:
from sklearn.model_selection import train_test_split

X = df_observation.drop(columns="oximetry latitude longitude".split())
y = df_observation.oximetry

X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, random_state = 42, stratify=y)

print(f"Tréningová množina: {X_train.shape[0]} riadkov ({X_train.shape[1]} atribútov)")
print(f"Testovacia množina: {X_test.shape[0]} riadkov")
print(f"Podiel pozitívnych tried v y: {y.mean():.3f}")

Tréningová množina: 9610 riadkov (20 atribútov)
Testovacia množina: 2403 riadkov
Podiel pozitívnych tried v y: 0.594


In [4]:
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

X = X_train.copy()
y = y_train.copy()

mi_scores = mutual_info_classif(X, y, random_state=42)
mi_series = pd.Series(mi_scores, index=X.columns, name='Mutual_Info').sort_values(ascending=False)

f_values, p_values = f_classif(X, y)
f_series = pd.Series(f_values, index=X.columns, name="F_value").sort_values(ascending=False)

log_reg = LogisticRegression(penalty="l1", solver="saga", max_iter=5_000, random_state=42)
log_reg.fit(X, y)

fm_model = SelectFromModel(log_reg, prefit=True)
lasso_importance = np.abs(log_reg.coef_).flatten()
lasso_series = pd.Series(lasso_importance, index=X.columns, name='SelectFromModel').sort_values(ascending=False)

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X, y)
rf_series = pd.Series(rf.feature_importances_, index=X.columns, name="RandomForest").sort_values(ascending=False)

In [5]:
feature_scores = pd.concat([mi_series, f_series, lasso_series, rf_series], axis=1)

# Normalizácia po stĺpcoch (bez delenia nulou)
feature_scores_norm = feature_scores.copy()
for col in feature_scores_norm.columns:
    col_max = feature_scores_norm[col].max()
    feature_scores_norm[col] = feature_scores_norm[col] / col_max


# Priemerné skóre a zoradenie
feature_scores_norm["Average_Score"] = feature_scores_norm.mean(axis=1)
feature_scores_norm = feature_scores_norm.sort_values("Average_Score", ascending=False)

print("Top 10 atribútov:")
feature_scores_norm.head(20).round(3)

best_features = feature_scores_norm.head(6).index

Top 10 atribútov:


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Použijeme vybrané top features
selected_features = best_features.copy()

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Flexibilný selector stĺpcov:
    - Môže odstrániť konkrétne stĺpce (columns_to_drop)
    - ALEBO ponechať len konkrétne stĺpce (columns_to_keep)
    """

    def __init__(self, columns_to_drop=None, columns_to_keep=None):
        self.columns_to_drop = columns_to_drop
        self.columns_to_keep = columns_to_keep

        # Validácia - nemôžeš použiť obe naraz
        if columns_to_drop is not None and columns_to_keep is not None:
            raise ValueError("Nemôžeš použiť columns_to_drop a columns_to_keep súčasne!")

    def fit(self, X, y=None):
        # Ulož pôvodné stĺpce pri fit
        self.feature_names_in_ = list(X.columns)
        return self

    def transform(self, X):
        # Variant 1: DROP konkrétne stĺpce
        if self.columns_to_drop is not None:
            existing_cols = [col for col in self.columns_to_drop if col in X.columns]
            if existing_cols:
                print(f"Odstraňujem stĺpce: {existing_cols}")
            return X.drop(columns=existing_cols)

        # Variant 2: KEEP len konkrétne stĺpce (odstráň všetko ostatné)
        elif self.columns_to_keep is not None:
            existing_cols = [col for col in self.columns_to_keep if col in X.columns]
            missing_cols = [col for col in self.columns_to_keep if col not in X.columns]

            if missing_cols:
                print(f" Chýbajúce stĺpce: {missing_cols}")
            if existing_cols:
                print(f" Ponechávam stĺpce: {existing_cols}")

            return X[existing_cols]

        # Variant 3: Ak nič nie je špecifikované, vráť všetko
        else:
            return X

    def get_feature_names_out(self, input_features=None):
        """Potrebné pre sklearn pipeline compatibility"""
        if self.columns_to_keep is not None:
            return self.columns_to_keep
        elif self.columns_to_drop is not None:
            return [col for col in self.feature_names_in_
                    if col not in self.columns_to_drop]
        else:
            return self.feature_names_in_


DROP_COLS = ['latitude', 'longitude']
KEEP_COLS = list(selected_features)



data_preprocess = Pipeline(steps=[
    ('selector', ColumnSelector(columns_to_drop=DROP_COLS)),
    ('selector_2', ColumnSelector(columns_to_keep=KEEP_COLS)),
    ('imputer', SimpleImputer(strategy='median')),
    ('transform', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),

])

data_preprocess.fit(X_train)

 Ponechávam stĺpce: ['SpO₂', 'CO', 'FiO₂', 'Skin Temperature', 'EtCO₂', 'Hb level']


0,1,2
,steps,"[('selector', ...), ('selector_2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,columns_to_drop,"['latitude', 'longitude']"
,columns_to_keep,

0,1,2
,columns_to_drop,
,columns_to_keep,"['SpO₂', 'CO', ...]"

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [10]:
# In your Jupyter notebook, add this cell after your preprocessing:

from models.id3 import ID3NumericalClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Get the preprocessed data
X_train_processed = data_preprocess.transform(X_train)
X_test_processed = data_preprocess.transform(X_test)

# Convert to numpy arrays if they're DataFrames
if isinstance(X_train_processed, pd.DataFrame):
    X_train_arr = X_train_processed.values
    X_test_arr = X_test_processed.values
else:
    X_train_arr = X_train_processed
    X_test_arr = X_test_processed

# Display info
print("=" * 60)
print("ID3 Decision Tree on Your Medical Data")
print("=" * 60)
print(f"Training samples: {X_train_arr.shape[0]}")
print(f"Test samples: {X_test_arr.shape[0]}")
print(f"Number of features: {X_train_arr.shape[1]}")

# Train ID3 model
clf = ID3NumericalClassifier(max_depth=15, min_samples_split=80)
clf.fit(X_train_arr, y_train.values)

# Predict on test set
y_pred = clf.predict(X_test_arr)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n" + "=" * 60)
print("Decision Tree Structure:")
print("=" * 60)
clf.print_tree()

 Ponechávam stĺpce: ['SpO₂', 'CO', 'FiO₂', 'Skin Temperature', 'EtCO₂', 'Hb level']
 Ponechávam stĺpce: ['SpO₂', 'CO', 'FiO₂', 'Skin Temperature', 'EtCO₂', 'Hb level']
ID3 Decision Tree on Your Medical Data
Training samples: 9610
Test samples: 2403
Number of features: 6

Test Accuracy: 0.8993

Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.85      0.87       975
         1.0       0.90      0.93      0.92      1428

    accuracy                           0.90      2403
   macro avg       0.90      0.89      0.89      2403
weighted avg       0.90      0.90      0.90      2403


Confusion Matrix:
[[ 832  143]
 [  99 1329]]

Decision Tree Structure:
Root -> Feature 0 (threshold: 0.071)
  <= 0.071 -> Feature 2 (threshold: -0.256)
    <= -0.256 -> Feature 4 (threshold: 0.158)
      <= 0.158 -> Feature 5 (threshold: -1.003)
        <= -1.003 -> Feature 4 (threshold: -0.663)
          <= -0.663 -> Leaf: 1.0
          > -0.663 -> Fe

In [11]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from models.id3 import ID3NumericalClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

# Wrapper to make ID3 compatible with GridSearchCV
class ID3Wrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.clf = None

    def fit(self, X, y):
        self.clf = ID3NumericalClassifier(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split
        )
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)

    def score(self, X, y):
        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X))

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 10, 20, 50, 100]
}

# Grid search
grid_search = GridSearchCV(
    ID3Wrapper(),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_arr, y_train.values)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Train with best parameters
best_clf = ID3NumericalClassifier(**grid_search.best_params_)
best_clf.fit(X_train_arr, y_train.values)
y_pred = best_clf.predict(X_test_arr)

print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Fitting 5 folds for each of 25 candidates, totalling 125 fits


KeyboardInterrupt: 