In [70]:
import sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_predict, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [71]:

df = pd.read_csv('task_data.csv')

df.info()
columns_to_fix = ['CTR - Cardiothoracic Ratio', 'Inscribed circle radius', 'Heart perimeter']
for col in columns_to_fix:
    if col in df.columns:
        #replacing , with . and changing type to float
        df[col] = df[col].astype(str).str.replace(',', '.')
        df[col] = pd.to_numeric(df[col], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          37 non-null     int64  
 1   Cardiomegaly                37 non-null     int64  
 2   Heart width                 37 non-null     int64  
 3   Lung width                  37 non-null     int64  
 4   CTR - Cardiothoracic Ratio  37 non-null     object 
 5   xx                          37 non-null     float64
 6   yy                          37 non-null     float64
 7   xy                          37 non-null     float64
 8   normalized_diff             37 non-null     float64
 9   Inscribed circle radius     37 non-null     object 
 10  Polygon Area Ratio          37 non-null     float64
 11  Heart perimeter             37 non-null     object 
 12  Heart area                  37 non-null     int64  
 13  Lung area                   37 non-nu

In [72]:
X = df[["Heart width", "Lung width", "CTR - Cardiothoracic Ratio", "xx", "yy", "xy","normalized_diff", "Inscribed circle radius", "Polygon Area Ratio", "Heart perimeter", "Heart area ", "Lung area" ]]
#selecting data to learn
y = df["Cardiomegaly"]
#spliting data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state =42)
#scailing
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [73]:
#test of the parameters of knn
param_grid = {
    "model__n_neighbors": [3, 5, 7, 9, 11, 15],  # Number of neighbors to consider
    "model__weights": ["distance"],   # How neighbors contribute to the prediction
    "model__metric": ["minkowski", "manhattan", "euclidean", "chebyshev"],  # Distance metrics to test
}


rskf = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=100,
    random_state=None
)

pipe_knn = Pipeline(steps=[
    ("scaler", StandardScaler()),           # Standardize data inside each CV fold
    ("model", KNeighborsClassifier())       # The KNN model to be optimized
])


grid_search = GridSearchCV(
    estimator=pipe_knn,          # The pipeline (scaler + model)
    param_grid=param_grid,       # The grid of parameters to test
    scoring="accuracy",          # Metric used to evaluate model performance
    cv=rskf,                     # Cross-validation strategy
    verbose=1,                   # Display progress in the console
    n_jobs=-1                    # Use all available CPU cores for faster processing
)

grid_search.fit(X_train, y_train)

# Display the best results
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy (averaged CV): {grid_search.best_score_:.4f}")

Fitting 500 folds for each of 24 candidates, totalling 12000 fits
Best parameters: {'model__metric': 'manhattan', 'model__n_neighbors': 5, 'model__weights': 'distance'}
Best accuracy (averaged CV): 0.8277


In [74]:
#KNN
pipe_knn = Pipeline([('scaler', StandardScaler()), ('model', KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='manhattan',
))])
cv_score = cross_val_score(pipe_knn, X_train, y_train)

print("Scores of training data cross-validation (each fold):")
list(map(print, cv_score))
print(f"\nCross-validation mean score: {np.mean(cv_score):.3}")
print(f"Standard deviation of CV score: {np.std(cv_score):.3f}")

Scores of training data cross-validation (each fold):
0.8333333333333334
1.0
0.8333333333333334
0.8333333333333334
1.0

Cross-validation mean score: 0.9
Standard deviation of CV score: 0.082


AttributeError: module 'sns' has no attribute 'heatmap'

<Figure size 600x400 with 0 Axes>