In [17]:
# Summary:

# Developing a semi-supervised learning model on Diabetes dataset 
# using a super learner and deploying the model on other datasets.

# Step 1: Data pre-processing phase

# Remove outliers from all columns.
# Impute missing values in all columns.
# Normalize all columns.

# Step 2: Unsupervised Learning for generating labels

# Use K-means clustering on three features of Glucose, 
# BMI and Age to cluster data into two clusters.
# Assign ‘Diabetes’ name to the cluster with higher average Glucose 
# and ‘No Diabetes’ to the other cluster.
# Add a new column (Outcome) to the dataset containing 1 for ‘Diabetes’ 
# and 0 for ‘No Diabetes’. Use these values as labels for classification (step 4).

# Step 3: Feature Extraction

# Split data into test and training sets (consider 20% for test).
# Use PCA on the training data to create 3 new components 
# from existing features (all columns except outcome).
# Transfer training and test data to the new dimensions (PCs).

# Step 4: Classification using a super learner

# Define three classification models as base classifiers 
# consisting of Naïve Bayes, Neural Network, and KNN.
# Define a decision tree as the meta learner.
# Train decision tree (meta learner) on outputs of three base classifiers 
# using 5-fold cross validation.
# Find hyperparameters for all these models which provide the best accuracy rate.
# Report accuracy of the model on the test data.

# Step 5: Employing the model on other datasets

# Use the last column of the assigned dataset as outcome (label).
# Use your current code for steps 1,3, and 4 
# with minor changes (e.g., encoding categorical variables) 
# to train your model on the new dataset and calculate the accuracy.
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('Datasets/10year_risk_coronary_heart_disease.csv')
# print(data.head(5))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              2927 non-null   int64  
 1   education        2927 non-null   int64  
 2   sex              2927 non-null   object 
 3   is_smoking       2927 non-null   object 
 4   cigsPerDay       2927 non-null   int64  
 5   BPMeds           2927 non-null   int64  
 6   prevalentStroke  2927 non-null   int64  
 7   prevalentHyp     2927 non-null   int64  
 8   diabetes         2927 non-null   int64  
 9   totChol          2927 non-null   int64  
 10  sysBP            2927 non-null   float64
 11  diaBP            2927 non-null   float64
 12  BMI              2927 non-null   float64
 13  heartRate        2927 non-null   int64  
 14  glucose          2927 non-null   int64  
 15  TenYearCHD       2927 non-null   int64  
dtypes: float64(3), int64(11), object(2)
memory usage: 366.0+ KB


In [18]:
#pre processing
target = 'TenYearCHD'
categorical_features = ['sex', 'is_smoking']
data_encoded = pd.get_dummies(data, 
                                  columns=categorical_features, 
                                  drop_first=True)
data_encoded.info()
print(data_encoded.head())

# because the data is non-null, so we don't need to use IQR to fill the Datasets
# what we need to do is normalize the dataset in the next code cell


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              2927 non-null   int64  
 1   education        2927 non-null   int64  
 2   cigsPerDay       2927 non-null   int64  
 3   BPMeds           2927 non-null   int64  
 4   prevalentStroke  2927 non-null   int64  
 5   prevalentHyp     2927 non-null   int64  
 6   diabetes         2927 non-null   int64  
 7   totChol          2927 non-null   int64  
 8   sysBP            2927 non-null   float64
 9   diaBP            2927 non-null   float64
 10  BMI              2927 non-null   float64
 11  heartRate        2927 non-null   int64  
 12  glucose          2927 non-null   int64  
 13  TenYearCHD       2927 non-null   int64  
 14  sex_M            2927 non-null   bool   
 15  is_smoking_YES   2927 non-null   bool   
dtypes: bool(2), float64(3), int64(11)
memory usage: 326.0 KB
   

In [19]:
# Step 3: Feature Extraction

# Split data into test and training sets (consider 20% for test).
# Use PCA on the training data to create 3 new components 
# from existing features (all columns except outcome).
# Transfer training and test data to the new dimensions (PCs).

from psutil import net_connections
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
features = ['age','education','cigsPerDay','BPMeds','prevalentStroke',
            'prevalentHyp','diabetes','totChol','sysBP','diaBP','BMI',
            'heartRate','glucose','sex_M','is_smoking_YES']

X = data_encoded[features]
y = data_encoded[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
PCA_scaler = StandardScaler()
X_train_scaled = PCA_scaler.fit_transform(X_train)
X_test_scaled = PCA_scaler.transform(X_test)

# Use PCA on the training data to create 3 new components from existing features (all columns except outcome).
pca = PCA(n_components = 3)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"original features: {X_train_scaled.shape}")
print(f"PCA train features: {X_train_pca.shape}")
print(f"PCA test features: {X_test_pca.shape}")

original features: (2341, 15)
PCA train features: (2341, 3)
PCA test features: (586, 3)


In [20]:
# Step 4: Classification using a super learner

# Define three classification models as base classifiers 
# consisting of Naïve Bayes, Neural Network, and KNN.
# Define a decision tree as the meta learner.
# Train decision tree (meta learner) on outputs of three base classifiers 
# using 5-fold cross validation.
# Find hyperparameters for all these models which provide the best accuracy rate.
# Report accuracy of the model on the test data.
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# create instance for three classification models
base_nb = GaussianNB()
base_nn = MLPClassifier(max_iter=1000, random_state=42)
base_knn = KNeighborsClassifier()
# create the meta learner
meta_learner = DecisionTreeClassifier(random_state=42)
level0_estimators = [
    ('nb', base_nb),
    ('nn', base_nn),
    ('knn', base_knn)
]
#create super learner using 5-fold cross validation
stacking_model = StackingClassifier(
    estimators=level0_estimators,
    final_estimator=meta_learner,
    cv = 5
)
param_grid = {
    # try more hyperparemeters
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],

    # try more layer sizes and learning rate
    'nn__hidden_layer_sizes': [(25,), (50,), (100,), (25, 25)],
    'nn__alpha': [0.0001, 0.001, 0.01],

    # try more hyperparemeters
    'final_estimator__max_depth': [3, 5, 7, 10],
    'final_estimator__min_samples_leaf': [1, 5, 10]
}

grid_search = GridSearchCV(
    estimator=stacking_model, 
    param_grid=param_grid, 
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train_pca, y_train)
print(f"best hyperparameters: \n{grid_search.best_params_}")
print(f"\nbest score: {grid_search.best_score_:.4f}")

# Report accuracy of the model on the test data.
y_pred_test = grid_search.predict(X_test_pca)

final_accuracy = accuracy_score(y_test, y_pred_test)
final_report = classification_report(y_test, y_pred_test)

print(f"Accuracy: {final_accuracy:.4f}")
print("\nClassification Report:")
print(final_report)

best hyperparameters: 
{'final_estimator__max_depth': 3, 'final_estimator__min_samples_leaf': 10, 'knn__n_neighbors': 9, 'knn__weights': 'uniform', 'nn__alpha': 0.01, 'nn__hidden_layer_sizes': (100,)}

best score: 0.8513
Accuracy: 0.8430

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.99      0.91       496
           1       0.25      0.01      0.02        90

    accuracy                           0.84       586
   macro avg       0.55      0.50      0.47       586
weighted avg       0.76      0.84      0.78       586

