In [1]:
# Summary:

# Developing a semi-supervised learning model on Diabetes dataset 
# using a super learner and deploying the model on other datasets.

# Step 1: Data pre-processing phase

# Remove outliers from all columns.
# Impute missing values in all columns.
# Normalize all columns.

# Step 2: Unsupervised Learning for generating labels

# Use K-means clustering on three features of Glucose, 
# BMI and Age to cluster data into two clusters.
# Assign ‘Diabetes’ name to the cluster with higher average Glucose 
# and ‘No Diabetes’ to the other cluster.
# Add a new column (Outcome) to the dataset containing 1 for ‘Diabetes’ 
# and 0 for ‘No Diabetes’. Use these values as labels for classification (step 4).

# Step 3: Feature Extraction

# Split data into test and training sets (consider 20% for test).
# Use PCA on the training data to create 3 new components 
# from existing features (all columns except outcome).
# Transfer training and test data to the new dimensions (PCs).

# Step 4: Classification using a super learner

# Define three classification models as base classifiers 
# consisting of Naïve Bayes, Neural Network, and KNN.
# Define a decision tree as the meta learner.
# Train decision tree (meta learner) on outputs of three base classifiers 
# using 5-fold cross validation.
# Find hyperparameters for all these models which provide the best accuracy rate.
# Report accuracy of the model on the test data.

# Step 5: Employing the model on other datasets

# Use the last column of the assigned dataset as outcome (label).
# Use your current code for steps 1,3, and 4 
# with minor changes (e.g., encoding categorical variables) 
# to train your model on the new dataset and calculate the accuracy.
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# Load the raw CSV into a DataFrame, then list each column's dtype and
# non-null count so the columns with missing values are visible up front.
data = pd.read_csv(filepath_or_buffer='diabetes_project.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733 entries, 0 to 732
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               733 non-null    int64  
 1   Glucose                   633 non-null    float64
 2   BloodPressure             733 non-null    int64  
 3   SkinThickness             543 non-null    float64
 4   BMI                       726 non-null    float64
 5   DiabetesPedigreeFunction  689 non-null    float64
 6   Age                       733 non-null    int64  
dtypes: float64(4), int64(3)
memory usage: 40.2 KB


In [2]:
# Preview the first five rows, then count the missing values per column.
for summary in (data.head(5), data.isnull().sum()):
    print(summary)

   Pregnancies  Glucose  BloodPressure  SkinThickness   BMI  \
0            6    148.0             72           35.0  33.6   
1            1     85.0             66           29.0  26.6   
2            8      NaN             64          -35.0  23.3   
3            1     89.0             66           23.0  28.1   
4            0    137.0             40           35.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                       NaN   33  
Pregnancies                   0
Glucose                     100
BloodPressure                 0
SkinThickness               190
BMI                           7
DiabetesPedigreeFunction     44
Age                           0
dtype: int64


In [3]:
# Step 1: Data pre-processing phase
# Remove outliers from all columns.
# Impute missing values in all columns.
# Normalize all columns.
# Design note: one option is to impute the four columns that contain missing values
# with their medians first and remove the outliers afterwards.
# Open question: should the outliers be removed before imputing instead?

# Impute missing values in all columns.
# comlumns_to_impute = ['Glucose', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction']
# medians = data[comlumns_to_impute].median()
# print("medians is:")
# print(medians)
# data[comlumns_to_impute] = data[comlumns_to_impute].fillna(medians)
# print(data.isnull().sum())


In [4]:
# Remove outliers from all columns.
# Open question: does Age need outlier removal at all?
#
# Tukey's rule: any value further than 1.5 * IQR outside the quartiles is
# treated as an outlier. Outliers are blanked to NaN (rows are kept) so the
# imputation step can fill them together with the original missing values.
data_no_outliers = data.copy()
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
for col in features:
    column = data_no_outliers[col]
    # quantile() skips NaN, so existing gaps do not distort the fences.
    first_quartile, third_quartile = column.quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    is_outlier = column.lt(low_fence) | column.gt(high_fence)
    data_no_outliers.loc[is_outlier, col] = np.nan
# Missing values per column after masking (original NaNs + masked outliers).
print(data_no_outliers.isnull().sum())


Pregnancies                   4
Glucose                     100
BloodPressure                 0
SkinThickness               194
BMI                          11
DiabetesPedigreeFunction     44
Age                           0
dtype: int64


In [5]:
# Impute missing values in all columns.
# Each gap is replaced by the median of its own column, computed on the
# frame in which outliers have already been blanked to NaN.
comlumns_to_impute = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
to_fill = data_no_outliers[comlumns_to_impute]
medians = to_fill.median()
print("medians is:", medians, sep="\n")
data_no_outliers[comlumns_to_impute] = to_fill.fillna(medians)
# Sanity check: every count below should now be zero.
print(data_no_outliers.isnull().sum())


medians is:
Pregnancies                   3.000
Glucose                     117.000
BloodPressure                72.000
SkinThickness                29.000
BMI                          32.150
DiabetesPedigreeFunction      0.355
Age                          29.000
dtype: float64
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64


In [6]:
# Normalize all columns.
# Standardize only the seven measurement columns listed in `features`.
# Selecting them explicitly keeps this cell idempotent: later cells add the
# 'Cluster' and 'Outcome' columns to data_no_outliers, and re-running this
# cell must not standardize those labels as if they were features.
# NOTE: data_scaler is a NumPy array (column names are lost); the clustering
# and PCA steps fit their own scalers rather than reusing this array.
scaler = StandardScaler()
data_scaler = scaler.fit_transform(data_no_outliers[features])
print(data_scaler[:5])

[[ 0.67961989  0.95479867 -0.03101107  0.69146549  0.21297848  0.96547681
   1.42845618]
 [-0.85791523 -1.26980516 -0.53614696 -0.00522757 -0.87790504 -0.27797363
  -0.19807552]
 [ 1.29463394 -0.13984766 -0.70452559 -0.00522757 -1.3921787   1.16821329
  -0.11246859]
 [-0.85791523 -1.12856047 -0.53614696 -0.70192064 -0.64414429 -1.10694059
  -1.05414483]
 [-1.16542226  0.56637578 -2.72506914  0.69146549  1.69346325 -0.25995261
  -0.02686166]]


In [7]:
# Step 2: Unsupervised Learning for generating labels

# Use K-means clustering on three features of Glucose, 
# BMI and Age to cluster data into two clusters.
# Assign ‘Diabetes’ name to the cluster with higher average Glucose 
# and ‘No Diabetes’ to the other cluster.
# Add a new column (Outcome) to the dataset containing 1 for ‘Diabetes’ 
# and 0 for ‘No Diabetes’. Use these values as labels for classification (step 4).

# Use K-means clustering on three features of Glucose, 
# BMI and Age to cluster data into two clusters.

from sklearn.cluster import KMeans

# Cluster on three features only, standardized so that no single feature
# dominates the Euclidean distances used by K-means.
kmeans_features = ['Glucose', 'BMI', 'Age']
data_for_kmeans = data_no_outliers[kmeans_features]
kmeans_scaler = StandardScaler().fit(data_for_kmeans)
data_kmeans_scaled = kmeans_scaler.transform(data_for_kmeans)

# Two clusters; the seed is fixed so the cluster assignment is reproducible.
chose_k = 2
kmeans = KMeans(
    n_clusters=chose_k,
    init='k-means++',
    n_init='auto',
    random_state=42,
)
cluster_labels = kmeans.fit_predict(data_kmeans_scaled)

# Attach the cluster ids to the unscaled data so the per-cluster means are
# reported in the original units.
data_no_outliers['Cluster'] = cluster_labels
cluster_analysis = (
    data_no_outliers
    .groupby('Cluster')[kmeans_features]
    .mean()
)
print(cluster_analysis)

            Glucose        BMI        Age
Cluster                                  
0        141.945578  35.415476  42.142857
1        106.906606  30.102278  27.400911




In [8]:
# Assign the 'Diabetes' name to the cluster with the higher average Glucose
# and 'No Diabetes' to the other cluster.
# Add a new column (Outcome) to the dataset containing 1 for 'Diabetes'
# and 0 for 'No Diabetes'. Use these values as labels for classification (step 4).

# idxmax() returns the cluster id (the groupby index) with the highest mean Glucose.
diabetes_cluster_labels = cluster_analysis['Glucose'].idxmax()
# Exactly two clusters with ids 0 and 1 exist, so the other id is 1 - id.
no_diabetes_cluster_labels = 1 - diabetes_cluster_labels
print(f"diabetes with high average Glucose cluster is: Cluster {diabetes_cluster_labels}")
# Fixed message: the 'No Diabetes' cluster is the one with the LOWER mean Glucose.
print(f"no diabetes with low average Glucose cluster is: Cluster {no_diabetes_cluster_labels}")
# Map raw cluster ids to the binary outcome label.
label_map = {
    diabetes_cluster_labels: 1,  # 1 = 'Diabetes'
    no_diabetes_cluster_labels: 0   # 0 = 'No Diabetes'
}
data_no_outliers['Outcome'] = data_no_outliers['Cluster'].map(label_map)
print(data_no_outliers[['Cluster','Outcome']].head(10))


diabetes with high average Glucose cluster is: Cluster 0
no diabetes with high average Glucose cluster is: Cluster 1
   Cluster  Outcome
0        0        1
1        1        0
2        1        0
3        1        0
4        0        1
5        1        0
6        1        0
7        0        1
8        0        1
9        1        0


In [9]:
# Step 3: Feature Extraction

# Split data into test and training sets (consider 20% for test).
# Use PCA on the training data to create 3 new components 
# from existing features (all columns except outcome).
# Transfer training and test data to the new dimensions (PCs).

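# NOTE: data_no_outliers is a pandas DataFrame holding the unscaled values,
# while data_scaler is a NumPy array without column names.
# The features are therefore selected from the DataFrame and standardized again below.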

from sklearn.metrics import classification_report
from sklearn.decomposition import PCA


# Label column created in Step 2; everything in `features` is a predictor.
target = 'Outcome'

X = data_no_outliers[features]
y = data_no_outliers[target]
# Hold out 20% of the rows for testing; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no test-set information enters the transformation.
PCA_scaler = StandardScaler()
X_train_scaled = PCA_scaler.fit_transform(X_train)
X_test_scaled = PCA_scaler.transform(X_test)

# Use PCA on the training data to create 3 new components from existing features (all columns except outcome).
# The components are learned from the training split and then applied to both splits.
pca = PCA(n_components = 3)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"original features: {X_train_scaled.shape}")
print(f"PCA train features: {X_train_pca.shape}")
print(f"PCA test features: {X_test_pca.shape}")


original features: (586, 7)
PCA train features: (586, 3)
PCA test features: (147, 3)


In [10]:
# Step 4: Classification using a super learner

# Define three classification models as base classifiers 
# consisting of Naïve Bayes, Neural Network, and KNN.
# Define a decision tree as the meta learner.
# Train decision tree (meta learner) on outputs of three base classifiers 
# using 5-fold cross validation.
# Find hyperparameters for all these models which provide the best accuracy rate.
# Report accuracy of the model on the test data.
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Level-0 (base) classifiers: Naive Bayes, a neural network and KNN.
base_nb = GaussianNB()
base_nn = MLPClassifier(random_state=42, max_iter=1000)
base_knn = KNeighborsClassifier()

# Level-1 (meta) learner: a decision tree trained on the base classifiers' outputs.
meta_learner = DecisionTreeClassifier(random_state=42)

# The short names ('nb', 'nn', 'knn') become the prefixes used to address
# each estimator's hyperparameters, e.g. 'knn__n_neighbors'.
level0_estimators = list(
    zip(('nb', 'nn', 'knn'), (base_nb, base_nn, base_knn))
)

# Super learner: the meta learner is fitted on base-model predictions
# produced with 5-fold cross validation.
stacking_model = StackingClassifier(
    cv=5,
    final_estimator=meta_learner,
    estimators=level0_estimators,
)

In [None]:
# Find hyperparameters for all these models which provide the best accuracy rate.

# GridSearchCV is used to search for the best hyperparameters.
# Naive Bayes is left at its default settings, so it has no entry in the grid.
# Search space, grouped by the estimator each entry addresses.
# KNN: neighbourhood size and vote weighting.
knn_space = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
}
# Neural network: hidden-layer layout and alpha (L2 regularization strength).
nn_space = {
    'nn__hidden_layer_sizes': [(25,), (50,), (100,), (25, 25)],
    'nn__alpha': [0.0001, 0.001, 0.01],
}
# Meta learner (decision tree): depth limit and minimum leaf size.
tree_space = {
    'final_estimator__max_depth': [3, 5, 7, 10],
    'final_estimator__min_samples_leaf': [1, 5, 10],
}
param_grid = {**knn_space, **nn_space, **tree_space}

# Cost: 5 * 2 * 4 * 3 * 4 * 3 = 1440 candidates, each scored with 5-fold CV
# (7200 fits of the stacking model), so expect a long run time.
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available CPU core
)
grid_search.fit(X_train_pca, y_train)
print(f"best hyperparameters: \n{grid_search.best_params_}")
print(f"\nbest score: {grid_search.best_score_:.4f}")
# output
# best hyperparameters:  {'final_estimator__max_depth': 3, 
#                         'final_estimator__min_samples_leaf': 5, 
#                         'knn__n_neighbors': 5, 
#                         'knn__weights': 'uniform', 
#                         'nn__alpha': 0.0001, 
#                         'nn__hidden_layer_sizes': (50,)} 
# best score: 0.8772
# Accuracy: 0.8707

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.83      0.95      0.89        78
#            1       0.93      0.78      0.85        69

#     accuracy                           0.87       147
#    macro avg       0.88      0.87      0.87       147
# weighted avg       0.88      0.87      0.87       147




KeyboardInterrupt: 

In [None]:
# Report accuracy of the model on the test data.
# The test split has only been transformed (never fitted on), so these
# numbers measure performance on unseen rows.
y_pred_test = grid_search.predict(X_test_pca)

final_report = classification_report(y_test, y_pred_test)
final_accuracy = accuracy_score(y_test, y_pred_test)

print(
    f"Accuracy: {final_accuracy:.4f}",
    "\nClassification Report:",
    final_report,
    sep="\n",
)

Accuracy: 0.8707

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.95      0.89        78
           1       0.93      0.78      0.85        69

    accuracy                           0.87       147
   macro avg       0.88      0.87      0.87       147
weighted avg       0.88      0.87      0.87       147



In [None]:
# Step 5: Employing the model on other datasets

# Use the last column of the assigned dataset as outcome (label).
# Use your current code for steps 1,3, and 4 
# with minor changes (e.g., encoding categorical variables) 
# to train your model on the new dataset and calculate the accuracy.