In [141]:
import pandas as pd

file_path = 'path_to_your_file.xlsx'
df = pd.read_csv('./diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [142]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [143]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.8451,120.8945,69.1055,20.5365,79.7995,31.9926,0.4719,33.2409,0.349
std,3.3696,31.9726,19.3558,15.9522,115.244,7.8842,0.3313,11.7602,0.477
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.2437,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.6262,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# Part 1 Data Preprocessing
## #1.1 Split dataset to 80% test_data + 20% train_data

In [144]:
# Split data into training and testing
from sklearn import model_selection

X = df.iloc[:, :8] 
y = df.iloc[:, -1] 

# Reserve 20% for testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.8, random_state=42)

print('training data has %d observation with %d features'% X_train.shape)
print('test data has %d observation with %d features'% X_test.shape)

training data has 153 observation with 8 features
test data has 615 observation with 8 features


# Part 2 Active Learning
### 2.1 Use KNN to train an initial model - get inital model
### 2.2 Apply initial model on test_data - get initial model accuracy: 0.68
### 2.3 Select samples from test_data which predict_y are between 0.4 and 0.6
### 2.4 Add samples from obtained from step 2.3 to train_data, and retrain the model - get active learning model accuracy: 0.76




In [145]:
# Build model
from sklearn.neighbors import KNeighborsClassifier

# K Nearest Neighbors
classifier_KNN = KNeighborsClassifier()
classifier_KNN.fit(X_train, y_train)

In [146]:
from sklearn.metrics import classification_report, accuracy_score

# Make predictions on the initial model
y_pred_initial = classifier_KNN.predict(X_test)
y_pred_proba_initial = classifier_KNN.predict_proba(X_test)[:, 1]  # Predict probabilities

# Evaluate the initial model's performance
print("Initial Model Evaluation:")
print(classification_report(y_test, y_pred_initial))
print("Initial Model Accuracy:", accuracy_score(y_test, y_pred_initial))

Initial Model Evaluation:
              precision    recall  f1-score   support

           0       0.77      0.74      0.76       405
           1       0.53      0.57      0.55       210

    accuracy                           0.68       615
   macro avg       0.65      0.65      0.65       615
weighted avg       0.69      0.68      0.69       615

Initial Model Accuracy: 0.6829268292682927


In [147]:
# Create a DataFrame that includes test features and predicted probabilities
import numpy as np
results = pd.DataFrame(X_test)  
results['Predicted Probability'] = np.round(y_pred_proba_initial, 4)  
pd.set_option('display.float_format', '{:.4f}'.format)
results['True Y'] = y_test.values  # Add true y values

results.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Predicted Probability,True Y
668,6,98,58,33,190,34.0,0.43,43,0.8,0
324,2,112,75,32,0,35.7,0.148,21,0.0,0
624,2,108,64,0,0,30.8,0.158,21,0.2,0
690,8,107,80,0,0,24.6,0.856,34,0.8,0
473,7,136,90,0,0,29.9,0.21,50,0.6,0


In [148]:
# Select samples with predicted probabilities between 0.4 and 0.6
selected_samples = results[(results['Predicted Probability'] >= 0.4) & (results['Predicted Probability'] <= 0.6)]

# Randomly select 20% of the chosen samples
selected_samples_50 = selected_samples.sample(frac=0.2, random_state=42)  # random_state ensures reproducibility

# Split X and y
X_selected = selected_samples_50.drop(columns=['Predicted Probability', 'True Y'])
y_selected = selected_samples_50['True Y']

# Check the shape of the selected samples
print(X_selected.shape)

(44, 8)


In [149]:
# Remove the selected samples from X_test
X_test_filtered = results[~results.index.isin(selected_samples.index)].drop(columns=['Predicted Probability', 'True Y'])
y_test_filtered = results[~results.index.isin(selected_samples.index)]['True Y']
X_test_filtered

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
668,6,98,58,33,190,34.0000,0.4300,43
324,2,112,75,32,0,35.7000,0.1480,21
624,2,108,64,0,0,30.8000,0.1580,21
690,8,107,80,0,0,24.6000,0.8560,34
97,1,71,48,18,76,20.4000,0.3230,22
...,...,...,...,...,...,...,...,...
376,0,98,82,15,84,25.2000,0.2990,22
384,1,125,70,24,110,24.3000,0.2210,25
224,1,100,66,15,56,23.6000,0.6660,26
729,2,92,52,0,0,30.1000,0.1410,22


In [151]:
# Add the selected samples to the training set
X_new_train = pd.concat([pd.DataFrame(X_train), X_selected], ignore_index=True)
y_new_train = pd.concat([pd.Series(y_train), y_selected], ignore_index=True)
X_new_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,10,75,82,0,0,33.3000,0.2630,38
1,7,137,90,41,0,32.0000,0.3910,39
2,3,158,64,13,387,31.2000,0.2950,24
3,7,129,68,49,125,38.5000,0.4390,43
4,11,127,106,0,0,39.0000,0.1900,51
...,...,...,...,...,...,...,...,...
192,9,171,110,24,240,45.4000,0.7210,54
193,5,117,92,0,0,34.1000,0.3370,38
194,10,90,85,32,0,34.9000,0.8250,56
195,9,164,84,21,0,30.8000,0.8310,32


In [152]:
# Re-train the model
classifier_KNN_updated = KNeighborsClassifier()
classifier_KNN_updated.fit(X_new_train, y_new_train)  # Train the updated model

In [153]:
# Make predictions on the new test set
y_pred_updated = classifier_KNN_updated.predict(X_test_filtered)
y_pred_proba_updated = classifier_KNN_updated.predict_proba(X_test_filtered)[:, 1]

# Print updated model predictions
print("Updated Model Predictions:")
print(pd.DataFrame({'True Y': y_test_filtered, 'Predicted Y': y_pred_updated, 'Predicted Probability': y_pred_proba_updated}).head())

# Evaluate the performance of the updated model
print("Updated Model Evaluation:")
print(classification_report(y_test_filtered, y_pred_updated))
print("Updated Model Accuracy:", accuracy_score(y_test_filtered, y_pred_updated))

Updated Model Predictions:
     True Y  Predicted Y  Predicted Probability
668       0            1                 0.6000
324       0            0                 0.0000
624       0            0                 0.2000
690       0            1                 0.8000
97        0            0                 0.0000
Updated Model Evaluation:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       285
           1       0.58      0.53      0.56       109

    accuracy                           0.76       394
   macro avg       0.70      0.69      0.70       394
weighted avg       0.76      0.76      0.76       394

Updated Model Accuracy: 0.7639593908629442


# Part 3 Exceptional Model Mining 