In [41]:
%config Completer.use_jedi=False # comment if not needed
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report


In [42]:
#read the data into a Pandas Dataframe object
df= pd.read_csv('risk_factors.csv', sep=',')
#return the random 10 rows of datasets
df.sample(10)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
543,16,3.0,14.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,...,?,?,0,0,0,0,0,0,0,0
26,39,5.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,...,?,?,0,0,0,0,0,0,0,0
256,24,4.0,16.0,1.0,0.0,0.0,0.0,?,?,?,...,?,?,0,0,0,0,0,0,0,0
788,38,2.0,19.0,5.0,0.0,0.0,0.0,1.0,30.0,?,...,?,?,0,0,0,0,0,0,1,0
112,30,3.0,19.0,2.0,0.0,0.0,0.0,1.0,9.0,0.0,...,9.0,9.0,0,0,0,0,0,0,0,0
619,23,3.0,18.0,4.0,1.0,8.0,1.2,?,?,?,...,?,?,0,0,0,0,0,0,0,0
816,21,2.0,19.0,?,0.0,0.0,0.0,1.0,0.5,?,...,?,?,0,0,0,0,0,0,0,0
700,28,1.0,17.0,?,0.0,0.0,0.0,?,?,?,...,?,?,0,0,0,0,0,0,0,0
676,70,4.0,27.0,3.0,1.0,3.0,0.75,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
822,36,3.0,20.0,2.0,0.0,0.0,0.0,1.0,6.0,0.0,...,16.0,16.0,1,0,1,1,0,0,0,0


In [43]:
#To check the dataset is a classification or regression problem
#For each target in this list, it prints out the unique values found in the DataFrame df under the column specified by target.
for target in ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']:
    print(f"Unique values in {target}: {df[target].unique()}")


Unique values in Hinselmann: [0 1]
Unique values in Schiller: [0 1]
Unique values in Citology: [0 1]
Unique values in Biopsy: [0 1]


In [44]:
#Display the statistical value of the datasets for each columns
df.describe()

Unnamed: 0,Age,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0
mean,26.820513,0.087413,0.020979,0.01049,0.020979,0.027972,0.040793,0.086247,0.051282,0.064103
std,8.497948,0.302545,0.143398,0.101939,0.143398,0.164989,0.197925,0.280892,0.220701,0.245078
min,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [45]:
# Display all the columns of the dataset
df_col= pd.DataFrame(df.columns)
df_col.columns = ['features']
df_col

Unnamed: 0,features
0,Age
1,Number of sexual partners
2,First sexual intercourse
3,Num of pregnancies
4,Smokes
5,Smokes (years)
6,Smokes (packs/year)
7,Hormonal Contraceptives
8,Hormonal Contraceptives (years)
9,IUD


In [46]:
# Check for column that has non-numeric values in the datasets
non_numeric_columns = df.columns[df.apply(lambda col: pd.to_numeric(col, errors='coerce').isna().any())]

print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Time since first diagnosis',
       'STDs: Time since last diagnosis'],
      dtype='object')


In [47]:
##replace '?' with NaN values
df.replace('?', np.nan, inplace=True)
df

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
2,34,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,,,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,,,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
854,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.0,0.0,...,,,0,0,0,0,0,0,0,0
855,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,,,0,0,0,0,0,0,1,0
856,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,,,0,0,0,0,0,0,0,0


In [48]:
#Check the number of null value for each column in the dataset
df.apply(lambda x: sum(x.isnull()),axis=0)

Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

In [49]:
#Drop columns STDs: Time since first diagnosis and STDs: Time since last diagnosis
#To ensure the model stability and performance, we drop these columns because it contains a lot of NaN values.
df= df.drop(['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'], axis=1)
df

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
2,34,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,0.0,0,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
854,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
855,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0.0,0,0,0,0,0,0,0,1,0
856,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0.0,0,0,0,0,0,0,0,0,0


**Handling Missing Value**

In [50]:
# List categorical column contains missing value to replace the missing value with the mode.
cat_columns = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV'
]

# Replace categorical columns with mode
for column in cat_columns:
    mode_value = df[column].mode()[0]
    df[column].fillna(mode_value, inplace=True)

In [51]:
# To ensure no missing value in the corresponding categorical column.
print(df[cat_columns].isna().sum())

Smokes                                0
Hormonal Contraceptives               0
IUD                                   0
STDs                                  0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0
dtype: int64


In [52]:
# List numerical column contains missing value to replace the missing value with the mean
numeric_columns = [
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes (years)',
    'STDs (number)',
    'Hormonal Contraceptives (years)',
    'IUD (years)',
    'Smokes (packs/year)',
]

# Convert columns to numeric (errors='coerce' converts non-numeric values to NaN)
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Replace numerical columns with mean.
for column in numeric_columns:
    df[column].fillna(df[column].mean(), inplace=True)

In [53]:
# To ensure no missing value in the corresponding numerical column.
print(df[numeric_columns].isna().sum())

Number of sexual partners          0
First sexual intercourse           0
Num of pregnancies                 0
Smokes (years)                     0
STDs (number)                      0
Hormonal Contraceptives (years)    0
IUD (years)                        0
Smokes (packs/year)                0
dtype: int64


In [54]:
#Check the number of null value for each column in the dataset
df.apply(lambda x: sum(x.isnull()),axis=0)

Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


#### Split the dataset
Split the dataset into training, validation and test sets.

In [55]:
#Size of datasets, 858 instances and 30 features, 4 target variable
df.shape

(858, 34)

In [56]:
from sklearn.model_selection import train_test_split

#x is features variable and y is target variable
y = df[['Hinselmann', 'Schiller', 'Citology', 'Biopsy']] #assigning the 'Hinselmann', 'Schiller', 'Citology', 'Biopsy' column of DataFrame as target variable, y
X = df.drop(columns=['Hinselmann', 'Schiller', 'Citology', 'Biopsy']) #assigning all columns of DataFrame label as features variables, X

seed_num = 42   #set a random seed for reproductibility
#split data into 70% as training data and 30% as temporary sets
#futher splitting the temporary data into validation and testing sets, 
#so that validation sets is 10% of original datasets and training setst is 20% of original datasets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3,random_state=seed_num)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp,test_size=1/3, random_state=seed_num)

print(X_train.shape)     
print(y_train.shape)     
print(X_val.shape)       
print(y_val.shape)       
print(X_test.shape)      
print(y_test.shape)      

(600, 30)
(600, 4)
(86, 30)
(86, 4)
(172, 30)
(172, 4)


In [57]:
# Display size for training, validation and test. 
print("Training Set Size:", len(X_train))
print("Validation Set Size:", len(X_val))
print("Test Set Size:", len(X_test))

Training Set Size: 600
Validation Set Size: 86
Test Set Size: 172


#### Data preprocessing

Description: We perform normalization on numerical features to scale the features to specific range, between 0 and 1. This will make data modelling process work better with data that have similar scale. Since our datasets do not contain categorical variables so we do not perform label encoding.

In [58]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)    #scale the training dataset
X_val_scaled = scaler.transform(X_val)            #scale the validation dataset
X_test_scaled = scaler.transform(X_test)          #scale the testing dataset

# Display the sizes of the resulting sets (Optional)
print("Scaled Training Set Size:", X_train_scaled.shape)
print("Scaled Validation Set Size:", X_val_scaled.shape)
print("Scaled Test Set Size:", X_test_scaled.shape)

Scaled Training Set Size: (600, 30)
Scaled Validation Set Size: (86, 30)
Scaled Test Set Size: (172, 30)


#### Feature Selection

Description: To select the most relevant features, we perform Univariate Feature Selection method on the datasets. It will calculate the score for each features, X. We select the top 10 features that have highest score. Then we transform for each splitted datasets with the selected features and assign each of them as X_train_selected, X_test_selected and X_val_selected.

In [59]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X_train,y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
dfscores

Unnamed: 0,0
0,12.794631
1,0.830441
2,0.317354
3,4.01918
4,0.663767
5,37.51743
6,16.119424
7,0.688492
8,19.69867
9,6.461688


In [60]:
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
# Sort the dataframe by 'Score' column in descending order
featureScores = featureScores.sort_values(by='Score', ascending=False)
featureScores

Unnamed: 0,Specs,Score
28,Dx:HPV,40.396562
26,Dx:Cancer,40.396562
29,Dx,39.844088
5,Smokes (years),37.51743
12,STDs (number),26.523799
8,Hormonal Contraceptives (years),19.69867
16,STDs:vulvo-perineal condylomatosis,19.415302
22,STDs:HIV,18.813229
13,STDs:condylomatosis,18.114754
6,Smokes (packs/year),16.119424


In [61]:
print(featureScores.nlargest(10,'Score'))  #print 10 best features

                                 Specs      Score
28                              Dx:HPV  40.396562
26                           Dx:Cancer  40.396562
29                                  Dx  39.844088
5                       Smokes (years)  37.517430
12                       STDs (number)  26.523799
8      Hormonal Contraceptives (years)  19.698670
16  STDs:vulvo-perineal condylomatosis  19.415302
22                            STDs:HIV  18.813229
13                 STDs:condylomatosis  18.114754
6                  Smokes (packs/year)  16.119424


In [62]:
# Transform training, validation and testing datasets using the selected features

X_train_selected = bestfeatures.transform(X_train)
X_val_selected = bestfeatures.transform(X_val)
X_test_selected = bestfeatures.transform(X_test)

#### Data modeling
In this code, we build two predictive models: Neural Network and Hybrid Neural Network-K Nearest Neighbour. 
In this section, we focus on building the models and fine tuning the hyperparameter as explained below:

**1. Model Building:** 
- **Neural Network:** Create the model using Keras and Tensorflow. It build which consists 3 different layer namely input layer has   64 neurons   and ReLU activation, hidden layer has 32 neurons and ReLu activation and output layer has 1 neuron and sigmoid     activation for   binary classification. The learning rate has set to the default value which is 0.001. 
- **Hybrid (NN-KNN):** The model first created with Neural Network which will learn the complex pattern and extract the features in data during training. The output produced by the Neural Network will then be used as input for KNN. KNN algorithm will used the features extracted by NN to classify new data points. The extracted features will be the intermediate layer between NN and KNN.

**2. Hyperparameter Tuning:**
- **Neural Network:** The parameters consist of hidden layer size and learning rate. We used the training datasets to tune the        hyperparameters. We tune these hyperparameters using the combination of GridSearch and Cross-Validation which help to systematically find the optimize hyperparameters combination. GridSearch plays a role to consider all parameters combination meanwhile Cross-Validation will evaluate the performance of datasets by separating training and validations set few times and obtain the result to prevent overfitting. 
- **Hybrid (NN-KNN):** The hyperparameter tuning is completely done with KNN algorithm using the GridSearch and Cross Validation. The parameters consists of number of neighbours, weights and distance metric namely euclidean, minkowski and manhattan. A grid search with 5-fold cross-validation is performed to find the best hyperparameters based on accuracy. A new KNN classifier is trained using the best hyperparameters.The trained model is stored in the dictionary best_Hybridknn_classifiers. Then, we evaluate model with best parameters using the validation sets to obtain the model performance with best parameters. 
___________________________________________

#### Building Neural Network Model

In [63]:
#model_building NN
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Function to create a neural network model
def create_model(input_dim, hidden_layer_size=32, learning_rate=0.001):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    model.add(Dense(hidden_layer_size, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification
     # Adam optimizer with a configurable learning rate
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # Use binary crossentropy to calculate the loss
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


#### Hyperparameter Tuning for Neural Network Model

In [66]:
import numpy as np
from sklearn.model_selection import GridSearchCV

# Define target variables and hyperparameter grid
target_vars = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']
param_grid = {
    'hidden_layer_size': [32, 64, 128],
    'learning_rate': [0.001, 0.01, 0.1]
}

# Initialize results and best neural network model dictionary
results = {}
best_nn_classifier={}

# Iterate over each target variable
for target_column in target_vars:
    #convert training datasets to tensors because TensorFlow's neural network models need the input data in tensor.
    X_train_nn = tf.convert_to_tensor(X_train_selected, dtype=tf.float32)
    y_train_nn = tf.convert_to_tensor(y_train[target_column].values, dtype=tf.float32)

    # Convert tensors to numpy arrays to be use in GridSearchCV which require the input data in NumPy Arrays. 
    # The GridSearchCV will be use to find the optimize parameters. 
    X_train_np = np.array(X_train_nn)
    y_train_np = np.array(y_train_nn)

    print(f"Hyperparameter tuning for {target_column}...")

    # Create the KerasClassifier model
    model = tf.keras.wrappers.scikit_learn.KerasClassifier(
        build_fn=create_model, input_dim=X_train_nn.shape[1], epochs=50, batch_size=32, verbose=0)

    # Perform GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train_np, y_train_np)

    # Get the best model and parameters
    best_model = grid_search.best_estimator_
    results[target_column] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_
    }

    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_}")
    print("\n")
    
    nn_best_params = tf.keras.wrappers.scikit_learn.KerasClassifier(
        build_fn=create_model,
        hidden_layer_size=grid_search.best_params_['hidden_layer_size'],
        learning_rate=grid_search.best_params_['learning_rate'],
        input_dim=X_train_nn.shape[1],
        epochs=50,
        batch_size=32,
        verbose=0)
    nn_best_params.fit(X_train_nn, y_train_nn)
    best_nn_classifier[target_column]=nn_best_params

# Print overall results
print('Model Accuracy for each target variable:')
for target, accuracy in results.items():
    print(f'{target}: {accuracy}')


Hyperparameter tuning for Hinselmann...
Best parameters found: {'hidden_layer_size': 32, 'learning_rate': 0.1}
Best cross-validation accuracy: 0.9583333333333334


Hyperparameter tuning for Schiller...
Best parameters found: {'hidden_layer_size': 32, 'learning_rate': 0.1}
Best cross-validation accuracy: 0.9116666666666667


Hyperparameter tuning for Citology...
Best parameters found: {'hidden_layer_size': 64, 'learning_rate': 0.1}
Best cross-validation accuracy: 0.94


Hyperparameter tuning for Biopsy...
Best parameters found: {'hidden_layer_size': 32, 'learning_rate': 0.1}
Best cross-validation accuracy: 0.9333333333333332


Model Accuracy for each target variable:
Hinselmann: {'best_params': {'hidden_layer_size': 32, 'learning_rate': 0.1}, 'best_score': 0.9583333333333334}
Schiller: {'best_params': {'hidden_layer_size': 32, 'learning_rate': 0.1}, 'best_score': 0.9116666666666667}
Citology: {'best_params': {'hidden_layer_size': 64, 'learning_rate': 0.1}, 'best_score': 0.94}
Biopsy: {'

#### Building Hybrid NN-KNN Model

In [67]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the neural network
def create_nn(input_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    # use dropout to prevent overfitting by randomly setting a fraction of input units at each update during training.
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    return model

# Create and train the neural network
input_dim = X_train_scaled.shape[1]
nn_model = create_nn(input_dim)
# Output layer for multi-label classification
nn_model.add(Dense(4, activation='sigmoid'))  
# Use binary crossentropy to calculate the loss
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model to the training datasets. Validation set is using to monitor model performance on unseen data during training.
nn_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_data=(X_val_scaled, y_val))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2440c501630>

In [68]:
# Create a model that outputs the features from an intermediate layer
feature_extractor = tf.keras.Model(inputs=nn_model.input, outputs=nn_model.layers[-3].output)

#These extracted features can be used as input for a K-Nearest Neighbors (KNN) model for future analysis and prediction
X_train_features = feature_extractor.predict(X_train_scaled)
X_val_features = feature_extractor.predict(X_val_scaled)
X_test_features = feature_extractor.predict(X_test_scaled)




#### Hyperparameter Tuning for Hybrid (NN-KNN) Model

In [69]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Define the parameters for KNN
param_knn = {
    "n_neighbors": range(1, 31),
    "weights": ["distance", "uniform"],
    "metric": ["euclidean", "minkowski", "manhattan"]
}
#This dictionary will store the best KNN models (optimized with GridSearchCV) for each target column.
best_Hybridknn_classifiers = {}

# Hyperparameter tuning and training for each target column
for target_column in y_train.columns:
    # Initialize the K-Nearest Neighbors classifier
    knn = KNeighborsClassifier()
    
    # Fit the grid search for the current target column
    grid_search = GridSearchCV(estimator=knn, param_grid=param_knn,
                               scoring="accuracy", cv=5)
    grid_search.fit(X_train_features, y_train[target_column])
    
    # Print the optimal parameters
    print(f"Best Parameters for {target_column}: {grid_search.best_params_}")
    
    # Train KNN model with optimal parameters found by GridSearchCV
    knn_best_params = KNeighborsClassifier(**grid_search.best_params_)
    knn_best_params.fit(X_train_features, y_train[target_column])
    best_Hybridknn_classifiers[target_column] = knn_best_params


Best Parameters for Hinselmann: {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'uniform'}
Best Parameters for Schiller: {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}
Best Parameters for Citology: {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}
Best Parameters for Biopsy: {'metric': 'euclidean', 'n_neighbors': 16, 'weights': 'distance'}


In [70]:
#evaluation of model with the best parameters for each target column using validation datasets. 
for target_column, model in best_Hybridknn_classifiers.items():
    # Evaluate on the validation set
    y_val_pred = model.predict(X_val_features)
    print(f"Validation set performance after hyperparameter tuning for target '{target_column}':")
    
    # Print confusion matrix for validation set
    print("Confusion Matrix (Validation):")
    print(confusion_matrix(y_val[target_column], y_val_pred))
    
    # Print classification report for validation set
    print("Classification Report (Validation):")
    print(classification_report(y_val[target_column], y_val_pred, zero_division=0))

Validation set performance after hyperparameter tuning for target 'Hinselmann':
Confusion Matrix (Validation):
[[81  0]
 [ 5  0]]
Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        81
           1       0.00      0.00      0.00         5

    accuracy                           0.94        86
   macro avg       0.47      0.50      0.49        86
weighted avg       0.89      0.94      0.91        86

Validation set performance after hyperparameter tuning for target 'Schiller':
Confusion Matrix (Validation):
[[75  1]
 [10  0]]
Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.88      0.99      0.93        76
           1       0.00      0.00      0.00        10

    accuracy                           0.87        86
   macro avg       0.44      0.49      0.47        86
weighted avg       0.78      0.87      0.82        86

Validation set p

#### Evaluate the models

In this code, we evaluate two predictive models: Neural Network and Hybrid Neural Network-K Nearest Neighbour. We uses key performance metrics to evaluate the models including accuracy, precision, recall and f1-score. The performance of models presented in confusion matrix and classification report.
In this section, we focus on evaluate the optimized models using testing dataset as explained below:

**1. Neural network**: 
- For each target variable, we uses their corressponding best parameters from hyperparameter tuning to predict the target of the test set. This helps us to assess the performance of the models to unseen data. 

**2. Hybrid (NN-KNN)**: 
- For each target variable, we evaluates the performance of the best Hybrid (NN-KNN) classifiers, which combine input features extracted by Neural Network model and best parameter tuned by KNN classifiers.
___________________________________________

#### Evaluating Neural Network Model

In [71]:
# Evaluate the best model on the test set
# This dictionary will store the accuracy results for each target column.
test_results_NN = {}
#Iterate over each target with the corresponding best parameters NN model to evaluate the performance.
for target_column, best_params in results.items():
    #convert training and testing datasets to TensorFlow tensor to be used in Neural Network model
    X_train_nn = tf.convert_to_tensor(X_train_selected, dtype=tf.float32)
    y_train_nn = tf.convert_to_tensor(y_train[target_column].values, dtype=tf.float32)
    X_test_nn = tf.convert_to_tensor(X_test_selected, dtype=tf.float32)
    y_test_nn = tf.convert_to_tensor(y_test[target_column].values, dtype=tf.float32)
    print(f"Evaluating best model for {target_column} on the test set...")
    
    # Create the model with best parameters
    model = create_model(X_train_nn.shape[1], **best_params['best_params'])
    
    # Train the model on the full training set with best parameters
    model.fit(X_train_nn, y_train_nn, epochs=50, batch_size=32, verbose=0)
    
    # Predict on the test set to get individual predictions
    y_test_pred = model.predict(X_test_nn)
    y_test_pred = (y_test_pred > 0.5).astype(int)  # Assuming binary classification

     # Print confusion matrix for test set
    print("Confusion Matrix (Test):")
    print(confusion_matrix(y_test_nn, y_test_pred))
    
    # Print accuracy score for test set
    print("Accuracy Score (Test):")
    test_results_NN[target_column]= accuracy_score(y_test_nn, y_test_pred)
    print( test_results_NN[target_column])
    
    # Print classification report for test set
    print("Classification Report (Test):")
    print(classification_report(y_test_nn, y_test_pred, zero_division=0))
    print("\n")

# Print overall test results
print('Test set accuracy for each target variable:')
for target, result in test_results_NN.items():
    print(f'{target}: {result}')


Evaluating best model for Hinselmann on the test set...
Confusion Matrix (Test):
[[167   0]
 [  5   0]]
Accuracy Score (Test):
0.9709302325581395
Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       167
         1.0       0.00      0.00      0.00         5

    accuracy                           0.97       172
   macro avg       0.49      0.50      0.49       172
weighted avg       0.94      0.97      0.96       172



Evaluating best model for Schiller on the test set...
Confusion Matrix (Test):
[[161   0]
 [ 11   0]]
Accuracy Score (Test):
0.936046511627907
Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97       161
         1.0       0.00      0.00      0.00        11

    accuracy                           0.94       172
   macro avg       0.47      0.50      0.48       172
weighted avg       0.88      0.94      0.91       

#### Evaluating Hybrid (NN-KNN) Model

In [72]:
# Evaluate the best model on the test set
# This dictionary will store the accuracy results for each target column.
test_results_Hybrid = {}
#Iterate over each target with the corresponding best parameters Hybrid (NN-KNN) model to evaluate the performance.
for target_column, model in best_Hybridknn_classifiers.items():
    
    # Evaluate on the test set
    y_test_pred = model.predict(X_test_features)
    y_test_pred = (y_test_pred > 0.5).astype(int) 
    print(f"Test set performance after hyperparameter tuning for target '{target_column}':")
    
    # Print confusion matrix for test set
    print("Confusion Matrix (Test):")
    print(confusion_matrix(y_test[target_column], y_test_pred))
    
    # Print accuracy score for test set
    print("Accuracy Score (Test):")
    test_results_Hybrid[target_column]= accuracy_score(y_test[target_column], y_test_pred)
    print( test_results_Hybrid[target_column])
    
    # Print classification report for test set
    print("Classification Report (Test):")
    print(classification_report(y_test[target_column], y_test_pred, zero_division=0))
    print("\n")
    
# Print overall test results
print('Test set accuracy for each target variable:')
for target, result in test_results_Hybrid.items():
    print(f'{target}: {result}')


Test set performance after hyperparameter tuning for target 'Hinselmann':
Confusion Matrix (Test):
[[166   1]
 [  5   0]]
Accuracy Score (Test):
0.9651162790697675
Classification Report (Test):
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       167
           1       0.00      0.00      0.00         5

    accuracy                           0.97       172
   macro avg       0.49      0.50      0.49       172
weighted avg       0.94      0.97      0.95       172



Test set performance after hyperparameter tuning for target 'Schiller':
Confusion Matrix (Test):
[[161   0]
 [ 11   0]]
Accuracy Score (Test):
0.936046511627907
Classification Report (Test):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       161
           1       0.00      0.00      0.00        11

    accuracy                           0.94       172
   macro avg       0.47      0.50      0.48       172
weighted avg  

#### Comparison Between Neural Network and Hybrid (NN-KNN)

In [74]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize dictionaries to store performance metrics
metrics_nn = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}
metrics_hybrid = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}

# Function to evaluate a model and return all necessary metrics
def evaluate_model(model, X_test, y_test, target_column):
    y_test_pred = model.predict(X_test)
    y_test_pred = (y_test_pred > 0.5).astype(int)  # Assuming binary classification
    
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)
    
    return accuracy, precision, recall, f1

# Evaluate neural network model and store metrics
for target_column, model in best_nn_classifier.items():
    X_test_nn = tf.convert_to_tensor(X_test_selected, dtype=tf.float32)
    y_test_nn = tf.convert_to_tensor(y_test[target_column].values, dtype=tf.float32)
    
#     # Create the model with best parameters
#     model_nn = create_model(X_test_nn.shape[1], **best_params['best_params'])
    
#     # Train the model on the full training set with best parameters
#     model_nn.fit(X_train_nn, y_train_nn, epochs=50, batch_size=32, verbose=0)
    
    # Evaluate the model
    acc_nn, prec_nn, rec_nn, f1_nn = evaluate_model(model, X_test_nn, y_test_nn, target_column)
    metrics_nn['Accuracy'].append(acc_nn)
    metrics_nn['Precision'].append(prec_nn)
    metrics_nn['Recall'].append(rec_nn)
    metrics_nn['F1 Score'].append(f1_nn)

# Evaluate hybrid model and store metrics
for target_column, model in best_Hybridknn_classifiers.items():
    X_test_features = feature_extractor.predict(X_test_scaled)
    
    # Evaluate the model
    acc_hybrid, prec_hybrid, rec_hybrid, f1_hybrid = evaluate_model(model, X_test_features, y_test[target_column], target_column)
    metrics_hybrid['Accuracy'].append(acc_hybrid)
    metrics_hybrid['Precision'].append(prec_hybrid)
    metrics_hybrid['Recall'].append(rec_hybrid)
    metrics_hybrid['F1 Score'].append(f1_hybrid)

# Plotting function using Plotly
def plot_metric_comparison(metrics_nn, metrics_hybrid, target_columns):
    labels = target_columns
    x = np.arange(len(labels))  # the label locations
    
    fig = make_subplots(rows=2, cols=2, subplot_titles=('Accuracy Comparison', 'Precision Comparison', 'Recall Comparison', 'F1 Score Comparison'))
    
    # Accuracy comparison
    fig.add_trace(go.Bar(x=labels, y=metrics_nn['Accuracy'], name='Neural Network', marker_color='rgba(50, 171, 96, 0.6)'), row=1, col=1)
    fig.add_trace(go.Bar(x=labels, y=metrics_hybrid['Accuracy'], name='Hybrid Model', marker_color='rgba(255, 140, 0, 0.6)'), row=1, col=1)
    
    # Precision comparison
    fig.add_trace(go.Bar(x=labels, y=metrics_nn['Precision'], name='Neural Network', marker_color='rgba(50, 171, 96, 0.6)'), row=1, col=2)
    fig.add_trace(go.Bar(x=labels, y=metrics_hybrid['Precision'], name='Hybrid Model', marker_color='rgba(255, 140, 0, 0.6)'), row=1, col=2)
    
    # Recall comparison
    fig.add_trace(go.Bar(x=labels, y=metrics_nn['Recall'], name='Neural Network', marker_color='rgba(50, 171, 96, 0.6)'), row=2, col=1)
    fig.add_trace(go.Bar(x=labels, y=metrics_hybrid['Recall'], name='Hybrid Model', marker_color='rgba(255, 140, 0, 0.6)'), row=2, col=1)
    
    # F1 Score comparison
    fig.add_trace(go.Bar(x=labels, y=metrics_nn['F1 Score'], name='Neural Network', marker_color='rgba(50, 171, 96, 0.6)'), row=2, col=2)
    fig.add_trace(go.Bar(x=labels, y=metrics_hybrid['F1 Score'], name='Hybrid Model', marker_color='rgba(255, 140, 0, 0.6)'), row=2, col=2)
    
    fig.update_xaxes(title_text='Target Columns', row=1, col=1)
    fig.update_xaxes(title_text='Target Columns', row=1, col=2)
    fig.update_xaxes(title_text='Target Columns', row=2, col=1)
    fig.update_xaxes(title_text='Target Columns', row=2, col=2)
    
    fig.update_yaxes(title_text='Metric Value', row=1, col=1)
    fig.update_yaxes(title_text='Metric Value', row=1, col=2)
    fig.update_yaxes(title_text='Metric Value', row=2, col=1)
    fig.update_yaxes(title_text='Metric Value', row=2, col=2)
    
    
    fig.update_layout(title='Metric Comparison for Neural Network and Hybrid Models', height=800, width=900)
    
    fig.show()

# List of target columns
target_columns = list(results.keys())

# Plot metrics using Plotly
plot_metric_comparison(metrics_nn, metrics_hybrid, target_columns)



