## BRIEF OVERVIEW 

There are two pipelines:
1) Jia En Low: ordinal encoder + label encoder -> MICE imputer -> standard scaler -> MultiTaskElasticNet feature selection -> GBT model / logistic model

2) Kar Yan Ng: one-hot encoder -> simple imputer (mean) -> standard scaler -> PLS-based feature selection (SelectFromModel) -> GBT model / logistic model

Each author's analysis is at the end of the notebook.
  

In [None]:
pip install fancyimpute==0.7

In [None]:
pip install plotly

In [None]:
# The PyPI distribution is named "scikit-learn" (hyphen); it is imported as `sklearn`.
pip install scikit-learn

In [None]:
# `sklearn.metrics` is a module inside scikit-learn, not an installable package;
# pin the package itself to the intended version.
conda install -c conda-forge scikit-learn==1.0.2

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.graph_objects as go

from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from statistics import mean 
from sklearn.linear_model import MultiTaskElasticNet, MultiTaskElasticNetCV,  MultiTaskLassoCV

from sklearn.feature_selection import SelectFromModel
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_validate, KFold
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
#import shap

In [None]:
# Read the three competition CSV files from the working directory.
train_df = pd.read_csv('training_set_features.csv')
labels = pd.read_csv('training_set_labels.csv')
test_df = pd.read_csv('test_set_features.csv')

# Attach the label columns to the training features, matching rows on respondent_id.
train_df_labels = pd.merge(
    train_df,
    labels,
    on='respondent_id',
    how='inner',
)

# Stack training and test features row-wise (each frame keeps its own row index).
frames = [train_df, test_df]
df_merged = pd.concat(frames)

### EXPLORING DATASET

In [None]:
# Print the column dtypes and non-null counts of the merged training frame.
train_df_labels.info()

In [None]:
# Boolean frame over columns 1..38 (by position): True where a value is present.
missingdata = train_df_labels.iloc[:, 1:39].notna()

# For every inspected column, print its name followed by the present/missing counts.
for column in missingdata.columns:
    print(column)
    print(missingdata[column].value_counts())
    print("")

### PREPROCESSING 

Drop rows that have fewer than 16 non-missing values

In [None]:
# thresh=16 keeps only the rows with at least 16 non-null cells; all other rows
# are removed from train_df_labels in place.
train_df_labels.dropna(thresh=16, inplace = True)

combination of label and ordinal encoding

In [None]:
## ORDINAL ENCODING 

def ordinal_encoder(data, feature, feature_rank):
    """Replace the categories of one column with their 1-based rank, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column to encode; the column is overwritten.
    feature : str
        Name of the column to encode.
    feature_rank : sequence
        Category values ordered from lowest to highest; the first one maps
        to 1, the second to 2, and so on.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    KeyError
        If the column holds a non-missing value that is not listed in
        ``feature_rank``.

    Notes
    -----
    Missing values stay missing (NaN) instead of raising, so the column can
    be encoded before imputation. A column without missing values comes back
    with integer ranks.
    """
    ordinal_dict = {value: rank for rank, value in enumerate(feature_rank, start=1)}

    column = data[feature]

    # Fail loudly on categories that have no rank; otherwise they would be
    # silently turned into NaN by the dict-based map below.
    unknown = set(column.dropna().unique()).difference(ordinal_dict)
    if unknown:
        raise KeyError(
            f"{feature!r} contains values missing from feature_rank: "
            f"{sorted(map(repr, unknown))}"
        )

    # Mapping with a dict leaves NaN entries untouched.
    data[feature] = column.map(ordinal_dict)

    return data

# Fill the gaps in income_poverty with the category the authors identified as
# its mode, so the column is fully populated before it is ordinal-encoded.
from collections import Counter
Counter(train_df_labels["income_poverty"])  # category tally; the result is discarded here
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, which
# pandas treats as chained assignment and which does not update the frame
# under copy-on-write.
train_df_labels['income_poverty'] = train_df_labels['income_poverty'].fillna('<= $75,000, Above Poverty')

# Ordinal-encode the two ordered categorical columns (ranks start at 1).
ordinal_encoder(train_df_labels, 'age_group', ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'])
ordinal_encoder(train_df_labels, 'income_poverty', ['Below Poverty', '<= $75,000, Above Poverty', '> $75,000'])


# LABEL ENCODING
le = LabelEncoder()
cols_to_encode = ['employment_industry', 'employment_occupation', 'education', 'health_insurance', 'doctor_recc_h1n1', 'doctor_recc_seasonal']

# The single encoder is refit for every column, so afterwards `le` only holds
# the classes of the last column it was fitted on.
# NOTE(review): these columns are encoded without masking nulls first, so a
# missing value presumably becomes an ordinary code here and is no longer
# visible as missing to the imputer that runs later - confirm this is intended.
for col in cols_to_encode:
    train_df_labels[col] = le.fit_transform(train_df_labels[col])


# Integer-encode every column using only its non-null entries; rows that were
# null in a column are absent from that column's result and therefore come back
# as missing once the per-column Series are re-assembled into a frame.
train_df_labels_withNA = train_df_labels.apply(lambda series: pd.Series(
    le.fit_transform(series[series.notnull()]),
    index=series[series.notnull()].index
))

dealing with missing data

In [None]:
# Use pandas' nullable integer dtype for every column of the encoded frame.
train_df_labels_withNA = train_df_labels_withNA.astype("Int64")

# Impute the missing entries with fancyimpute's IterativeImputer (default settings).
mice_imputer = IterativeImputer()
mice_imputed_df = mice_imputer.fit_transform(train_df_labels_withNA)

# Round the imputed values to whole numbers, restore the column names and
# return to the nullable integer dtype.
mice_imputed_df = mice_imputed_df.round(0)
mice_imputed_df = pd.DataFrame(
    mice_imputed_df,
    columns=train_df_labels_withNA.columns,
).astype("Int64")

corr matrix 

In [None]:
# Pairwise correlations between all columns of the imputed frame.
corrMatt = mice_imputed_df.corr()

# Mask array: starts as all zeros, then the upper triangle (diagonal included)
# is set to 1 so that seaborn hides those cells.
mask = np.zeros_like(corrMatt)
upper_rows, upper_cols = np.triu_indices_from(mask)
mask[upper_rows, upper_cols] = True

# Figure and title.
fig, ax = plt.subplots(figsize=(20, 12))
plt.title('Flu dataset correlation')

# Diverging colour map for the correlation values.
cmap = sns.diverging_palette(260, 10, as_cmap=True)

# Annotated heatmap of the unmasked cells.
sns.heatmap(
    corrMatt,
    mask=mask,
    cmap=cmap,
    vmax=1.2,
    square=False,
    annot=True,
    fmt='.2g',
    linewidths=1,
    ax=ax,
)

Split dataset to features and outcome 

In [None]:
# The two outcome columns, cast to plain integers.
seas = mice_imputed_df['seasonal_vaccine'].astype('int')
h1n1 = mice_imputed_df['h1n1_vaccine'].astype('int')

# Every remaining column is treated as a feature.
x = mice_imputed_df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

# Multilabel target: column 0 is seasonal_vaccine, column 1 is h1n1_vaccine.
y = pd.concat([seas, h1n1], axis=1)

Data Normalisation

In [None]:
# Standardise every feature column. The transformed array must be captured:
# fit_transform returns a new array and leaves `x` unchanged, so wrapping `x`
# itself would yield an unscaled frame. Column names and the row index are
# carried over so later cells can still select by name / position.
scaler = StandardScaler()
scaled_x = pd.DataFrame(
    scaler.fit_transform(x.astype('float64')),  # plain floats instead of the nullable integer dtype
    columns=x.columns,
    index=x.index,
)

## PIPELINE BY JIA EN LOW


### LOGISTIC REGRESSION + MULTITASKELASTICNET

before feature selection

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Baseline without feature selection: one logistic regression per target column.
clf_lr_bfs = MultiOutputClassifier(LogisticRegression())
clf_lr_bfs.fit(scaled_x, y)  # full-data fit; cross_val_predict below fits its own copies

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold class probabilities: one (n_samples, n_classes) array per target.
y_pred_proba = cross_val_predict(clf_lr_bfs, scaled_x, y, method='predict_proba', cv=cv)

# Probability of the positive class for each target.
y_pred_proba_target1 = y_pred_proba[0][:, 1]
y_pred_proba_target2 = y_pred_proba[1][:, 1]

# ROC AUC per target, pairing target k with column k of y.
roc_auc_target1 = roc_auc_score(y.iloc[:, 0], y_pred_proba_target1)
roc_auc_target2 = roc_auc_score(y.iloc[:, 1], y_pred_proba_target2)

print("ROC AUC score for Seasonal Flu vaccine:", roc_auc_target1)
print("ROC AUC score for H1N1 Vaccine:", roc_auc_target2)


after feature selection 

In [None]:
# nested K-fold 
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score

# Pipeline: feature selection driven by MultiTaskElasticNet coefficients,
# followed by one logistic regression per target column.
pipeline = Pipeline([
    ('fs', SelectFromModel(MultiTaskElasticNet())),
    ('clf', MultiOutputClassifier(LogisticRegression(), n_jobs=-1)),
])

# Hyperparameters of the selector's estimator that the inner search tunes.
grid_params = {
    'fs__estimator__alpha': [0.01, 0.1, 1.0, 10],
    'fs__estimator__l1_ratio': [0.1, 0.3, 0.5, 1.0],
}

# Outer folds estimate generalisation; inner folds (below) pick hyperparameters.
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# One ROC AUC per outer fold and target.
roc_auc_scores_target1 = []
roc_auc_scores_target2 = []

for train_outer, test_outer in outer_cv.split(scaled_x, y):
    # KFold yields positional indices, so rows are selected by position for
    # both the features and the targets.
    x_train_outer, x_test_outer = scaled_x.iloc[train_outer], scaled_x.iloc[test_outer]
    y_train_outer, y_test_outer = y.iloc[train_outer], y.iloc[test_outer]

    # Inner search: tune on the outer training fold only.
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    clf = GridSearchCV(pipeline, grid_params, cv=inner_cv, scoring='roc_auc', return_train_score=True)
    clf.fit(x_train_outer, y_train_outer)
    logistic_best = clf.best_estimator_

    # Score the tuned model (already refit on the whole outer training fold)
    # on the untouched outer test fold. Running cross_val_predict on the test
    # fold instead would discard that fit and train fresh copies on slices of
    # the test data, which is not what nested cross-validation measures.
    y_pred_proba = logistic_best.predict_proba(x_test_outer)

    # Probability of the positive class for each target.
    y_pred_proba_target1 = y_pred_proba[0][:, 1]
    y_pred_proba_target2 = y_pred_proba[1][:, 1]

    roc_auc_scores_target1.append(roc_auc_score(y_test_outer.iloc[:, 0], y_pred_proba_target1))
    roc_auc_scores_target2.append(roc_auc_score(y_test_outer.iloc[:, 1], y_pred_proba_target2))

# Mean ROC AUC per target across the outer folds.
print("Mean ROC AUC score for Seasonal Flu vaccine:", np.mean(roc_auc_scores_target1))
print("Mean ROC AUC score for H1N1 Vaccine:", np.mean(roc_auc_scores_target2))


## PIPELINE BY KAR YAN NG

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.cross_decomposition import PLSRegression, CCA

# Load data
features_dataset = pd.read_csv('training_set_features.csv')
target_dataset = pd.read_csv('training_set_labels.csv')

# Split the columns by dtype: object columns are one-hot encoded, numeric
# columns are mean-imputed.
categorical_columns = features_dataset.select_dtypes(include=['object']).columns
numerical_columns = features_dataset.select_dtypes(include=['float64', 'int64']).columns

# sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse
# matrix, and a sparse combined output cannot be centred by the
# StandardScaler applied right after.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns),
    ],
    sparse_threshold=0,
)

# Apply preprocessing (features_dataset becomes a plain array from here on)
features_dataset = preprocessor.fit_transform(features_dataset)

# Standardize the features
scaler = StandardScaler()
scaled_x = scaler.fit_transform(features_dataset)

# Targets: column 0 is h1n1_vaccine, column 1 is seasonal_vaccine.
y = target_dataset[['h1n1_vaccine', 'seasonal_vaccine']]

# PLS-based feature selection followed by one logistic regression per target.
pls_pipeline = Pipeline([
    ('fs', SelectFromModel(PLSRegression(n_components=2))),
    ('clf', MultiOutputClassifier(LogisticRegression(), n_jobs=-1))
])

# CCA variant, currently disabled:
#cca_pipeline = Pipeline([
#    ('fs', SelectFromModel(CCA(n_components=2))),
#    ('clf', MultiOutputClassifier(LogisticRegression(), n_jobs=-1))
#])

# Cross-validated evaluation of a pipeline on the module-level scaled_x / y.
def nested_cv(pipeline):
    """Return the mean held-out ROC AUC of ``pipeline`` for both targets.

    The module-level ``scaled_x`` (array) and ``y`` (two-column DataFrame) are
    split into 5 shuffled folds. For every fold the pipeline is fitted on the
    training part and scored on the held-out part.

    Parameters
    ----------
    pipeline : estimator
        Object with ``fit`` and a ``predict_proba`` that returns one
        probability array per target. It is fitted in place, so after the
        call it holds the fit from the last fold.

    Returns
    -------
    tuple of float
        ``(mean AUC for y column 0, mean AUC for y column 1)``.
    """
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    roc_auc_scores_target1 = []
    roc_auc_scores_target2 = []
    for train_outer, test_outer in outer_cv.split(scaled_x, y):
        x_train_outer, x_test_outer = scaled_x[train_outer], scaled_x[test_outer]
        y_train_outer, y_test_outer = y.iloc[train_outer], y.iloc[test_outer]
        pipeline.fit(x_train_outer, y_train_outer)
        # Score the model that was just fitted on the training part. Calling
        # cross_val_predict on the test fold would ignore this fit and train
        # new copies on slices of the held-out data.
        y_pred_proba = pipeline.predict_proba(x_test_outer)
        y_pred_proba_target1 = y_pred_proba[0][:, 1]
        y_pred_proba_target2 = y_pred_proba[1][:, 1]
        roc_auc_scores_target1.append(roc_auc_score(y_test_outer.iloc[:, 0], y_pred_proba_target1))
        roc_auc_scores_target2.append(roc_auc_score(y_test_outer.iloc[:, 1], y_pred_proba_target2))
    return np.mean(roc_auc_scores_target1), np.mean(roc_auc_scores_target2)

# Evaluate the PLS pipeline; the result is a pair ordered like the columns of y.
pls_roc_auc_scores = nested_cv(pls_pipeline)
pls_auc_h1n1 = pls_roc_auc_scores[0]
pls_auc_seasonal = pls_roc_auc_scores[1]

# CCA variant, currently disabled:
#cca_roc_auc_scores = nested_cv(cca_pipeline)

# Report the mean ROC AUC of each target.
print("PLS Pipeline")
print("Mean ROC AUC score for H1N1 Vaccine:", pls_auc_h1n1)
print("Mean ROC AUC score for Seasonal Flu Vaccine:", pls_auc_seasonal)
#print("\nCCA Pipeline")
#print("Mean ROC AUC score for H1N1 Vaccine:", cca_roc_auc_scores[0])
#print("Mean ROC AUC score for Seasonal Flu Vaccine:", cca_roc_auc_scores[1])


PLS FEATURE SELECTION (CCA VARIANT DISABLED) + ONEHOTENCODER + LOGISTIC REGRESSION

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.cross_decomposition import PLSRegression, CCA

# Read the raw feature and label tables.
features_dataset = pd.read_csv('training_set_features.csv')
target_dataset = pd.read_csv('training_set_labels.csv')

# Work on copies so the frames read above stay untouched.
K_features_dataset = features_dataset.copy()
K_target_dataset = target_dataset.copy()

# Column groups by dtype.
categorical_columns = K_features_dataset.select_dtypes(include=['object']).columns
numerical_columns = K_features_dataset.select_dtypes(include=['float64', 'int64']).columns

# One-hot encode the object columns into a dense array.
encoder = OneHotEncoder()
categorical_block = K_features_dataset[categorical_columns]
encoded_categorical = encoder.fit_transform(categorical_block).toarray()

# Mean-impute the numeric columns in place on the copy.
imputer = SimpleImputer(strategy='mean')
K_features_dataset[numerical_columns] = imputer.fit_transform(K_features_dataset[numerical_columns])

# Feature matrix: one-hot columns first, then the imputed numeric columns.
# Despite its name, no scaling is applied to scaled_x in this cell.
scaled_x = np.concatenate(
    [encoded_categorical, K_features_dataset[numerical_columns]],
    axis=1,
)

# Targets: column 0 is h1n1_vaccine, column 1 is seasonal_vaccine.
y = K_target_dataset[['h1n1_vaccine', 'seasonal_vaccine']]

# Values of n_components to try.
components = [2]

for comp in components:
    # PLS-based feature selection followed by one logistic regression per target.
    pls_pipeline = Pipeline([
        ('fs', SelectFromModel(PLSRegression(n_components=comp))),
        ('clf', MultiOutputClassifier(LogisticRegression(), n_jobs=-1))
    ])

    # CCA variant, currently disabled:
#    cca_pipeline = Pipeline([
#       ('fs', SelectFromModel(CCA(n_components=comp))),
#       ('clf', MultiOutputClassifier(LogisticRegression(), n_jobs=-1))
#    ])

    outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

    def nested_cv(pipeline):
        """Return the mean held-out ROC AUC of ``pipeline`` for both targets.

        Uses the module-level ``scaled_x``, ``y`` and ``outer_cv``: for each
        fold the pipeline is fitted (in place) on the training part and
        scored on the held-out part. The result is ordered like the columns
        of ``y``.
        """
        roc_auc_scores_target1 = []
        roc_auc_scores_target2 = []
        for train_outer, test_outer in outer_cv.split(scaled_x, y):
            x_train_outer, x_test_outer = scaled_x[train_outer], scaled_x[test_outer]
            y_train_outer, y_test_outer = y.iloc[train_outer], y.iloc[test_outer]
            pipeline.fit(x_train_outer, y_train_outer)
            # Score the model fitted on the training part; cross_val_predict
            # on the test fold would ignore this fit and train new copies on
            # slices of the held-out data.
            y_pred_proba = pipeline.predict_proba(x_test_outer)
            y_pred_proba_target1 = y_pred_proba[0][:, 1]
            y_pred_proba_target2 = y_pred_proba[1][:, 1]
            roc_auc_scores_target1.append(roc_auc_score(y_test_outer.iloc[:, 0], y_pred_proba_target1))
            roc_auc_scores_target2.append(roc_auc_score(y_test_outer.iloc[:, 1], y_pred_proba_target2))
        return np.mean(roc_auc_scores_target1), np.mean(roc_auc_scores_target2)

    pls_roc_auc_scores = nested_cv(pls_pipeline)
    # The CCA evaluation stays disabled together with cca_pipeline above;
    # calling nested_cv(cca_pipeline) while that definition is commented out
    # raises NameError.
#    cca_roc_auc_scores = nested_cv(cca_pipeline)

    print(f"PLS Pipeline with {comp} components")
    print("Mean ROC AUC score for H1N1 Vaccine:", pls_roc_auc_scores[0])
    print("Mean ROC AUC score for Seasonal Flu Vaccine:", pls_roc_auc_scores[1])

#     print(f"\nCCA Pipeline with {comp} components")
#     print("Mean ROC AUC score for H1N1 Vaccine:", cca_roc_auc_scores[0])
#     print("Mean ROC AUC score for Seasonal Flu Vaccine:", cca_roc_auc_scores[1])



In [None]:
# Bare reference to the imported class; it has no effect other than being
# echoed when this is the last expression of a notebook cell.
PLSRegression

before feature selection

## PIPELINE BY JIA EN LOW 

### GRADIENT BOOSTING CLASSIFIER 

In [None]:
# Wrap the current feature matrix in a DataFrame; later cells index it with
# .iloc and read .columns.
# NOTE(review): if the cells are run top to bottom, scaled_x and y at this
# point are the ones built in the preceding one-hot/PLS cell (y ordered as
# h1n1_vaccine, seasonal_vaccine), not the MICE-imputed ones - confirm which
# data this section is meant to use.
scaled_x = pd.DataFrame(scaled_x)

before feature selection 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Baseline without feature selection: one gradient-boosting model per target column.
clf_beforefs = MultiOutputClassifier(GradientBoostingClassifier())
clf_beforefs.fit(scaled_x, y)  # full-data fit; cross_val_predict below fits its own copies

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold class probabilities: one (n_samples, n_classes) array per target.
y_pred_proba = cross_val_predict(clf_beforefs, scaled_x, y, method='predict_proba', cv=cv)

# Probability of the positive class for each target.
y_pred_proba_target1 = y_pred_proba[0][:, 1]
y_pred_proba_target2 = y_pred_proba[1][:, 1]

# ROC AUC per target, pairing target k with column k of y.
roc_auc_target1 = roc_auc_score(y.iloc[:, 0], y_pred_proba_target1)
roc_auc_target2 = roc_auc_score(y.iloc[:, 1], y_pred_proba_target2)

print("ROC AUC score for Seasonal Flu vaccine:", roc_auc_target1)
print("ROC AUC score for H1N1 Vaccine:", roc_auc_target2)


after feature selection 

In [None]:
type(scaled_x)

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import SelectFromModel  # module path repaired (stray "/" removed)
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.metrics import roc_auc_score
import numpy as np

# Pipeline: feature selection driven by MultiTaskElasticNet coefficients,
# followed by one gradient-boosting model per target column.
pipeline = Pipeline([
    ('fs', SelectFromModel(MultiTaskElasticNet())),
    ('clf', MultiOutputClassifier(GradientBoostingClassifier(), n_jobs=-1))
])

# Hyperparameters tuned by the inner search (selector estimator + classifier).
grid_params = {
    'fs__estimator__alpha': [0.01, 0.1, 1.0, 10],
    'fs__estimator__l1_ratio': [0.1, 0.3, 0.5, 1.0],
    'clf__estimator__learning_rate': [0.01, 0.05, 0.1],
    'clf__estimator__max_depth': [5, 8, 12],
    'clf__estimator__criterion': ['friedman_mse', 'squared_error']
}

# The same 5-fold splitter drives both the outer loop and the inner search.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Inner search; it is refit on every outer training fold in the loop below.
clf = GridSearchCV(
    pipeline,
    grid_params,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1
)

# One ROC AUC per outer fold and target.
roc_auc_scores_target1 = []
roc_auc_scores_target2 = []
for train_outer_idx, test_outer_idx in cv.split(scaled_x, y):
    X_train_outer, X_test_outer = scaled_x.iloc[train_outer_idx], scaled_x.iloc[test_outer_idx]
    y_train_outer, y_test_outer = y.iloc[train_outer_idx], y.iloc[test_outer_idx]

    # Tune on the outer training fold only.
    clf.fit(X_train_outer, y_train_outer)
    rf_best = clf.best_estimator_

    # Score the tuned model (already refit on the whole outer training fold)
    # on the untouched outer test fold. Running cross_val_predict on the test
    # fold instead would discard that fit and train fresh copies on slices of
    # the test data.
    y_pred_proba = rf_best.predict_proba(X_test_outer)

    # Probability of the positive class for each target.
    y_pred_proba_target1 = y_pred_proba[0][:, 1]
    y_pred_proba_target2 = y_pred_proba[1][:, 1]

    roc_auc_scores_target1.append(roc_auc_score(y_test_outer.iloc[:, 0], y_pred_proba_target1))
    roc_auc_scores_target2.append(roc_auc_score(y_test_outer.iloc[:, 1], y_pred_proba_target2))

# Mean ROC AUC per target across the outer folds.
print("Mean ROC AUC score for Seasonal Flu vaccine:", np.mean(roc_auc_scores_target1))
print("Mean ROC AUC score for H1N1 Vaccine:", np.mean(roc_auc_scores_target2))


## PIPELINE BY KAR YAN NG

PLS FEATURE SELECTION (CCA VARIANT DISABLED) + ONEHOTENCODER + GRADIENT BOOSTING

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.cross_decomposition import PLSRegression, CCA

# Load data
features_dataset = pd.read_csv('training_set_features.csv')  # replace with your actual file
target_dataset = pd.read_csv('training_set_labels.csv')  # replace with your actual file

# Split the columns by dtype: object columns are one-hot encoded, numeric
# columns are mean-imputed.
categorical_columns = features_dataset.select_dtypes(include=['object']).columns
numerical_columns = features_dataset.select_dtypes(include=['float64', 'int64']).columns

# sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse
# matrix, and a sparse combined output cannot be centred by the
# StandardScaler applied right after.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns),
    ],
    sparse_threshold=0,
)

# Apply preprocessing (features_dataset becomes a plain array from here on)
features_dataset = preprocessor.fit_transform(features_dataset)

# Standardize the features
scaler = StandardScaler()
scaled_x = scaler.fit_transform(features_dataset)

# Targets: column 0 is h1n1_vaccine, column 1 is seasonal_vaccine.
y = target_dataset[['h1n1_vaccine', 'seasonal_vaccine']]

# PLS-based feature selection followed by one gradient-boosting model per target.
pls_pipeline = Pipeline([
    ('fs', SelectFromModel(PLSRegression(n_components=2))),
    ('clf', MultiOutputClassifier(GradientBoostingClassifier(), n_jobs=-1))
])

# CCA variant, currently disabled:
# cca_pipeline = Pipeline([
#     ('fs', SelectFromModel(CCA(n_components=2))),
#     ('clf', MultiOutputClassifier(GradientBoostingClassifier(), n_jobs=-1))
# ])

# Search space for the PLS pipeline: number of PLS components plus two
# gradient-boosting settings.
pls_param_grid = {
    'fs__estimator__n_components': [1, 2, 3],
    'clf__estimator__n_estimators': [50, 100, 200],
    'clf__estimator__learning_rate': [0.1, 0.05, 0.01]
}

# CCA variant, currently disabled:
# cca_param_grid = {
#    'fs__estimator__n_components': [1, 2, 3],
#    'clf__estimator__n_estimators': [50, 100, 200],
#    'clf__estimator__learning_rate': [0.1, 0.05, 0.01]
#}

# Grid search over the full data set, then pass the winning pipeline to the
# nested_cv helper defined in an earlier cell.
pls_grid_search = GridSearchCV(
    pls_pipeline,
    param_grid=pls_param_grid,
    scoring='roc_auc',
    cv=5,
)
pls_grid_search.fit(scaled_x, y)
best_pls_params = pls_grid_search.best_params_
best_pls_roc_auc_scores = nested_cv(pls_grid_search.best_estimator_)

# cca_grid_search = GridSearchCV(cca_pipeline, param_grid=cca_param_grid, cv=5, scoring='roc_auc')
# cca_grid_search.fit(scaled_x, y)
# best_cca_params = cca_grid_search.best_params_
# best_cca_roc_auc_scores = nested_cv(cca_grid_search.best_estimator_)

# Report the chosen parameters and the two mean scores (ordered like y's columns).
print("Best parameters for PLS pipeline:", best_pls_params)
print("Mean ROC AUC score for H1N1 Vaccine (PLS):", best_pls_roc_auc_scores[0])
print("Mean ROC AUC score for Seasonal Flu Vaccine (PLS):", best_pls_roc_auc_scores[1])

# print("\nBest parameters for CCA pipeline:", best_cca_params)
# print("Mean ROC AUC score for H1N1 Vaccine (CCA):", best_cca_roc_auc_scores[0])
# print("Mean ROC AUC score for Seasonal Flu Vaccine (CCA):", best_cca_roc_auc_scores[1])


### ANALYSIS - BY JIA EN LOW 

number of features with corresponding roc auc score 

In [None]:
# getting the performance 
from sklearn.linear_model import MultiTaskElasticNetCV
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Numbers of features to keep: 8, 10, ..., 34.
num_features = range(8, 36, 2)

# Mean of the two per-target ROC AUC scores, one entry per value in num_features.
roc_means = []

# The splitter is identical for every iteration (fixed random_state), so build it once.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for n in num_features:
    # Keep exactly the n features with the largest MultiTaskElasticNet
    # importance. threshold=-np.inf switches off SelectFromModel's own
    # importance threshold; with the default threshold, max_features is only
    # an upper bound and larger n would keep re-using the same smaller subset.
    selector = SelectFromModel(
        estimator=MultiTaskElasticNet(alpha=0.1, l1_ratio=0.1),
        threshold=-np.inf,
        max_features=n,
    ).fit(scaled_x, y)
    X_selected = selector.transform(scaled_x)

    # cross_val_predict fits its own copies of the model, so no separate
    # full-data fit is needed here.
    model = MultiOutputClassifier(LogisticRegression())
    y_pred_proba = cross_val_predict(model, X_selected, y, cv=cv, method='predict_proba')

    # Probability of the positive class for each target.
    y_pred_proba_target1 = y_pred_proba[0][:, 1]
    y_pred_proba_target2 = y_pred_proba[1][:, 1]

    # ROC AUC per target, pairing target k with column k of y.
    roc_auc_target1 = roc_auc_score(y.iloc[:, 0], y_pred_proba_target1)
    roc_auc_target2 = roc_auc_score(y.iloc[:, 1], y_pred_proba_target2)
    print("ROC AUC score for Seasonal Flu vaccine:", roc_auc_target1)
    print("ROC AUC score for H1N1 Vaccine:", roc_auc_target2)
    roc_mean = (roc_auc_target1+roc_auc_target2)/2
    roc_means.append(roc_mean)

# Plot the mean ROC AUC against the number of features kept.
plt.plot(num_features, roc_means)
plt.xlabel('Number of features selected')
plt.ylabel('ROC AUC score')
plt.show()


SHAP values before feature selection

Logistic regression

In [None]:
# shap is not imported anywhere else in the notebook (the file-level import is
# commented out), so bring it into scope for this cell.
import shap

# Explain the second per-target logistic model of the baseline classifier,
# using the full feature matrix as background data.
explainer = shap.LinearExplainer(clf_lr_bfs.estimators_[1], scaled_x)
shap_values = explainer.shap_values(scaled_x)
# Column labels are passed through the dedicated feature_names argument.
shap.summary_plot(shap_values, feature_names=scaled_x.columns, plot_type='bar')

GBT

In [None]:
## Interpretability using SHAP
# shap is not imported anywhere else in the notebook (the file-level import is
# commented out), so bring it into scope for this cell.
import shap

# Explain the second per-target gradient-boosting model of the baseline classifier.
explainer = shap.Explainer(clf_beforefs.estimators_[1])
shap_values = explainer(scaled_x)
shap.summary_plot(shap_values, scaled_x, plot_type='bar')

### Feature importance graph

logistic regression

In [None]:
## Fit MultiTaskElasticNet inside SelectFromModel to obtain per-feature coefficients
estimator = MultiTaskElasticNet(alpha=0.1, l1_ratio=0.1)
selector = SelectFromModel(estimator)
selector.fit(scaled_x, y)

# Reduced feature matrix containing only the selected columns.
X_selected = selector.transform(scaled_x)

# Names of the columns that passed the selector.
print("Selected features: ", selector.get_feature_names_out())

import matplotlib.pyplot as plt

# Coefficient matrix of the fitted estimator; one row per target.
feature_importance = selector.estimator_.coef_

# Importance of a feature = mean absolute coefficient across the targets.
abs_feature_importance = np.abs(feature_importance)
mean_feature_importance = abs_feature_importance.mean(axis=0)

# Positions of the features ordered from least to most important.
sorted_indices = np.argsort(mean_feature_importance)

# Input column names seen by the selector, and the subset it kept.
feature_names = selector.feature_names_in_
selected_features = selector.get_feature_names_out()

# Column names arranged in the same ascending-importance order.
feature_selected = [feature_names[position] for position in sorted_indices]

# Horizontal bar chart, one bar per input feature.
fig, ax = plt.subplots(figsize=(10, 10))

plt.barh(feature_selected, mean_feature_importance[sorted_indices])
plt.xlabel('Feature importance')
plt.ylabel('Feature index')
plt.title('Feature importances')
plt.show()


GBT

In [None]:
# getting performance of different number of features selected
# getting the performance 
from sklearn.linear_model import MultiTaskElasticNetCV
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


# Numbers of features to keep: 8, 10, ..., 34.
num_features = range(8, 36, 2)

# Mean of the two per-target ROC AUC scores, one entry per value in num_features.
roc_means = []

# The splitter is identical for every iteration (fixed random_state), so build it once.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for n in num_features:
    # Keep exactly the n features with the largest MultiTaskElasticNet
    # importance. threshold=-np.inf switches off SelectFromModel's own
    # importance threshold; with the default threshold, max_features is only
    # an upper bound and larger n would keep re-using the same smaller subset.
    selector = SelectFromModel(
        estimator=MultiTaskElasticNet(alpha=0.1, l1_ratio=0.1),
        threshold=-np.inf,
        max_features=n,
    ).fit(scaled_x, y)
    X_selected = selector.transform(scaled_x)

    # cross_val_predict fits its own copies of the model, so the costly
    # full-data gradient-boosting fit is not needed here.
    model = MultiOutputClassifier(GradientBoostingClassifier())
    y_pred_proba = cross_val_predict(model, X_selected, y, cv=cv, method='predict_proba')

    # Probability of the positive class for each target.
    y_pred_proba_target1 = y_pred_proba[0][:, 1]
    y_pred_proba_target2 = y_pred_proba[1][:, 1]

    # ROC AUC per target, pairing target k with column k of y.
    roc_auc_target1 = roc_auc_score(y.iloc[:, 0], y_pred_proba_target1)
    roc_auc_target2 = roc_auc_score(y.iloc[:, 1], y_pred_proba_target2)
    print("ROC AUC score for Seasonal Flu vaccine:", roc_auc_target1)
    print("ROC AUC score for H1N1 Vaccine:", roc_auc_target2)
    roc_mean = (roc_auc_target1+roc_auc_target2)/2
    roc_means.append(roc_mean)

# Plot the mean ROC AUC against the number of features kept.
plt.plot(num_features, roc_means)
plt.xlabel('Number of features selected')
plt.ylabel('ROC AUC score')
plt.show()


### ANALYSIS - BY KAR YAN NG 

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import roc_auc_score
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np

# Load data
features_dataset = pd.read_csv('training_set_features.csv')
target_dataset = pd.read_csv('training_set_labels.csv')

# Keep only the two outcome columns (h1n1 first, seasonal second).
target_dataset = target_dataset[['h1n1_vaccine', 'seasonal_vaccine']]

# Column groups by dtype.
categorical_columns = features_dataset.select_dtypes(include=['object']).columns
numerical_columns = features_dataset.select_dtypes(include=['float64', 'int64']).columns

# One-hot encode the object columns into a dense array.
encoder = OneHotEncoder()
encoded_categorical = encoder.fit_transform(features_dataset[categorical_columns]).toarray()

# Impute first, then scale the imputed values. StandardScaler passes NaN
# through unchanged, so scaling the raw columns would leave missing values in
# the matrix given to PLSRegression, which does not accept NaN.
imputer = SimpleImputer(strategy='mean')
features_dataset[numerical_columns] = imputer.fit_transform(features_dataset[numerical_columns])

scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(features_dataset[numerical_columns])

# Feature matrix: one-hot columns first, then the scaled numeric columns.
scaled_x = np.concatenate((encoded_categorical, scaled_numerical), axis=1)

# PLS regression with two latent components, fitted on the full data.
pls = PLSRegression(n_components=2)
pls.fit(scaled_x, target_dataset)

# 10-fold shuffled cross-validation.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Out-of-fold PLS predictions: continuous scores, one column per target.
y_pred = cross_val_predict(pls, scaled_x, target_dataset, cv=kf, method='predict')

# ROC AUC of the continuous score against each binary target.
roc_auc_scores = []
for i in range(target_dataset.shape[1]):
    roc_auc_scores.append(roc_auc_score(target_dataset.iloc[:, i], y_pred[:, i]))

print("ROC AUC scores:", roc_auc_scores)


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.cross_decomposition import PLSRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict, KFold

from sklearn.preprocessing import OneHotEncoder

# Load the label and feature tables.
dfl = pd.read_csv('training_set_labels.csv')
df = pd.read_csv('training_set_features.csv')

# Bind the working frames before they are used, so the cell does not depend on
# a features_dataset left behind by a previously executed cell.
features_dataset = df
target_dataset = dfl[['h1n1_vaccine', 'seasonal_vaccine']]
target_dataset = target_dataset.astype(int)

# Column groups by dtype.
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# One-hot encode the object columns of the frame loaded in this cell.
encoder = OneHotEncoder()
encoded_categorical = encoder.fit_transform(features_dataset[categorical_columns]).toarray()

# Only the one-hot encoded categorical columns are used as features here.
scaled_x = encoded_categorical

# Compute ROC curves for each target
plt.figure(figsize=(10, 8))

target_labels = ['h1n1_vaccine', 'seasonal_vaccine']  # Adjust this to your column names

# 10-fold shuffled cross-validation.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Out-of-fold PLS predictions: continuous scores, one column per target.
pls = PLSRegression(n_components=2)
y_pred_proba = cross_val_predict(pls, scaled_x, target_dataset, cv=kf, method='predict')

# One ROC curve per target column, labelled by its position in target_dataset.
for i in range(target_dataset.shape[1]):
    fpr, tpr, _ = roc_curve(target_dataset.iloc[:, i], y_pred_proba[:, i])
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f'ROC curve of class {i} (area = {roc_auc:.2f})')

# Chance-level diagonal and axis cosmetics.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()