### This notebook contains the code to generate the submission for the "Flu Shot Learning: Predict H1N1 and Seasonal Flu Vaccines" competition.

The submission should contain 3 columns, with the respondent_id, the probability someone gets the H1N1 vaccine (h1n1_vaccine), and the probability that someone gets the flu shot (seasonal_vaccine).

To train the model, we use the training_set_features.csv data, with the training_set_labels.csv data as the known probabilities. Finally we want to predict the values for the test_set_features.csv data.

The score is evaluated using the receiver operating characteristic curve (ROC AUC), with default "average='macro'".

In this script we train an XGBoost model.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
#from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [2]:
X_train_prep = pd.read_csv('X_train_prep.csv')
X_valid_prep = pd.read_csv('X_valid_prep.csv')
y_train = pd.read_csv('y_train.csv')
y_valid = pd.read_csv('y_valid.csv')

In [11]:
# Now we can train a model
model = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1, early_stopping_rounds=5, random_state=42)
model.fit(X_train_prep, y_train,
          eval_set=[(X_valid_prep, y_valid)],
          verbose=False)
y_pred = model.predict_proba(X_valid_prep)
print(y_pred)
y_pred_h1n1 = y_pred[:, 0]
y_pred_seas = y_pred[:, 1]
print(y_pred_h1n1)

score = roc_auc_score(y_valid, y_pred, average='macro')
score_h1n1 = roc_auc_score(y_valid["h1n1_vaccine"], y_pred_h1n1)
score_seas = roc_auc_score(y_valid["seasonal_vaccine"], y_pred_seas)
print(score, score_h1n1, score_seas)

[[0.02175624 0.14506897]
 [0.09852405 0.31719488]
 [0.40834075 0.45852268]
 ...
 [0.13499749 0.6152501 ]
 [0.5259399  0.8973954 ]
 [0.01885252 0.14102215]]
[0.02175624 0.09852405 0.40834075 ... 0.13499749 0.5259399  0.01885252]
0.869938070343568 0.8726949549958399 0.8671811856912961


In [12]:
# Optimise hyperparameters; using GridSearchCV does not work well
param_grid = {
    'n_estimators': [500, 1000, 1200],
    'learning_rate': [0.01, 0.05, 0.1]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        model = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train_prep, y_train,
                  eval_set=[(X_valid_prep, y_valid)],
                  verbose=False)
        y_pred = model.predict_proba(X_valid_prep)
        score = roc_auc_score(y_valid, y_pred, average='macro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

500 0.01 0.8685084462894883
500 0.05 0.869938070343568
500 0.1 0.8689307147462157
1000 0.01 0.8695709917362966
1000 0.05 0.869938070343568
1000 0.1 0.8689307147462157
1200 0.01 0.8695709917362966
1200 0.05 0.869938070343568
1200 0.1 0.8689307147462157
0.869938070343568


In [13]:
# See the best one is n_estimators=1000, learning_rate=0.05
# So train final model
model_fin = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1, early_stopping_rounds=5, random_state=42)
model_fin.fit(X_train_prep, y_train,
             eval_set=[(X_valid_prep, y_valid)],
             verbose=False)
y_pred = model_fin.predict_proba(X_valid_prep)
score = roc_auc_score(y_valid, y_pred, average='macro')
print(score)

0.869938070343568


In [14]:
# Load the test data
X_test = pd.read_csv("test_set_features.csv")

# Initiate the output dataframe with id's
output = pd.DataFrame(X_test["respondent_id"])

# Preprocess the test data
X_test_prep = X_test.copy()

# Fill in missing values
X_test_prep[numerical_cols] = imputer_num.transform(X_test[numerical_cols])

for col in nominal_cols:
    X_test_prep[col].fillna("Unknown", inplace=True)

X_test_prep[ordinal_cols] = imputer_ord.transform(X_test[ordinal_cols])

# Sort by respondent_id
X_test_prep = X_test_prep.sort_values("respondent_id")

# Preprocess the ordinal columns
X_test_prep["age_group"] = pd.Categorical(X_test_prep["age_group"], categories=age_group_order, ordered=True)
X_test_prep["age_group"] = X_test_prep["age_group"].cat.codes

X_test_prep["education"] = pd.Categorical(X_test_prep["education"], categories=edu_order, ordered=True)
X_test_prep["education"] = X_test_prep["education"].cat.codes

X_test_prep["income_poverty"] = pd.Categorical(X_test_prep["income_poverty"], categories=inc_order, ordered=True)
X_test_prep["income_poverty"] = X_test_prep["income_poverty"].cat.codes

# One-hot encode the nominal columns
X_test_prep = pd.get_dummies(X_test_prep, columns=nominal_cols, dtype=int)
X_test_prep.drop(columns=["respondent_id"], inplace=True) # Drop the id column
X_test_prep.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_prep[col].fillna("Unknown", inplace=True)


Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Make predictions
y_pred = model_fin.predict_proba(X_test_prep)
y_pred_h1n1 = y_pred[:, 0]
y_pred_seas = y_pred[:, 1]

# Add the predictions to the output dataframe
output["h1n1_vaccine"] = y_pred_h1n1
output["seasonal_vaccine"] = y_pred_seas
output.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.133186,0.180806
1,26708,0.02626,0.034958
2,26709,0.164766,0.663686
3,26710,0.703327,0.917676
4,26711,0.212604,0.552317


In [None]:
# Save the output as csv
output.to_csv("submission_xgb.csv", index=False)

### Final note:

After submission, this result got a score of 0.8595. The best score on the leaderboard was 0.8658. I'm ranked 569/2031.
So that's pretty good!