In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, GaussianNB

In [2]:
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

In [3]:
train_features.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [4]:
train_data = pd.merge(train_features, train_labels, on='respondent_id')

X_train = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_train = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X_test = test_features.drop(columns=['respondent_id'])


In [5]:
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

In [6]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [7]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [8]:
if np.any(np.isnan(X_train_preprocessed)):
    print("NaNs found in X_train_preprocessed")
if np.any(np.isnan(X_test_preprocessed)):
    print("NaNs found in X_test_preprocessed")

In [9]:
X_train_split, X_valid, y_train_split, y_valid = train_test_split(X_train_preprocessed, y_train, test_size=0.2, random_state=42)

In [10]:
models = {
    'LogisticRegression': MultiOutputClassifier(LogisticRegression(max_iter=1000, solver='saga', n_jobs=-1)),
    'RandomForest': MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
    'GradientBoosting': MultiOutputClassifier(GradientBoostingClassifier(n_estimators=100, random_state=42)),
    'SVM': MultiOutputClassifier(SVC(probability=True, random_state=42)),
    'BernoulliNB': MultiOutputClassifier(BernoulliNB()),
    'GaussianNB': MultiOutputClassifier(GaussianNB())
}

In [11]:
def evaluate_models(models, X_train_split, y_train_split, X_valid, y_valid):
    results = {}
    for model_name, model in models.items():
        model.fit(X_train_split, y_train_split)
        
        y_pred_proba = model.predict_proba(X_valid)
        y_pred_proba_xyz = y_pred_proba[0][:, 1]
        y_pred_proba_seasonal = y_pred_proba[1][:, 1]
        
        roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_proba_xyz)
        roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_proba_seasonal)
        mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2
        
        results[model_name] = {
            'roc_auc_xyz': roc_auc_xyz,
            'roc_auc_seasonal': roc_auc_seasonal,
            'mean_roc_auc': mean_roc_auc
        }
        
        print(f"{model_name} - ROC AUC (xyz_vaccine): {roc_auc_xyz:.4f}, ROC AUC (seasonal_vaccine): {roc_auc_seasonal:.4f}")
        print(f"Mean ROC AUC: {mean_roc_auc:.4f}")
    
    return results

In [12]:
model_results = evaluate_models(models, X_train_split, y_train_split, X_valid, y_valid)


LogisticRegression - ROC AUC (xyz_vaccine): 0.8314, ROC AUC (seasonal_vaccine): 0.8561
Mean ROC AUC: 0.8437
RandomForest - ROC AUC (xyz_vaccine): 0.8305, ROC AUC (seasonal_vaccine): 0.8517
Mean ROC AUC: 0.8411
GradientBoosting - ROC AUC (xyz_vaccine): 0.8390, ROC AUC (seasonal_vaccine): 0.8609
Mean ROC AUC: 0.8499
SVM - ROC AUC (xyz_vaccine): 0.8052, ROC AUC (seasonal_vaccine): 0.8554
Mean ROC AUC: 0.8303
BernoulliNB - ROC AUC (xyz_vaccine): 0.7951, ROC AUC (seasonal_vaccine): 0.8148
Mean ROC AUC: 0.8049
GaussianNB - ROC AUC (xyz_vaccine): 0.7096, ROC AUC (seasonal_vaccine): 0.7420
Mean ROC AUC: 0.7258


In [13]:
best_model_name = max(model_results, key=lambda k: model_results[k]['mean_roc_auc'])
best_model = models[best_model_name]

In [14]:
if best_model_name == 'GradientBoosting' or best_model_name == 'LogisticRegression':
    param_grid = {}
    
    if best_model_name == 'GradientBoosting':
        param_grid = {
            'estimator__n_estimators': [50, 100, 200],
            'estimator__learning_rate': [0.01, 0.1],
            'estimator__max_depth': [3, 5, 7]
        }
    elif best_model_name == 'LogisticRegression':
        param_grid = {
            'estimator__C': [0.01, 0.1, 1, 10]
        }

    grid_search = GridSearchCV(best_model, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train_preprocessed, y_train)

    best_model = grid_search.best_estimator_
    y_pred_proba = best_model.predict_proba(X_valid)
    y_pred_proba_xyz = y_pred_proba[0][:, 1]
    y_pred_proba_seasonal = y_pred_proba[1][:, 1]

    # Calculate ROC AUC for both targets
    roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_proba_xyz)
    roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_proba_seasonal)

    print(f"Best {best_model_name} - ROC AUC (xyz_vaccine): {roc_auc_xyz:.4f}, ROC AUC (seasonal_vaccine): {roc_auc_seasonal:.4f}")
    print(f"Mean ROC AUC: {(roc_auc_xyz + roc_auc_seasonal) / 2:.4f}")

best_model.fit(X_train_preprocessed, y_train)

y_pred_proba = best_model.predict_proba(X_test_preprocessed)
y_pred_proba_xyz = y_pred_proba[0][:, 1]
y_pred_proba_seasonal = y_pred_proba[1][:, 1]

submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_pred_proba_xyz,
    'seasonal_vaccine': y_pred_proba_seasonal
})


Best GradientBoosting - ROC AUC (xyz_vaccine): 0.8575, ROC AUC (seasonal_vaccine): 0.8747
Mean ROC AUC: 0.8661


In [16]:
submission.to_csv('submission.csv', index=False)

In [17]:
submission

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.059922,0.218180
1,26708,0.029525,0.035725
2,26709,0.438984,0.783506
3,26710,0.574924,0.848055
4,26711,0.238449,0.481067
...,...,...,...
26703,53410,0.352128,0.555066
26704,53411,0.134150,0.350883
26705,53412,0.111307,0.147715
26706,53413,0.051564,0.335222
