In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score


In [8]:
# Load the training features and labels and join them on the respondent key.
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')

data = pd.merge(train_features, train_labels, on='respondent_id')

# Features are every column except the join key and the two target columns.
X = data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = data[['xyz_vaccine', 'seasonal_vaccine']]

# Partition the feature columns into "numeric" and "everything else".
# ColumnTransformer discards any column that is not listed in one of its
# transformers (remainder='drop' is the default), so selecting by a fixed
# whitelist of dtypes would silently lose features stored with any other
# dtype. Splitting on include/exclude guarantees every column is routed to
# exactly one branch.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

# Numeric branch: fill gaps with the column median, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Non-numeric branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on
# categories that were not present when the encoder was fitted.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

# Hold out 20% of the rows for validation; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing on the training split only and reuse the fitted
# statistics on the validation split, so validation rows do not influence
# the imputation values, scaling parameters, or encoder categories.
# NOTE: X_train / X_val are rebound from DataFrames to the transformed
# matrices here; later cells consume the transformed versions.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)


In [9]:
# Template random forest with a fixed seed; it is wrapped so that both target
# columns in y_train are handled by a single multi-output estimator.
base_model = RandomForestClassifier(random_state=42)
model = MultiOutputClassifier(estimator=base_model, n_jobs=-1)

# Kept as the final expression of the cell so its return value is what the
# notebook displays.
model.fit(X_train, y_train)


In [10]:
# predict_proba yields one probability array per target; index 0 is read as
# xyz_vaccine and index 1 as seasonal_vaccine (the column order of y).
y_pred_proba = model.predict_proba(X_val)

# Column 1 of each per-target array is used as the score for that target.
y_pred_proba_xyz, y_pred_proba_seasonal = (
    target_proba[:, 1] for target_proba in y_pred_proba[:2]
)

# Score each target separately against the held-out labels.
roc_auc_xyz = roc_auc_score(
    y_true=y_val['xyz_vaccine'],
    y_score=y_pred_proba_xyz,
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_val['seasonal_vaccine'],
    y_score=y_pred_proba_seasonal,
)

# Unweighted average of the two per-target scores.
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(
    f'ROC AUC for xyz vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)


ROC AUC for xyz vaccine: 0.8304061719990924
ROC AUC for seasonal vaccine: 0.8517016540450174
Mean ROC AUC: 0.8410539130220549


In [11]:
# Read the test features and push them through the already-fitted
# preprocessor; the identifier column is removed first because it is not a
# model feature.
test_features = pd.read_csv('test_set_features.csv')
X_test = preprocessor.transform(test_features.drop(columns=['respondent_id']))

# One probability array per target; column 1 of each is kept as the
# predicted score for that target.
y_test_pred_proba = model.predict_proba(X_test)
y_test_pred_proba_xyz, y_test_pred_proba_seasonal = (
    target_proba[:, 1] for target_proba in y_test_pred_proba[:2]
)

# Assemble the submission: respondent id followed by the two score columns.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=y_test_pred_proba_xyz,
    seasonal_vaccine=y_test_pred_proba_seasonal,
)

# Write without the DataFrame index so only the three columns are emitted.
submission.to_csv('submission.csv', index=False)
