In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

In [11]:
train_labels = pd.read_csv('training_set_labels.csv')
train_features = pd.read_csv('training_set_features.csv')
test_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

In [12]:
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [13]:
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]


In [14]:
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [15]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [17]:
X_preprocessed = preprocessor.fit_transform(X)

In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [19]:
model = MultiOutputClassifier(RandomForestClassifier(random_state=42))

In [20]:
model.fit(X_train, y_train)


In [21]:
y_pred_prob = model.predict_proba(X_valid)

In [22]:
y_pred_prob_df = pd.DataFrame({
    'xyz_vaccine': y_pred_prob[0][:, 1],
    'seasonal_vaccine': y_pred_prob[1][:, 1]
})


In [23]:
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_prob_df['xyz_vaccine'])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_prob_df['seasonal_vaccine'])

In [24]:
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print(f'Mean ROC AUC: {mean_roc_auc}')

Mean ROC AUC: 0.840671491285722


In [25]:
X_test = test_features.drop(columns=['respondent_id'])
X_test_preprocessed = preprocessor.transform(X_test)

In [26]:
test_pred_prob = model.predict_proba(X_test_preprocessed)

In [27]:
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_prob[0][:, 1],
    'seasonal_vaccine': test_pred_prob[1][:, 1]
})


In [28]:
submission.to_csv('final_submission.csv', index=False)