In [1]:
import pandas as pd

# Read the three CSV inputs (paths are relative to the working directory):
# training features, training labels, and the features to predict on.
train_features, train_labels, test_features = [
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
]
loaded_frames = (train_features, train_labels, test_features)

# Preview the first rows of each frame, in load order
for frame in loaded_frames:
    print(frame.head())

# Per-column count of missing values, again in load order
for frame in loaded_frames:
    print(frame.isnull().sum())


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Join features and labels on respondent_id so every feature row is paired
# with its own labels. validate='one_to_one' makes pandas raise a MergeError
# if an id is duplicated on either side instead of silently multiplying rows.
train_data = train_features.merge(
    train_labels, on='respondent_id', validate='one_to_one')

# Every column of the feature file except the identifier is a model input
features = train_features.columns.drop('respondent_id')

# Take X and y from the merged frame. Slicing train_features and train_labels
# independently only pairs the right rows when both files list respondents in
# the same order; the merged frame guarantees the pairing.
X = train_data[features]
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Route columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Numerical columns: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps categories unseen during fit from
# raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each sub-pipeline to its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# One logistic regression per target column, wrapped so that both targets are
# fitted and predicted through a single estimator.
multi_target_classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))

# Full pipeline: preprocessing first, then the multi-target classifier
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', multi_target_classifier),
]
log_reg_model = Pipeline(steps=pipeline_steps)

# Fit on the training split (last expression, so the fitted pipeline is shown)
log_reg_model.fit(X_train, y_train)


In [4]:
from sklearn.metrics import roc_auc_score

# Class probabilities for the validation rows: entry 0 is read as the
# xyz_vaccine prediction and entry 1 as the seasonal_vaccine prediction.
y_val_pred_proba = log_reg_model.predict_proba(X_val)
xyz_proba = y_val_pred_proba[0]
seasonal_proba = y_val_pred_proba[1]

# ROC AUC per target, scored on column 1 of its probability array
# (presumably the probability of label 1 - confirm against the label encoding)
xyz_vaccine_roc_auc = roc_auc_score(y_val['xyz_vaccine'], xyz_proba[:, 1])
seasonal_vaccine_roc_auc = roc_auc_score(y_val['seasonal_vaccine'], seasonal_proba[:, 1])

# Unweighted mean of the two per-target scores
roc_auc_total = xyz_vaccine_roc_auc + seasonal_vaccine_roc_auc
mean_roc_auc = roc_auc_total / 2
print(f'Mean ROC AUC: {mean_roc_auc:.4f}')


Mean ROC AUC: 0.8437


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of C for the LogisticRegression nested inside the
# pipeline's 'classifier' step
param_grid = {'classifier__estimator__C': [0.1, 1.0, 10.0]}

# 5-fold cross-validated search over the grid, scored with 'roc_auc'
grid_search = GridSearchCV(
    estimator=log_reg_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best parameters:", grid_search.best_params_)

# Score the selected pipeline on the hold-out validation split
best_model = grid_search.best_estimator_
y_val_pred_proba = best_model.predict_proba(X_val)
xyz_proba = y_val_pred_proba[0]
seasonal_proba = y_val_pred_proba[1]
xyz_vaccine_roc_auc = roc_auc_score(y_val['xyz_vaccine'], xyz_proba[:, 1])
seasonal_vaccine_roc_auc = roc_auc_score(y_val['seasonal_vaccine'], seasonal_proba[:, 1])

# Unweighted mean of the two per-target scores
roc_auc_total = xyz_vaccine_roc_auc + seasonal_vaccine_roc_auc
mean_roc_auc = roc_auc_total / 2
print(f'Mean ROC AUC after tuning: {mean_roc_auc:.4f}')


In [None]:
# Predict on the test set with the tuned pipeline. Only the columns the model
# was trained on are passed in: the raw test frame also carries respondent_id,
# which is an identifier rather than a feature, and handing the pipeline the
# same column set it saw during fit avoids depending on how the installed
# scikit-learn version treats extra input columns.
test_pred_proba = best_model.predict_proba(test_features[features])

# Build the submission: one row per test respondent with column 1 of each
# target's probability array (entry 0 -> xyz_vaccine, entry 1 -> seasonal_vaccine)
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_proba[0][:, 1],
    'seasonal_vaccine': test_pred_proba[1][:, 1]
})

# Write without the DataFrame index so the file holds only the three columns
submission.to_csv('final CSV file submission.csv', index=False)
print("Submission file created successfully!")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Join features and labels on respondent_id so every feature row is paired
# with its own labels. validate='one_to_one' makes pandas raise a MergeError
# if an id is duplicated on either side instead of silently multiplying rows.
train_data = train_features.merge(
    train_labels, on='respondent_id', validate='one_to_one')

# Every column of the feature file except the identifier is a model input
features = train_features.columns.drop('respondent_id')

# Take X and y from the merged frame. Slicing train_features and train_labels
# independently only pairs the right rows when both files list respondents in
# the same order; the merged frame guarantees the pairing.
X = train_data[features]
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Route columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Numerical columns: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps categories unseen during fit from
# raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each sub-pipeline to its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Show the column names to confirm the dtype-based routing
print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# One logistic regression per target column, wrapped so that both targets are
# fitted and predicted through a single estimator.
multi_target_classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))

# Full pipeline: preprocessing first, then the multi-target classifier
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', multi_target_classifier),
]
log_reg_model = Pipeline(steps=pipeline_steps)

# Fit on the training split (last expression, so the fitted pipeline is shown)
log_reg_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import roc_auc_score

# Class probabilities for the validation rows: entry 0 is read as the
# xyz_vaccine prediction and entry 1 as the seasonal_vaccine prediction.
y_val_pred_proba = log_reg_model.predict_proba(X_val)
xyz_proba = y_val_pred_proba[0]
seasonal_proba = y_val_pred_proba[1]

# ROC AUC per target, scored on column 1 of its probability array
# (presumably the probability of label 1 - confirm against the label encoding)
xyz_vaccine_roc_auc = roc_auc_score(y_val['xyz_vaccine'], xyz_proba[:, 1])
seasonal_vaccine_roc_auc = roc_auc_score(y_val['seasonal_vaccine'], seasonal_proba[:, 1])

# Unweighted mean of the two per-target scores
roc_auc_total = xyz_vaccine_roc_auc + seasonal_vaccine_roc_auc
mean_roc_auc = roc_auc_total / 2
print(f'Mean ROC AUC: {mean_roc_auc:.4f}')


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of C for the LogisticRegression nested inside the
# pipeline's 'classifier' step
param_grid = {'classifier__estimator__C': [0.1, 1.0, 10.0]}

# 5-fold cross-validated search over the grid, scored with 'roc_auc'
grid_search = GridSearchCV(
    estimator=log_reg_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best parameters:", grid_search.best_params_)

# Score the selected pipeline on the hold-out validation split
best_model = grid_search.best_estimator_
y_val_pred_proba = best_model.predict_proba(X_val)
xyz_proba = y_val_pred_proba[0]
seasonal_proba = y_val_pred_proba[1]
xyz_vaccine_roc_auc = roc_auc_score(y_val['xyz_vaccine'], xyz_proba[:, 1])
seasonal_vaccine_roc_auc = roc_auc_score(y_val['seasonal_vaccine'], seasonal_proba[:, 1])

# Unweighted mean of the two per-target scores
roc_auc_total = xyz_vaccine_roc_auc + seasonal_vaccine_roc_auc
mean_roc_auc = roc_auc_total / 2
print(f'Mean ROC AUC after tuning: {mean_roc_auc:.4f}')


In [None]:
# Predict on the test set with the tuned pipeline. Only the columns the model
# was trained on are passed in: the raw test frame also carries respondent_id,
# which is an identifier rather than a feature, and handing the pipeline the
# same column set it saw during fit avoids depending on how the installed
# scikit-learn version treats extra input columns.
test_pred_proba = best_model.predict_proba(test_features[features])

# Build the submission: one row per test respondent with column 1 of each
# target's probability array (entry 0 -> xyz_vaccine, entry 1 -> seasonal_vaccine)
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_proba[0][:, 1],
    'seasonal_vaccine': test_pred_proba[1][:, 1]
})

# Write without the DataFrame index so the file holds only the three columns
submission.to_csv('final CSV file submission.csv', index=False)
print("Submission file created successfully!")


In [None]:
# Smaller search than the 3-value / 5-fold variant: two candidate values of C
# for the LogisticRegression nested inside the pipeline's 'classifier' step...
param_grid = {
    'classifier__estimator__C': [0.1, 1.0]
}

# ...and 3-fold cross-validation, scored with 'roc_auc'
grid_search = GridSearchCV(log_reg_model, param_grid, cv=3, scoring='roc_auc')
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best parameters:", grid_search.best_params_)

# Score the selected pipeline on the hold-out validation split. Entry 0 of the
# prediction list is read as xyz_vaccine and entry 1 as seasonal_vaccine;
# column 1 of each array is used as the score.
best_model = grid_search.best_estimator_
y_val_pred_proba = best_model.predict_proba(X_val)
xyz_vaccine_roc_auc = roc_auc_score(y_val['xyz_vaccine'], y_val_pred_proba[0][:, 1])
seasonal_vaccine_roc_auc = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred_proba[1][:, 1])

# Unweighted mean of the two per-target scores
mean_roc_auc = (xyz_vaccine_roc_auc + seasonal_vaccine_roc_auc) / 2
print(f'Mean ROC AUC after tuning: {mean_roc_auc:.4f}')

# Predict on the test set. Only the columns the model was trained on are
# passed in: the raw test frame also carries respondent_id, which is an
# identifier rather than a feature, and handing the pipeline the same column
# set it saw during fit avoids depending on how the installed scikit-learn
# version treats extra input columns.
test_pred_proba = best_model.predict_proba(test_features[features])

# Build the submission: one row per test respondent with both scores
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_proba[0][:, 1],
    'seasonal_vaccine': test_pred_proba[1][:, 1]
})

# Write without the DataFrame index so the file holds only the three columns
submission.to_csv('final CSV file submission.csv', index=False)
print("Submission file created successfully!")
