In [143]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [144]:
df_sf = pd.read_csv('/content/submission_format.csv')
df_sf.columns

Index(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], dtype='object')

In [145]:
df_tsf = pd.read_csv('/content/test_set_features.csv')
df_tsf
df_tsf.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [146]:
df_trsf = pd.read_csv('/content/training_set_features.csv')
df_trsf.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [147]:
df_trsl = pd.read_csv('/content/training_set_labels.csv')
df_trsl.columns

Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object')

In [148]:
# Drop the original 'age_group' column
df_trsf = df_trsf.drop('age_group', axis=1)
df_tsf = df_tsf.drop('age_group', axis=1)

In [149]:
# Create a mapping for age groups
sex_mapping = {
    'Male': 0,
    'Female': 1,
}

# Replace age_group values with numerical values
df_trsf['sex'] = df_trsf['sex'].replace(sex_mapping)
df_tsf['sex'] = df_tsf['sex'].replace(sex_mapping)

In [150]:
df_trsf.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'education', 'race',
       'sex', 'income_poverty', 'marital_status', 'rent_or_own',
       'employment_status', 'hhs_geo_region', 'census_msa', 'household_adults',
       'household_children', 'employment_industry', 'employment_occupation'],
      dtype='object')

In [151]:
df_trsf.drop(columns=['marital_status','household_adults','household_children','rent_or_own','employment_status','employment_occupation','education','race','employment_industry'], inplace=True)
df_tsf.drop(columns=['marital_status','household_adults','household_children','rent_or_own','employment_status','employment_occupation','education','race','employment_industry'], inplace=True)

In [152]:
df_tsf.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'sex',
       'income_poverty', 'hhs_geo_region', 'census_msa'],
      dtype='object')

In [153]:
df_trsf.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'sex',
       'income_poverty', 'hhs_geo_region', 'census_msa'],
      dtype='object')

In [154]:
df_trsf['census_msa'].unique()

array(['Non-MSA', 'MSA, Not Principle  City', 'MSA, Principle City'],
      dtype=object)

In [155]:
census_msa_mapping = {
    'Non-MSA': 0,
    'MSA, Not Principle  City': 1,
    'MSA, Principle City' : 2,
}

# Replace age_group values with numerical values
df_trsf['census_msa'] = df_trsf['census_msa'].replace(census_msa_mapping)
df_tsf['census_msa'] = df_tsf['census_msa'].replace(census_msa_mapping)

In [156]:
df_trsf.drop(columns=['hhs_geo_region'], inplace=True)
df_tsf.drop(columns=['hhs_geo_region'], inplace=True)

In [157]:
df_trsf['income_poverty'].unique()

array(['Below Poverty', '<= $75,000, Above Poverty', '> $75,000', nan],
      dtype=object)

In [158]:
income_poverty_mapping = {
    'Below Poverty': 0,
    '<= $75,000, Above Poverty': 1,
    '> $75,000' : 2,
}

# Replace age_group values with numerical values
df_trsf['income_poverty'] = df_trsf['income_poverty'].replace(income_poverty_mapping)
df_tsf['income_poverty'] = df_tsf['income_poverty'].replace(income_poverty_mapping)

In [159]:
df_trsf

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,sex,income_poverty,census_msa
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0,0.0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,3.0,1.0,1.0,4.0,1.0,2.0,0,1.0,1
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,,3.0,3.0,5.0,5.0,4.0,1.0,1,0.0,2
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,,3.0,3.0,2.0,3.0,1.0,4.0,1,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,,3.0,1.0,1.0,5.0,2.0,2.0,1,1.0,0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,4.0,2.0,2.0,5.0,1.0,1.0,0,1.0,2
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,4.0,4.0,2.0,5.0,4.0,2.0,1,,1
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,0.0,3.0,1.0,2.0,2.0,1.0,2.0,1,1.0,0


In [160]:
df_tsf

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,sex,income_poverty,census_msa
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,5.0,1.0,1.0,5.0,1.0,1.0,1,2.0,1
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,1.0,1.0,4.0,1.0,1.0,0,0.0,0
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,,5.0,4.0,2.0,5.0,4.0,4.0,0,2.0,0
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,2.0,2.0,4.0,4.0,2.0,1,1.0,1
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,5.0,2.0,4.0,4.0,4.0,2.0,1,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26703,53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,,4.0,2.0,2.0,4.0,2.0,1.0,1,,2
26704,53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,4.0,1.0,1.0,5.0,2.0,2.0,0,0.0,0
26705,53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,3.0,1.0,4.0,3.0,1.0,1,0.0,1
26706,53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,,2.0,3.0,4.0,4.0,3.0,2.0,1,1.0,1


In [161]:
df_trsl

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0
...,...,...,...
26702,26702,0,0
26703,26703,0,0
26704,26704,0,1
26705,26705,0,0


In [162]:
df_sf

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7
...,...,...,...
26703,53410,0.5,0.7
26704,53411,0.5,0.7
26705,53412,0.5,0.7
26706,53413,0.5,0.7


In [163]:
# Drop rows with NaN values from df_trsf
df_trsf_cleaned = df_trsf.dropna().reset_index(drop=True)

# Drop rows with NaN values from df_tsf
df_tsf_cleaned = df_tsf.dropna().reset_index(drop=True)

In [164]:
df_trsf_cleaned

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,sex,income_poverty,census_msa
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0,0.0,1
2,7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,5.0,2.0,1.0,4.0,2.0,1.0,1,1.0,0
3,9,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,4.0,2.0,2.0,4.0,2.0,2.0,0,1.0,1
4,10,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,4.0,1.0,2.0,5.0,4.0,4.0,0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11975,26697,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,4.0,2.0,2.0,4.0,2.0,2.0,0,2.0,2
11976,26699,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,5.0,1.0,5.0,5.0,1.0,4.0,1,1.0,1
11977,26701,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,4.0,2.0,4.0,4.0,2.0,4.0,1,2.0,2
11978,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,4.0,2.0,2.0,5.0,1.0,1.0,0,1.0,2


In [167]:
df_trsl

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0
...,...,...,...
26702,26702,0,0
26703,26703,0,0
26704,26704,0,1
26705,26705,0,0


In [165]:
df_tsf_cleaned

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,sex,income_poverty,census_msa
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,5.0,1.0,1.0,5.0,1.0,1.0,1,2.0,1
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,1.0,1.0,4.0,1.0,1.0,0,0.0,0
2,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,2.0,2.0,4.0,4.0,2.0,1,1.0,1
3,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,5.0,2.0,4.0,4.0,4.0,2.0,1,1.0,0
4,26712,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,4.0,4.0,1.0,5.0,5.0,1.0,1,2.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11934,53407,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,3.0,1.0,2.0,2.0,2.0,2.0,0,1.0,1
11935,53408,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,4.0,1.0,2.0,1.0,2.0,4.0,1,0.0,0
11936,53409,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,4.0,1.0,2.0,4.0,1.0,1.0,0,1.0,1
11937,53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,4.0,1.0,1.0,5.0,2.0,2.0,0,0.0,0


In [170]:
# Extract unique respondent_ids from df_trsf
respondent_ids = df_trsf_cleaned['respondent_id'].unique()

# Filter df_trsl based on respondent_id in df_trsf
df_trsl_filtered = df_trsl[df_trsl['respondent_id'].isin(respondent_ids)]

# Optionally, you can reset the index of the filtered DataFrame
df_trsl_filtered = df_trsl_filtered.reset_index(drop=True)

In [171]:
df_trsl_filtered

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,7,1,1
3,9,0,0
4,10,1,1
...,...,...,...
11975,26697,0,0
11976,26699,0,0
11977,26701,0,0
11978,26703,0,0


In [177]:
df_trsf_cleaned.drop(['respondent_id'], axis=1)

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,sex,income_poverty,census_msa
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1,0.0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0,0.0,1
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,5.0,2.0,1.0,4.0,2.0,1.0,1,1.0,0
3,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,4.0,2.0,2.0,4.0,2.0,2.0,0,1.0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,4.0,1.0,2.0,5.0,4.0,4.0,0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11975,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,1.0,4.0,2.0,2.0,4.0,2.0,2.0,0,2.0,2
11976,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,5.0,1.0,5.0,5.0,1.0,4.0,1,1.0,1
11977,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,4.0,2.0,4.0,4.0,2.0,4.0,1,2.0,2
11978,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,4.0,2.0,2.0,5.0,1.0,1.0,0,1.0,2


In [None]:
df_tsf_cleaned.drop(['respondent_id'], axis=1)

In [183]:
df_tsf_cleaned.drop(['respondent_id'], axis=1)# # Example of handling missing values
# imputer = SimpleImputer(strategy='most_frequent')
# df_filled = pd.DataFrame(imputer.fit_transform(df_trsf), columns=df_trsf.columns)

# Convert categorical features to numeric if necessary
# categorical_features = ['sex', 'race', 'age_group', 'education', 'marital_status', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation', 'rent_or_own']
# df_filled = pd.get_dummies(df_filled, columns=categorical_features)

# Splitting the data into training and testing sets
# X = df_filled.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)
# y = df_filled[['xyz_vaccine', 'seasonal_vaccine']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = df_trsf_cleaned
y_train = df_trsl_filtered[['xyz_vaccine', 'seasonal_vaccine']]

X_test = df_tsf_cleaned
y_test = df_trsl[['xyz_vaccine', 'seasonal_vaccine']]

In [179]:
X_train.shape

(11980, 25)

In [180]:
y_train.shape

(11980, 2)

In [185]:
X_test.shape

(11939, 25)

In [186]:
y_test.shape

(26707, 2)

In [204]:
y_train_xyz = y_train['xyz_vaccine']
y_train_seasonal = y_train['seasonal_vaccine']

In [187]:
# Extract unique respondent_ids from df_trsf
respondent_ids = df_tsf_cleaned['respondent_id'].unique()

# Filter df_trsl based on respondent_id in df_trsf
df_sf_filtered = df_sf[df_sf['respondent_id'].isin(respondent_ids)]

# Optionally, you can reset the index of the filtered DataFrame
df_sf_filtered = df_sf_filtered.reset_index(drop=True)

In [189]:
df_sf_filtered.shape

(11939, 3)

In [190]:
df_sf_filtered.drop(['respondent_id'], axis=1)

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0.5,0.7
1,0.5,0.7
2,0.5,0.7
3,0.5,0.7
4,0.5,0.7
...,...,...
11934,0.5,0.7
11935,0.5,0.7
11936,0.5,0.7
11937,0.5,0.7


In [192]:
df_sf_filtered['xyz_vaccine'] = df_sf_filtered['h1n1_vaccine']

In [193]:
y_test = df_sf_filtered[['xyz_vaccine', 'seasonal_vaccine']]
y_test.shape

(11939, 2)

In [202]:
y_test_xyz = y_test['xyz_vaccine']

In [203]:
y_test_seasonal = y_test['seasonal_vaccine']

In [205]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Assuming df_filled contains your preprocessed DataFrame with features and target variables
# X: Features, y_xyz: Target variable for xyz_vaccine, y_seasonal: Target variable for seasonal_vaccine

# Initialize XGBoost models
model_xyz = xgb.XGBClassifier()
model_seasonal = xgb.XGBClassifier()

# Fit models
model_xyz.fit(X_train, y_train_xyz)
model_seasonal.fit(X_train, y_train_seasonal)

# Predict probabilities
probabilities_xyz = model_xyz.predict_proba(X_test)[:, 1]  # Probability of class 1 (yes)
probabilities_seasonal = model_seasonal.predict_proba(X_test)[:, 1]  # Probability of class 1 (yes)

# Evaluate performance (example using ROC AUC score)
# auc_xyz = roc_auc_score(y_test, probabilities_xyz)
# auc_seasonal = roc_auc_score(y_test, probabilities_seasonal)

In [206]:
probabilities_xyz

array([0.21628222, 0.00475895, 0.5074692 , ..., 0.04867297, 0.05740806,
       0.11755202], dtype=float32)

In [207]:
probabilities_seasonal

array([0.81089187, 0.00492992, 0.07211391, ..., 0.08555594, 0.08623747,
       0.09514356], dtype=float32)

In [225]:
df_sf_pred = df_sf_filtered.copy()

In [226]:
df_sf_pred['prob_xyz'] = probabilities_xyz
df_sf_pred['prob_seasonal'] = probabilities_seasonal

In [227]:
df_sf_pred.drop(['xyz_vaccine', 'seasonal_vaccine','h1n1_vaccine'], axis=1)

Unnamed: 0,respondent_id,prob_xyz,prob_seasonal
0,26707,0.216282,0.810892
1,26708,0.004759,0.004930
2,26710,0.507469,0.072114
3,26711,0.217558,0.711769
4,26712,0.981795,0.995127
...,...,...,...
11934,53407,0.022837,0.088233
11935,53408,0.110981,0.191398
11936,53409,0.048673,0.085556
11937,53411,0.057408,0.086237


In [216]:
df_sf_pred

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine,xyz_vaccine,prob_xyz,prob_seasonal
0,26707,0.5,0.7,0.5,0.216282,0.810892
1,26708,0.5,0.7,0.5,0.004759,0.004930
2,26710,0.5,0.7,0.5,0.507469,0.072114
3,26711,0.5,0.7,0.5,0.217558,0.711769
4,26712,0.5,0.7,0.5,0.981795,0.995127
...,...,...,...,...,...,...
11934,53407,0.5,0.7,0.5,0.022837,0.088233
11935,53408,0.5,0.7,0.5,0.110981,0.191398
11936,53409,0.5,0.7,0.5,0.048673,0.085556
11937,53411,0.5,0.7,0.5,0.057408,0.086237


In [219]:
submission = pd.DataFrame()

In [220]:
submission['h1n1_vaccine'] = df_sf_pred['prob_xyz']
submission['seasonal_vaccine'] = df_sf_pred['prob_seasonal']

In [221]:
submission.to_csv('submission.csv', index=False)

In [222]:
submission

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0.216282,0.810892
1,0.004759,0.004930
2,0.507469,0.072114
3,0.217558,0.711769
4,0.981795,0.995127
...,...,...
11934,0.022837,0.088233
11935,0.110981,0.191398
11936,0.048673,0.085556
11937,0.057408,0.086237


In [223]:
import os

In [235]:
from google.colab import files

files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
submission2 = pd.DataFrame()
submission2['respondent_id'] = df_sf_pred['respondent_id']
submission2['h1n1_vaccine'] = df_sf_pred['prob_xyz']
submission2['seasonal_vaccine'] = df_sf_pred['prob_seasonal']

In [None]:
submission2.to_csv('submission2.csv', index=False)
files.download('submission2.csv')