In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the three CSV inputs from the current working directory:
# training features, test features and training labels.
df_features = pd.read_csv('training_set_features.csv')
df_test_features = pd.read_csv('test_set_features.csv')
df_labels = pd.read_csv('training_set_labels.csv')
# Preview the first rows of the training features.
df_features.head()

In [None]:
# Number of missing values in each training-feature column.
df_features.isna().sum()

In [None]:
# Summary of the raw training features (dtypes and non-null counts per column).
df_features.info()

In [None]:
# Visualise where the missing values sit: one cell per (row, column) entry.
missing_mask = df_features.isna()
sns.heatmap(missing_mask)

In [None]:
# Respondent counts per income bracket, split by housing situation.
sns.countplot(data=df_features, x='income_poverty', hue='rent_or_own')

In [None]:
# Respondent counts per education level, split by income bracket.
# countplot is used (as in the neighbouring cells) because only categorical
# variables are supplied; barplot needs a numeric variable to aggregate.
sns.countplot(x='education', hue='income_poverty', data=df_features, palette='magma')

In [None]:
# Respondent counts per age group, split by marital status.
sns.countplot(data=df_features, x='age_group', hue='marital_status')

In [None]:
# List the column names of the raw training features.
df_features.columns

In [None]:


def _fill_with_mode(df, col):
    """Fill missing entries of ``df[col]`` with the column's most frequent value.

    The column is reassigned on ``df`` (no chained ``inplace=True`` call), so the
    fill also takes effect when pandas Copy-on-Write is active. A column with no
    observed values has no mode and is left untouched instead of raising.
    """
    modes = df[col].mode()
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


def _impute_from_rule(df, target, source, source_values, if_match, otherwise):
    """Fill missing ``df[target]`` entries from a rule on ``df[source]``.

    A missing target becomes ``if_match`` when the row's ``source`` value is in
    ``source_values`` and ``otherwise`` when it is not; observed targets are kept.
    """
    fallback = np.where(df[source].isin(source_values), if_match, otherwise)
    df[target] = np.where(df[target].isna(), fallback, df[target])


def fill_missing_values(df):
    """Impute missing values in the survey feature frame.

    The frame is modified in place and also returned.

    Rules applied, in order:
      * ``census_msa`` -> the literal string ``'null'``;
        ``employment_status`` -> ``'unemployed'``.
      * ``income_poverty`` -> its most frequent value.
      * ``rent_or_own`` -> ``'Own'`` for the two above-poverty income brackets,
        otherwise ``'Rent'``.
      * ``marital_status`` -> ``'Married'`` for the ``'65+ Years'`` and
        ``'18 - 34 Years'`` age groups, otherwise ``'Not Married'``.
        NOTE(review): this mapping is kept exactly as written; confirm that it
        is the intended heuristic.
      * ``education`` -> ``'College Graduate'`` for the two above-poverty income
        brackets, otherwise ``'12 Years'``.
      * The behavioural / opinion columns listed below (those present in the
        frame) -> their most frequent value.
      * ``household_adults`` and ``household_children`` -> ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``census_msa``, ``employment_status``, ``income_poverty``,
        ``rent_or_own``, ``marital_status``, ``age_group``, ``education``,
        ``household_adults`` and ``household_children``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    df['census_msa'] = df['census_msa'].fillna('null')
    df['employment_status'] = df['employment_status'].fillna('unemployed')

    # income_poverty must be filled first: two of the rules below read it.
    _fill_with_mode(df, 'income_poverty')

    above_poverty = ['<= $75,000, Above Poverty', '> $75,000']
    _impute_from_rule(df, 'rent_or_own', 'income_poverty', above_poverty, 'Own', 'Rent')
    _impute_from_rule(
        df, 'marital_status', 'age_group',
        ['65+ Years', '18 - 34 Years'], 'Married', 'Not Married',
    )
    _impute_from_rule(
        df, 'education', 'income_poverty', above_poverty, 'College Graduate', '12 Years',
    )

    mode_columns = [
        'xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings',
        'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
        'chronic_med_condition', 'child_under_6_months', 'health_worker', 'opinion_xyz_vacc_effective',
        'opinion_xyz_risk', 'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
        'opinion_seas_risk', 'opinion_seas_sick_from_vacc'
    ]
    for col in mode_columns:
        if col in df.columns:
            _fill_with_mode(df, col)

    # Fill missing values with 0 for household columns
    zero_columns = ['household_adults', 'household_children']
    df[zero_columns] = df[zero_columns].fillna(0)
    return df

def drop_unnecessary_columns(df):
    """Remove, in place, the columns that are not used as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without the dropped columns.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    """
    unused = (
        'employment_industry',
        'employment_occupation',
        'hhs_geo_region',
        'health_insurance',
        'respondent_id',
    )
    df.drop(columns=list(unused), inplace=True)
    return df

def preprocess_features(df, encoder=None):
    """Turn a raw feature frame into a fully numeric model input.

    Works on a copy of ``df``: drops the unused columns, imputes missing values,
    one-hot encodes every ``object`` column and concatenates the encoded columns
    with the remaining (non-object) ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features; left unmodified.
    encoder : OneHotEncoder, optional
        Encoder to use. When omitted, a fresh ``OneHotEncoder()`` is fitted on
        this frame (the original behaviour). When an encoder that is already
        fitted (has ``categories_``) is passed, it is only applied, so several
        frames can be encoded into the same set of columns; an unfitted encoder
        is fitted on this frame.

    Returns
    -------
    pandas.DataFrame
        One-hot columns followed by the non-object columns, indexed like ``df``.
    """
    df = df.copy()
    df = drop_unnecessary_columns(df)
    df = fill_missing_values(df)

    categorical_cols = df.select_dtypes(include=['object']).columns
    numeric_cols = df.select_dtypes(exclude=['object']).columns

    if encoder is None:
        encoder = OneHotEncoder()
    if hasattr(encoder, 'categories_'):
        encoded_data = encoder.transform(df[categorical_cols])
    else:
        encoded_data = encoder.fit_transform(df[categorical_cols])
    # The default encoder returns a sparse matrix; a caller-supplied one may not.
    if hasattr(encoded_data, 'toarray'):
        encoded_data = encoded_data.toarray()

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex here would misalign rows whenever df is not 0..n-1 indexed.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index,
    )

    preprocessed_df = pd.concat([encoded_df, df[numeric_cols]], axis=1)
    return preprocessed_df


In [None]:
# Summary of the training features as they stand before preprocessing is applied.
df_features.info()

In [None]:
# NOTE: this cell rebinds df_features / df_test_features to their preprocessed
# versions, so it is not re-runnable without reloading the CSV files first.
df_features = preprocess_features(df_features)
df_test_features = preprocess_features(df_test_features)

# The two frames are one-hot encoded independently, so their columns can differ
# when a category is absent from one of them. Align the test frame to the
# training columns (same set, same order); categories unseen in the test data
# become all-zero columns.
df_test_features = df_test_features.reindex(columns=df_features.columns, fill_value=0.0)

In [None]:
# Sanity check: list any columns that still have object dtype after preprocessing.
df_features.select_dtypes(include='object').columns

In [None]:
# Preview the first rows of the preprocessed training features.
df_features.head()

In [None]:
# Target columns, one per vaccine.
# NOTE(review): rows of df_labels are assumed to be in the same order as the
# rows of df_features — confirm, since respondent_id is dropped from the features.
y_train_xyz = df_labels['xyz_vaccine']
y_train_seasonal = df_labels['seasonal_vaccine']

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Hold out a shuffled slice of the training data for validation. Validating on
# the very rows the model is trained on would make val_loss (and therefore
# early stopping) track the training fit instead of generalisation.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

features_xyz = df_features.to_numpy(dtype='float32')
labels_xyz = y_train_xyz.to_numpy(dtype='float32')

shuffled_rows = np.random.default_rng(SPLIT_SEED).permutation(len(features_xyz))
n_validation = int(len(features_xyz) * VALIDATION_FRACTION)
val_rows, train_rows = shuffled_rows[:n_validation], shuffled_rows[n_validation:]

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(df_features.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once the held-out loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training xyz_vaccine model
history_xyz = model.fit(
    features_xyz[train_rows], labels_xyz[train_rows],
    validation_data=(features_xyz[val_rows], labels_xyz[val_rows]),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping]
)


In [None]:
# Summary of the preprocessed training features that are fed to the model.
# (x_train is not defined anywhere in this notebook; df_features is the frame
# passed to model.fit.)
df_features.info()

In [None]:
# Shape of the xyz-vaccine target passed to model.fit.
# (y_train is not defined anywhere in this notebook.)
y_train_xyz.shape

In [None]:
# Per-column dtypes of the training features.
df_features.dtypes

In [None]:
# Check whether any missing value remains in the model inputs or the target.
# Each line prints a single boolean. (x_train / y_train are not defined in this
# notebook; the objects actually used for training are checked instead.)
print(df_features.isna().any().any())
print(y_train_xyz.isna().any())