In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("vaccine_prediction.csv")

# EDA
print("Dataset shape:", df.shape)
print("Column names:")
print(df.columns)
print("Sample data:")
print(df.head())
print("Summary statistics:")
print(df.describe())
print("Missing values:")
print(df.isnull().sum())

# Data preprocessing
# Handling missing values
df.fillna(method='ffill', inplace=True)

# Encoding categorical variables
df = pd.get_dummies(df, drop_first=True)

# Splitting the dataset into features and target variable
X = df.drop('h1n1_vaccine', axis=1)
y = df['h1n1_vaccine']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training logistic regression models
# Using Maximum Likelihood Estimation (MLE)
log_reg_mle = LogisticRegression(max_iter=1000)
log_reg_mle.fit(X_train, y_train)

# Using Stochastic Gradient Descent (SGD)
log_reg_sgd = LogisticRegression(solver='sag', max_iter=1000)
log_reg_sgd.fit(X_train, y_train)

# Predictions
y_pred_mle = log_reg_mle.predict(X_test)
y_pred_sgd = log_reg_sgd.predict(X_test)

# Evaluation
print("Accuracy using MLE:", accuracy_score(y_test, y_pred_mle))
print("Classification Report using MLE:\n", classification_report(y_test, y_pred_mle))
print("Accuracy using SGD:", accuracy_score(y_test, y_pred_sgd))
print("Classification Report using SGD:\n", classification_report(y_test, y_pred_sgd))


Dataset shape: (26707, 34)
Column names:
Index(['unique_id', 'h1n1_worry', 'h1n1_awareness', 'antiviral_medication',
       'contact_avoidance', 'bought_face_mask', 'wash_hands_frequently',
       'avoid_large_gatherings', 'reduced_outside_home_cont',
       'avoid_touch_face', 'dr_recc_h1n1_vacc', 'dr_recc_seasonal_vacc',
       'chronic_medic_condition', 'cont_child_undr_6_mnths',
       'is_health_worker', 'has_health_insur', 'is_h1n1_vacc_effective',
       'is_h1n1_risky', 'sick_from_h1n1_vacc', 'is_seas_vacc_effective',
       'is_seas_risky', 'sick_from_seas_vacc', 'age_bracket', 'qualification',
       'race', 'sex', 'income_level', 'marital_status', 'housing_status',
       'employment', 'census_msa', 'no_of_adults', 'no_of_children',
       'h1n1_vaccine'],
      dtype='object')
Sample data:
   unique_id  h1n1_worry  h1n1_awareness  antiviral_medication  \
0          0         1.0             0.0                   0.0   
1          1         3.0             2.0               

  df.fillna(method='ffill', inplace=True)


Accuracy using MLE: 0.8363908648446274
Classification Report using MLE:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90      4212
           1       0.69      0.41      0.51      1130

    accuracy                           0.84      5342
   macro avg       0.77      0.68      0.71      5342
weighted avg       0.82      0.84      0.82      5342

Accuracy using SGD: 0.8363908648446274
Classification Report using SGD:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90      4212
           1       0.69      0.41      0.51      1130

    accuracy                           0.84      5342
   macro avg       0.77      0.68      0.71      5342
weighted avg       0.82      0.84      0.82      5342

