In [22]:
#Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
#Loading the data
# Keep the CSV location in a named variable so the read call stays short
# and the path is easy to spot and change.
vaccine_csv_path = "C:/Users/User/Documents/DATA ANALYST/Info Tech Projects/Task 3 ML/Vaccine usage Prediction/vaccine_prediction.csv"
Vaccine = pd.read_csv(vaccine_csv_path)

Performing exploratory data analysis

In [3]:
#Checking first rows
# head() shows the first five rows by default.
Vaccine.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,White,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0


In [4]:
#Checking missing values and data types
# info() prints, for every column, its non-null count and dtype; a column
# whose non-null count is below the total number of entries has missing values.
Vaccine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  26707 non-null  int64  
 1   h1n1_worry                 26615 non-null  float64
 2   h1n1_awareness             26591 non-null  float64
 3   antiviral_medication       26636 non-null  float64
 4   contact_avoidance          26499 non-null  float64
 5   bought_face_mask           26688 non-null  float64
 6   wash_hands_frequently      26665 non-null  float64
 7   avoid_large_gatherings     26620 non-null  float64
 8   reduced_outside_home_cont  26625 non-null  float64
 9   avoid_touch_face           26579 non-null  float64
 10  dr_recc_h1n1_vacc          24547 non-null  float64
 11  dr_recc_seasonal_vacc      24547 non-null  float64
 12  chronic_medic_condition    25736 non-null  float64
 13  cont_child_undr_6_mnths    25887 non-null  flo

In [7]:
#Checking distribution of target variable
# Count how many rows fall into each class of the target column; the bare
# name on the last line makes the notebook display the result.
target_counts = Vaccine['h1n1_vaccine'].value_counts()
target_counts

h1n1_vaccine
0    21033
1     5674
Name: count, dtype: int64

Data Preprocessing

In [9]:
#feature selection
# Columns that must not be used as predictors: the row identifier and the
# target itself.
non_feature_columns = ['unique_id', 'h1n1_vaccine']

# x: predictor columns, y: the target column.
x = Vaccine.drop(columns=non_feature_columns)
y = Vaccine['h1n1_vaccine']

In [10]:
#Defining categorical and numerical features
# Categorical columns are listed by name.
categorical_features = [
    'age_bracket',
    'qualification',
    'race',
    'sex',
    'income_level',
    'marital_status',
    'housing_status',
    'employment',
    'census_msa',
]

# Numerical columns are discovered from the dtypes of the feature frame.
numeric_dtypes = ['int64', 'float64']
numerical_features = list(x.select_dtypes(include=numeric_dtypes).columns)

In [28]:
#Creating Data Preprocessing pipeline
# Numeric columns: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from failing
# on categories that were not present when the encoder was fitted.
# Every step has to be a (name, estimator) tuple -- the name and the estimator
# must sit inside the same pair of parentheses.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [16]:
#Splitting data into train and test sets
# train_test_split returns the pieces grouped per input array, i.e.
# (x_train, x_test, y_train, y_test) -- the names must be unpacked in that
# order or features and labels get mixed up.
# test_size=0.2 holds out 20% of the rows; random_state fixes the shuffle so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
#Logistic Regression with Maximum Likelihood Estimator
# LogisticRegression is imported in the import cell at the top of the notebook.
# The preprocessing and the classifier are chained in one Pipeline so the
# imputers, scaler and encoder are fitted on the training data only and then
# re-applied unchanged to the test data.
model_mle = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])
model_mle.fit(x_train, y_train)

# Evaluate on the held-out test set.
y_pred_mle = model_mle.predict(x_test)
print('Logistic Regression (MLE) Accuracy:', accuracy_score(y_test, y_pred_mle))
print(classification_report(y_test, y_pred_mle))

In [None]:
#Logistic Regression with SGD
# loss='log_loss' makes SGDClassifier optimise the logistic-regression
# objective; its default loss ('hinge') would fit a linear SVM instead.
# (scikit-learn releases before 1.1 spell this option loss='log'.)
# random_state fixes the optimiser's data shuffling so results are reproducible.
model_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42))
])

model_sgd.fit(x_train, y_train)

# Evaluate on the held-out test set (the variable is lower-case x_test).
y_pred_sgd = model_sgd.predict(x_test)
print("Logistic Regression (SGD) Accuracy:", accuracy_score(y_test, y_pred_sgd))
print(classification_report(y_test, y_pred_sgd))