In [12]:
# Data Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

df = pd.read_csv(r'C:\Users\user\Desktop\RemoteProjects\InterninfoTech\datasets\vaccine_prediction.csv') 


# Separate the features and target variable
X = df.drop(columns=['unique_id', 'h1n1_vaccine'])
y = df['h1n1_vaccine']

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X.select_dtypes(include=['float64', 'int64']))
X[X.select_dtypes(include=['float64', 'int64']).columns] = X_imputed

# Encode categorical variables
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Check the preprocessed data
print("First 5 rows of training features:")
print(pd.DataFrame(X_train).head())

print("First 5 rows of testing features:")
print(pd.DataFrame(X_test).head())

print("First 5 values of training target:")
print(y_train.head())

print("First 5 values of testing target:")
print(y_test.head())


First 5 rows of training features:
         0         1         2         3         4         5         6   \
0  0.420798 -0.429869 -0.228099  0.618990 -0.275171  0.463221 -0.747794   
1  0.420798 -0.429869  4.396083  0.618990  3.636446  0.463221  1.341793   
2 -1.782488 -0.429869 -0.228099 -1.627924 -0.275171  0.463221 -0.747794   
3 -0.680845 -0.429869 -0.228099 -1.627924 -0.275171 -2.162406 -0.747794   
4 -0.680845 -2.051971 -0.228099 -1.627924 -0.275171  0.463221 -0.747794   

         7         8         9   ...        22        23        24        25  \
0  1.403111  0.694382 -0.556479  ...  0.969965  0.465004 -0.822161  0.947574   
1  1.403111  0.694382 -0.556479  ...  0.127417  0.465004  1.216306  0.048319   
2  1.403111 -1.447021 -0.556479  ...  1.812512  0.465004  1.216306  1.846828   
3 -0.714675 -1.447021 -0.556479  ...  0.127417 -0.624817 -0.822161  1.846828   
4 -0.714675 -1.447021 -0.556479  ...  0.127417 -2.804460 -0.822161 -0.850935   

         26        27        28  