### This notebook contains the code to generate the preprocessed data for the "Flu Shot Learning: Predict H1N1 and Seasonal Flu Vaccines" competition.

We deal with categorical variables, one-hot encode and impute missing values.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [3]:
# Start by loading the data. X and y are already split in two datasets
X = pd.read_csv("training_set_features.csv")
y = pd.read_csv("training_set_labels.csv")
X.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [4]:
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)
# NOTE: you need to split first in this case because we will impute with the mean and mode, so if you only split after, you will have data leakage

# For preprocessing, we'll fill in missing values and encode categorical variables
X_train_prep = X_train.copy()
X_valid_prep = X_valid.copy()

In [5]:
# To fill in missing values, we differentiate between numerical and categorical columns
# We'll fill in missing values for numerical columns with the mean of the column
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]
imputer_num = SimpleImputer(strategy="mean")
X_train_prep[numerical_cols] = imputer_num.fit_transform(X_train[numerical_cols])
X_valid_prep[numerical_cols] = imputer_num.transform(X_valid[numerical_cols])  # only transform!

# For categorical columns, we distinguish ordinal and nominal columns
# For the nominal, we add the "Unknown" category. For ordinal, we fill in with the mode (as unknown cannot be sorted in the ranking)
nominal_cols = ["race", "sex", "marital_status", "rent_or_own", "employment_status", "hhs_geo_region", 
                "census_msa", "employment_industry", "employment_occupation"]
for col in nominal_cols:
    X_train_prep[col].fillna("Unknown", inplace=True)
    X_valid_prep[col].fillna("Unknown", inplace=True)

ordinal_cols = [cname for cname in X_train.columns if (X_train[cname].dtype == "object" and cname not in nominal_cols)]
imputer_ord = SimpleImputer(strategy="most_frequent")
X_train_prep[ordinal_cols] = imputer_ord.fit_transform(X_train[ordinal_cols])
X_valid_prep[ordinal_cols] = imputer_ord.transform(X_valid[ordinal_cols])

#print(X_train_prep.isnull().sum())
#print("\n")
#print(X_valid_prep.isnull().sum())
# Sort by respondent_id
X_train_prep = X_train_prep.sort_values("respondent_id")
X_valid_prep = X_valid_prep.sort_values("respondent_id")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_prep[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_valid_prep[col].fillna("Unknown", inplace=True)


In [6]:
# Then there are a number of columns that are categorical to deal with in preprocessing
# Start with ordinal ones, where a specific order is implied

# First age groups
#print(X_train_prep["age_group"].unique())
age_group_order = ["18 - 34 Years", "35 - 44 Years", "45 - 54 Years", "55 - 64 Years", "65+ Years"]
X_train_prep["age_group"] = pd.Categorical(X_train_prep["age_group"], categories=age_group_order, ordered=True)
X_valid_prep["age_group"] = pd.Categorical(X_valid_prep["age_group"], categories=age_group_order, ordered=True)

X_train_prep["age_group"] = X_train_prep["age_group"].cat.codes
X_valid_prep["age_group"] = X_valid_prep["age_group"].cat.codes
X_train_prep[["respondent_id", "age_group"]].head()

Unnamed: 0,respondent_id,age_group
0,0.0,3
1,1.0,1
2,2.0,0
3,3.0,4
4,4.0,2


In [7]:
# education
#print(X_train_prep["education"].unique())
edu_order = ["< 12 Years", "12 Years", "Some College", "College Graduate"]
X_train_prep["education"] = pd.Categorical(X_train_prep["education"], categories=edu_order, ordered=True)
X_valid_prep["education"] = pd.Categorical(X_valid_prep["education"], categories=edu_order, ordered=True)

X_train_prep["education"] = X_train_prep["education"].cat.codes
X_valid_prep["education"] = X_valid_prep["education"].cat.codes
X_train_prep[["respondent_id", "education"]].head()

Unnamed: 0,respondent_id,education
0,0.0,0
1,1.0,1
2,2.0,3
3,3.0,1
4,4.0,2


In [8]:
# income_poverty
#print(X_train_prep["income_poverty"].unique())
inc_order = ["Below Poverty", "<= $75,000, Above Poverty", "> $75,000"]
X_train_prep["income_poverty"] = pd.Categorical(X_train_prep["income_poverty"], categories=inc_order, ordered=True)
X_valid_prep["income_poverty"] = pd.Categorical(X_valid_prep["income_poverty"], categories=inc_order, ordered=True)

X_train_prep["income_poverty"] = X_train_prep["income_poverty"].cat.codes
X_valid_prep["income_poverty"] = X_valid_prep["income_poverty"].cat.codes
X_train_prep[["respondent_id", "income_poverty"]].head()

Unnamed: 0,respondent_id,income_poverty
0,0.0,0
1,1.0,0
2,2.0,1
3,3.0,0
4,4.0,1


In [9]:
# Next are nominal columns, where no order is implied. We check if there are not too many unique values, and one-hot encode them if so
#for col in nominal_cols:
#    print(col, X_train_prep[col].nunique())

# At most 23, so we can one-hot encode them
X_train_prep = pd.get_dummies(X_train_prep, columns=nominal_cols, dtype=float)
X_valid_prep = pd.get_dummies(X_valid_prep, columns=nominal_cols, dtype=float)

# Finally also drop the id column
X_train_prep.drop(columns=["respondent_id"], inplace=True)
X_valid_prep.drop(columns=["respondent_id"], inplace=True)
X_train_prep.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.221662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Sort y by respondent_id and then drop the column
y_train = y_train.sort_values("respondent_id")
y_valid = y_valid.sort_values("respondent_id")
y_train = y_train.drop("respondent_id", axis=1)
y_valid = y_valid.drop("respondent_id", axis=1)
y_train.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0


In [11]:
# Same for the test data
X_test = pd.read_csv("test_set_features.csv")

# Preprocess the test data
X_test_prep = X_test.copy()

# Fill in missing values
X_test_prep[numerical_cols] = imputer_num.transform(X_test[numerical_cols])

for col in nominal_cols:
    X_test_prep[col].fillna("Unknown", inplace=True)

X_test_prep[ordinal_cols] = imputer_ord.transform(X_test[ordinal_cols])

# Sort by respondent_id
X_test_prep = X_test_prep.sort_values("respondent_id")

# Preprocess the ordinal columns
X_test_prep["age_group"] = pd.Categorical(X_test_prep["age_group"], categories=age_group_order, ordered=True)
X_test_prep["age_group"] = X_test_prep["age_group"].cat.codes

X_test_prep["education"] = pd.Categorical(X_test_prep["education"], categories=edu_order, ordered=True)
X_test_prep["education"] = X_test_prep["education"].cat.codes

X_test_prep["income_poverty"] = pd.Categorical(X_test_prep["income_poverty"], categories=inc_order, ordered=True)
X_test_prep["income_poverty"] = X_test_prep["income_poverty"].cat.codes

# One-hot encode the nominal columns
X_test_prep = pd.get_dummies(X_test_prep, columns=nominal_cols, dtype=int)
X_test_prep.drop(columns=["respondent_id"], inplace=True) # Drop the id column
X_test_prep.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_prep[col].fillna("Unknown", inplace=True)


Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Save to csv
X_train_prep.to_csv("X_train_prep.csv", index=False)
X_valid_prep.to_csv("X_valid_prep.csv", index=False)
X_test_prep.to_csv("X_test_prep.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_valid.to_csv("y_valid.csv", index=False)