## H1N1 Vaccine Prediction

### Problem Statement

Predict how likely it is that people will take the H1N1 flu vaccine, using Logistic Regression.


In [1]:
## Import the libraries
import os

import numpy as np
import pandas as pd

In [2]:
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides any pandas or scikit-learn diagnostics raised by later cells.
warnings.filterwarnings('ignore')

In [3]:
## Load the data
# The CSV location can be overridden through the H1N1_DATA_PATH environment
# variable; when it is unset, the original local path is used, so existing
# setups keep working unchanged.
DATA_PATH = os.environ.get(
    "H1N1_DATA_PATH",
    r"D:\Python Project\Data set\H1N1_Dataset\167541607652060ec2379f8793842aa5b168a_.csv",
)
df = pd.read_csv(DATA_PATH)

In [4]:
df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,White,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0


In [5]:
df.shape

(26707, 34)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  26707 non-null  int64  
 1   h1n1_worry                 26615 non-null  float64
 2   h1n1_awareness             26591 non-null  float64
 3   antiviral_medication       26636 non-null  float64
 4   contact_avoidance          26499 non-null  float64
 5   bought_face_mask           26688 non-null  float64
 6   wash_hands_frequently      26665 non-null  float64
 7   avoid_large_gatherings     26620 non-null  float64
 8   reduced_outside_home_cont  26625 non-null  float64
 9   avoid_touch_face           26579 non-null  float64
 10  dr_recc_h1n1_vacc          24547 non-null  float64
 11  dr_recc_seasonal_vacc      24547 non-null  float64
 12  chronic_medic_condition    25736 non-null  float64
 13  cont_child_undr_6_mnths    25887 non-null  flo

In [7]:
## Missing values: count the nulls per column and keep only the columns that have any
miss_values = df.isnull().sum()
mv_df = miss_values[miss_values > 0].to_frame(name='Count')

In [8]:
# Share of missing rows per column, as a percentage of all rows in df
mv_df['Percent'] = (mv_df['Count'] / len(df) * 100).round(2)

In [9]:
mv_df

Unnamed: 0,Count,Percent
h1n1_worry,92,0.34
h1n1_awareness,116,0.43
antiviral_medication,71,0.27
contact_avoidance,208,0.78
bought_face_mask,19,0.07
wash_hands_frequently,42,0.16
avoid_large_gatherings,87,0.33
reduced_outside_home_cont,82,0.31
avoid_touch_face,128,0.48
dr_recc_h1n1_vacc,2160,8.09


In [10]:
## List of the columns that have more than 5% missing values
cols_mv5 = mv_df.index[mv_df['Percent'] > 5].tolist()

In [11]:
cols_mv5

['dr_recc_h1n1_vacc',
 'dr_recc_seasonal_vacc',
 'has_health_insur',
 'qualification',
 'income_level',
 'marital_status',
 'housing_status',
 'employment']

In [12]:
df.dtypes

unique_id                      int64
h1n1_worry                   float64
h1n1_awareness               float64
antiviral_medication         float64
contact_avoidance            float64
bought_face_mask             float64
wash_hands_frequently        float64
avoid_large_gatherings       float64
reduced_outside_home_cont    float64
avoid_touch_face             float64
dr_recc_h1n1_vacc            float64
dr_recc_seasonal_vacc        float64
chronic_medic_condition      float64
cont_child_undr_6_mnths      float64
is_health_worker             float64
has_health_insur             float64
is_h1n1_vacc_effective       float64
is_h1n1_risky                float64
sick_from_h1n1_vacc          float64
is_seas_vacc_effective       float64
is_seas_risky                float64
sick_from_seas_vacc          float64
age_bracket                   object
qualification                 object
race                          object
sex                           object
income_level                  object
m

In [13]:
## Collect the labels of all non-object (numeric) columns
num_cols = df.select_dtypes(exclude=['object']).columns

In [14]:
num_cols

Index(['unique_id', 'h1n1_worry', 'h1n1_awareness', 'antiviral_medication',
       'contact_avoidance', 'bought_face_mask', 'wash_hands_frequently',
       'avoid_large_gatherings', 'reduced_outside_home_cont',
       'avoid_touch_face', 'dr_recc_h1n1_vacc', 'dr_recc_seasonal_vacc',
       'chronic_medic_condition', 'cont_child_undr_6_mnths',
       'is_health_worker', 'has_health_insur', 'is_h1n1_vacc_effective',
       'is_h1n1_risky', 'sick_from_h1n1_vacc', 'is_seas_vacc_effective',
       'is_seas_risky', 'sick_from_seas_vacc', 'no_of_adults',
       'no_of_children', 'h1n1_vaccine'],
      dtype='object')

In [15]:
# Collect the labels of the object-typed (categorical) columns
cat_cols = df.select_dtypes(include=['object']).columns

In [16]:
cat_cols

Index(['age_bracket', 'qualification', 'race', 'sex', 'income_level',
       'marital_status', 'housing_status', 'employment', 'census_msa'],
      dtype='object')

In [17]:
## Creating a numeric dataframe using numerical columns
# Take an explicit copy so that the in-place cleaning steps applied to num_df
# below operate on an independent frame rather than on a selection of df.
num_df = df[num_cols].copy()

In [18]:
num_df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,has_health_insur,is_h1n1_vacc_effective,is_h1n1_risky,sick_from_h1n1_vacc,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,3.0,1.0,1.0,4.0,1.0,2.0,2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0,0


In [19]:
## Let us inspect the has_health_insur column
# value_counts() leaves out missing values by default, so only the recorded
# 0.0 / 1.0 answers are counted here.
num_df.has_health_insur.value_counts()

1.0    12697
0.0     1736
Name: has_health_insur, dtype: int64

From the missing value analysis, we can observe that almost 46% of the values in the has_health_insur column are missing. We assume that respondents with a missing value have no health insurance, so we treat those missing values as 0 (no insurance).

In [20]:
# Treat a missing insurance answer as "no insurance" (0.0). The filled column
# is assigned back explicitly: an in-place fillna on a column pulled out of the
# frame is chained assignment, which is not guaranteed to update num_df.
num_df['has_health_insur'] = num_df['has_health_insur'].fillna(0.0)

In [21]:
# Remaining null counts per numeric column, largest first
miss_values = num_df.isna().sum()
miss_values[miss_values.gt(0)].sort_values(ascending=False)

dr_recc_h1n1_vacc            2160
dr_recc_seasonal_vacc        2160
chronic_medic_condition       971
cont_child_undr_6_mnths       820
is_health_worker              804
sick_from_seas_vacc           537
is_seas_risky                 514
is_seas_vacc_effective        462
sick_from_h1n1_vacc           395
is_h1n1_vacc_effective        391
is_h1n1_risky                 388
no_of_adults                  249
no_of_children                249
contact_avoidance             208
avoid_touch_face              128
h1n1_awareness                116
h1n1_worry                     92
avoid_large_gatherings         87
reduced_outside_home_cont      82
antiviral_medication           71
wash_hands_frequently          42
bought_face_mask               19
dtype: int64

In [22]:
num_df.shape

(26707, 25)

In [23]:
## Keep only the rows that have no missing value in any column
num_df = num_df.dropna()

In [24]:
num_df.shape

(22976, 25)

In [25]:
## Let us drop the unique_id column too.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
num_df = num_df.drop(columns='unique_id', errors='ignore')

In [26]:
num_df.head()

Unnamed: 0,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,dr_recc_h1n1_vacc,...,has_health_insur,is_h1n1_vacc_effective,is_h1n1_risky,sick_from_h1n1_vacc,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc,no_of_adults,no_of_children,h1n1_vaccine
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0,0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0,0
5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,5.0,2.0,1.0,5.0,4.0,4.0,2.0,3.0,0


### Building the base model

In [27]:
## Importing model building Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [28]:
# Target vector and feature matrix for the base model
y = num_df['h1n1_vaccine']
X = num_df.drop(columns=['h1n1_vaccine'])

In [29]:
# Positional alternative: every column except the last one
X1 = num_df.iloc[:, :-1]

In [30]:
X.shape, X1.shape

((22976, 23), (22976, 23))

In [31]:
y.value_counts()

0    17791
1     5185
Name: h1n1_vaccine, dtype: int64

In [32]:
## Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [33]:
X_train.shape, X_test.shape

((18380, 23), (4596, 23))

In [34]:
y_train.shape, y_test.shape

((18380,), (4596,))

In [35]:
# Base logistic regression estimator, created with default hyperparameters
lr_model = LogisticRegression()

In [36]:
# Fit the base model on the training split
lr_model.fit(X_train, y_train)

In [37]:
# Predicted class labels for the held-out test split
lr_pred = lr_model.predict(X_test)

In [38]:
lr_pred[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [39]:
# Confusion matrix on the test split; scikit-learn puts the true classes on
# the rows and the predicted classes on the columns.
print(confusion_matrix(y_test, lr_pred))

[[3389  170]
 [ 532  505]]


In [40]:
# Test-set accuracy as a percentage. The built-in round() is used because it
# works whether accuracy_score returns a NumPy scalar or a plain Python float
# (a plain float has no .round method).
print("The Model Performance is:",round(accuracy_score(y_test, lr_pred) * 100, 2),'%')

The Model Performance is: 84.73 %


In [41]:
# Per-class precision, recall and F1 for the base model on the test split
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.86      0.95      0.91      3559
           1       0.75      0.49      0.59      1037

    accuracy                           0.85      4596
   macro avg       0.81      0.72      0.75      4596
weighted avg       0.84      0.85      0.83      4596



In [42]:
# Map each feature name to its fitted coefficient, rounded to two decimals.
# NOTE(review): the features were not scaled before fitting, so the magnitudes
# may not be directly comparable across features.
coeff_dict = {
    feature: coef
    for feature, coef in zip(X.columns, lr_model.coef_[0].round(2))
}
coeff_dict

{'h1n1_worry': 0.0,
 'h1n1_awareness': 0.26,
 'antiviral_medication': 0.05,
 'contact_avoidance': -0.07,
 'bought_face_mask': 0.15,
 'wash_hands_frequently': -0.01,
 'avoid_large_gatherings': -0.15,
 'reduced_outside_home_cont': 0.03,
 'avoid_touch_face': -0.02,
 'dr_recc_h1n1_vacc': 1.91,
 'dr_recc_seasonal_vacc': -0.52,
 'chronic_medic_condition': 0.11,
 'cont_child_undr_6_mnths': 0.27,
 'is_health_worker': 0.84,
 'has_health_insur': 1.42,
 'is_h1n1_vacc_effective': 0.57,
 'is_h1n1_risky': 0.42,
 'sick_from_h1n1_vacc': -0.0,
 'is_seas_vacc_effective': 0.13,
 'is_seas_risky': 0.16,
 'sick_from_seas_vacc': -0.1,
 'no_of_adults': 0.02,
 'no_of_children': -0.07}

From the coefficients of the independent variables, we can observe that some variables contribute very little to the model. So, we can drop those features and rebuild the model to see the effect of removing them.

### Building the model with reduced columns

In [94]:
# Keep the features whose rounded coefficient has a magnitude of at least 0.1
var_to_keep = [feature for feature, coef in coeff_dict.items() if abs(coef) >= 0.1]

In [95]:
var_to_keep

['h1n1_awareness',
 'bought_face_mask',
 'avoid_large_gatherings',
 'dr_recc_h1n1_vacc',
 'dr_recc_seasonal_vacc',
 'chronic_medic_condition',
 'cont_child_undr_6_mnths',
 'is_health_worker',
 'has_health_insur',
 'is_h1n1_vacc_effective',
 'is_h1n1_risky',
 'is_seas_vacc_effective',
 'is_seas_risky',
 'sick_from_seas_vacc']

In [96]:
# Reduced feature frame. An explicit copy is taken because a helper column is
# added to df_red later on; assigning to a bare selection of num_df would be
# ambiguous (SettingWithCopy).
df_red = num_df[var_to_keep].copy()

In [97]:
df_red.shape

(22976, 14)

In [98]:
df_red.head()

Unnamed: 0,h1n1_awareness,bought_face_mask,avoid_large_gatherings,dr_recc_h1n1_vacc,dr_recc_seasonal_vacc,chronic_medic_condition,cont_child_undr_6_mnths,is_health_worker,has_health_insur,is_h1n1_vacc_effective,is_h1n1_risky,is_seas_vacc_effective,is_seas_risky,sick_from_seas_vacc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,1.0,2.0
1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,2.0,4.0
3,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,3.0,5.0,4.0,1.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,1.0,4.0
5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,2.0,5.0,4.0,4.0


In [99]:
# X now refers to the same DataFrame object as df_red (no copy is made);
# y is unchanged from the base model.
X = df_red

In [100]:
# Same stratified 80/20 split and seed as the base model, on the reduced features
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [101]:
X_train.shape, X_test.shape

((18380, 14), (4596, 14))

In [102]:
y_train.shape, y_test.shape

((18380,), (4596,))

In [103]:
# Refit the same lr_model object on the reduced feature set; this replaces
# the base model's fit, so lr_model.coef_ now belongs to the reduced model.
lr_model.fit(X_train, y_train)

In [104]:
# Test-set predictions of the model refitted on the reduced features
lr_red_pred = lr_model.predict(X_test)

In [105]:
# Test-set accuracy of the reduced model as a percentage. The built-in round()
# works whether accuracy_score returns a NumPy scalar or a plain Python float
# (a plain float has no .round method).
print("The Model Performance is:",round(accuracy_score(y_test, lr_red_pred) * 100, 2),'%')

The Model Performance is: 84.68 %


In [106]:
print(confusion_matrix(y_test, lr_red_pred))

[[3390  169]
 [ 535  502]]


In [107]:
print(classification_report(y_test, lr_red_pred))

              precision    recall  f1-score   support

           0       0.86      0.95      0.91      3559
           1       0.75      0.48      0.59      1037

    accuracy                           0.85      4596
   macro avg       0.81      0.72      0.75      4596
weighted avg       0.84      0.85      0.83      4596



## Let us use the categorical data too

In [108]:
# Categorical columns plus the target. assign() builds a new frame in one
# step, so no column is set on a selection of df (no SettingWithCopy ambiguity).
cat_df = df[cat_cols].assign(target=df['h1n1_vaccine'])

In [109]:
# Null counts for the categorical frame, showing only the columns that have any
mv_cat = cat_df.isna().sum()
mv_cat[mv_cat.gt(0)]

qualification     1407
income_level      4423
marital_status    1408
housing_status    2042
employment        1463
dtype: int64

In [110]:
cat_df.shape

(26707, 10)

In [111]:
# Keep only the rows where every categorical column (and the target) is present
cat_df = cat_df.dropna()

In [112]:
cat_df.shape

(21880, 10)

In [113]:
# Expose the row labels as a regular 'index' column to merge on. assign()
# returns a new frame, so any other name still bound to the previous df_red
# object (such as X) does not silently gain this helper column, and the cell
# is safe to re-run.
df_red = df_red.assign(index=df_red.index)

In [114]:
# Expose the row labels of the categorical frame as an 'index' column as well
cat_df = cat_df.assign(index=cat_df.index)

In [115]:
# Inner merge on the helper 'index' column: only the rows present in both the
# categorical frame and the reduced numeric frame are kept.
com_df = pd.merge(cat_df, df_red, how='inner', on='index')

In [116]:
com_df.shape

(19642, 25)

In [117]:
# Check whether any nulls are left in the merged frame
mv_com = com_df.isna().sum()
mv_com[mv_com.gt(0)]

Series([], dtype: int64)

In [118]:
## Convert all categorical columns to numerical columns using get_dummies
# drop_first=True drops one level per category. dtype=int pins the indicator
# columns to integer 0/1; without it the dummy dtype depends on the pandas
# version (newer releases produce booleans).
com_df = pd.get_dummies(com_df, drop_first=True, dtype=int)

In [119]:
com_df.shape

(19642, 35)

In [120]:
com_df.columns

Index(['target', 'index', 'h1n1_awareness', 'bought_face_mask',
       'avoid_large_gatherings', 'dr_recc_h1n1_vacc', 'dr_recc_seasonal_vacc',
       'chronic_medic_condition', 'cont_child_undr_6_mnths',
       'is_health_worker', 'has_health_insur', 'is_h1n1_vacc_effective',
       'is_h1n1_risky', 'is_seas_vacc_effective', 'is_seas_risky',
       'sick_from_seas_vacc', 'age_bracket_35 - 44 Years',
       'age_bracket_45 - 54 Years', 'age_bracket_55 - 64 Years',
       'age_bracket_65+ Years', 'qualification_< 12 Years',
       'qualification_College Graduate', 'qualification_Some College',
       'race_Hispanic', 'race_Other or Multiple', 'race_White', 'sex_Male',
       'income_level_> $75,000', 'income_level_Below Poverty',
       'marital_status_Not Married', 'housing_status_Rent',
       'employment_Not in Labor Force', 'employment_Unemployed',
       'census_msa_MSA, Principle City', 'census_msa_Non-MSA'],
      dtype='object')

In [121]:
## Index column is not required.  So, let us drop index column
# errors='ignore' keeps this cell safe to re-run: once the helper column is
# gone, a second execution is a no-op instead of raising KeyError.
com_df = com_df.drop(columns='index', errors='ignore')

#### Building the model with categorical columns

In [122]:
# Target vector and feature matrix for the combined (numeric + dummy) model
y = com_df['target']
X = com_df.drop(columns=['target'])

In [123]:
y.value_counts()

0    15128
1     4514
Name: target, dtype: int64

In [124]:
## Stratified 80/20 train/test split of the combined data
# NOTE(review): this split uses random_state=42, whereas the earlier splits in
# this notebook use random_state=1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [125]:
X_train.shape, X_test.shape

((15713, 33), (3929, 33))

In [126]:
# Separate estimator for the combined numeric + categorical feature set
com_model = LogisticRegression()

In [127]:
# Fit the combined model on the (unscaled) training split
com_model.fit(X_train, y_train)

In [128]:
# Predicted class labels of the combined model for the test split
com_pred = com_model.predict(X_test)

In [129]:
# Test-set accuracy of the combined model as a percentage. The built-in
# round() works whether accuracy_score returns a NumPy scalar or a plain
# Python float (a plain float has no .round method).
print("The Combined Model Performance is:",round(accuracy_score(y_test, com_pred) * 100, 2),'%')

The Combined Model Performance is: 84.25 %


In [130]:
print(confusion_matrix(y_test, com_pred))

[[2847  179]
 [ 440  463]]


In [131]:
print(classification_report(y_test, com_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3026
           1       0.72      0.51      0.60       903

    accuracy                           0.84      3929
   macro avg       0.79      0.73      0.75      3929
weighted avg       0.83      0.84      0.83      3929



In [132]:
com_df.head()

Unnamed: 0,target,h1n1_awareness,bought_face_mask,avoid_large_gatherings,dr_recc_h1n1_vacc,dr_recc_seasonal_vacc,chronic_medic_condition,cont_child_undr_6_mnths,is_health_worker,has_health_insur,...,race_White,sex_Male,"income_level_> $75,000",income_level_Below Poverty,marital_status_Not Married,housing_status_Rent,employment_Not in Labor Force,employment_Unemployed,"census_msa_MSA, Principle City",census_msa_Non-MSA
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,1,1,0,1,0,0,1
1,0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,1,0,1,1,1,0,0,0,0
2,0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1,0,0,1,1,1,1,0,1,0
3,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,1,0


### Building the model with scaled data

In [133]:
from sklearn.preprocessing import StandardScaler
# Scaler instance that is fitted on the training split in the next cell
sc = StandardScaler()

In [134]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split so no test-set statistics enter the fit.
X_tr_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [135]:
# Separate estimator for the scaled feature set
sc_model = LogisticRegression()

In [136]:
# Fit on the scaled training features and predict on the scaled test features
sc_model.fit(X_tr_scaled, y_train)
sc_pred = sc_model.predict(X_test_scaled)

In [137]:
# Test-set accuracy of the model trained on scaled features (sc_pred); the
# printed label is kept exactly as before. The built-in round() works whether
# accuracy_score returns a NumPy scalar or a plain Python float (a plain float
# has no .round method).
print("The Combined Model Performance is:",round(accuracy_score(y_test, sc_pred) * 100, 2),'%')

The Combined Model Performance is: 84.3 %


In [138]:
print(confusion_matrix(y_test, sc_pred))

[[2849  177]
 [ 440  463]]


In [139]:
print(classification_report(y_test, sc_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3026
           1       0.72      0.51      0.60       903

    accuracy                           0.84      3929
   macro avg       0.79      0.73      0.75      3929
weighted avg       0.83      0.84      0.83      3929



In [140]:
df.head()

Unnamed: 0,unique_id,h1n1_worry,h1n1_awareness,antiviral_medication,contact_avoidance,bought_face_mask,wash_hands_frequently,avoid_large_gatherings,reduced_outside_home_cont,avoid_touch_face,...,race,sex,income_level,marital_status,housing_status,employment,census_msa,no_of_adults,no_of_children,h1n1_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,Non-MSA,0.0,0.0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,White,Male,Below Poverty,Not Married,Rent,Employed,"MSA, Not Principle City",0.0,0.0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,"MSA, Not Principle City",2.0,0.0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Principle City",0.0,0.0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,0.0,0


In [141]:
df.columns

Index(['unique_id', 'h1n1_worry', 'h1n1_awareness', 'antiviral_medication',
       'contact_avoidance', 'bought_face_mask', 'wash_hands_frequently',
       'avoid_large_gatherings', 'reduced_outside_home_cont',
       'avoid_touch_face', 'dr_recc_h1n1_vacc', 'dr_recc_seasonal_vacc',
       'chronic_medic_condition', 'cont_child_undr_6_mnths',
       'is_health_worker', 'has_health_insur', 'is_h1n1_vacc_effective',
       'is_h1n1_risky', 'sick_from_h1n1_vacc', 'is_seas_vacc_effective',
       'is_seas_risky', 'sick_from_seas_vacc', 'age_bracket', 'qualification',
       'race', 'sex', 'income_level', 'marital_status', 'housing_status',
       'employment', 'census_msa', 'no_of_adults', 'no_of_children',
       'h1n1_vaccine'],
      dtype='object')

In [142]:
# Print the level frequencies of every categorical column of the raw frame
for col_name, col_values in df[cat_cols].items():
    level_counts = col_values.value_counts()
    print(f"{col_name} \n")
    print(f"{level_counts} \n\n")

age_bracket 

65+ Years        6843
55 - 64 Years    5563
45 - 54 Years    5238
18 - 34 Years    5215
35 - 44 Years    3848
Name: age_bracket, dtype: int64 


qualification 

College Graduate    10097
Some College         7043
12 Years             5797
< 12 Years           2363
Name: qualification, dtype: int64 


race 

White                21222
Black                 2118
Hispanic              1755
Other or Multiple     1612
Name: race, dtype: int64 


sex 

Female    15858
Male      10849
Name: sex, dtype: int64 


income_level 

<= $75,000, Above Poverty    12777
> $75,000                     6810
Below Poverty                 2697
Name: income_level, dtype: int64 


marital_status 

Married        13555
Not Married    11744
Name: marital_status, dtype: int64 


housing_status 

Own     18736
Rent     5929
Name: housing_status, dtype: int64 


employment 

Employed              13560
Not in Labor Force    10231
Unemployed             1453
Name: employment, dtype: int64 


census_ms