## Introduce the Data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw dataset. The '#NAME?' token is parsed as NaN, in addition to
# the markers pandas already treats as missing by default.
df = pd.read_csv(
    '/content/data.csv',
    na_values=['#NAME?'],
)

In [None]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
df['race'].unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', nan,
       'Other'], dtype=object)

In [None]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [None]:
df['income']

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
4995,<=50K
4996,>50K
4997,>50K
4998,<=50K


In [None]:
# Encode the target: 0 for '<=50K', 1 for every other value.
# The dtype guard keeps this cell safe to re-run: once the column holds
# integers, comparing them with the string '<=50K' would be False everywhere
# and a second pass would silently relabel every row as 1.
if not pd.api.types.is_numeric_dtype(df['income']):
    df['income'] = df['income'].ne('<=50K').astype('int64')

In [None]:
df['income']

Unnamed: 0,income
0,0
1,0
2,0
3,0
4,0
...,...
4995,0
4996,1
4997,1
4998,0


In [None]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States,0
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43.0,Private,222971.0,5th-6th,3.0,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,Mexico,0
4996,31.0,Private,259425.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,1
4997,47.0,Self-emp-inc,212120.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,1
4998,,Private,245880.0,HS-grad,9.0,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,60,United-States,0


In [None]:
# Separate the target vector from the feature matrix; `df` itself is left
# unchanged because `drop` returns a new frame.
y = df['income']
X = df.drop(columns='income')

In [None]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [None]:
y

Unnamed: 0,income
0,0
1,0
2,0
3,0
4,0
...,...
4995,0
4996,1
4997,1
4998,0


## Basic data cleaning

In [None]:
X.isnull().sum()

Unnamed: 0,0
age,48
workclass,0
fnlwgt,107
education,0
education_num,57
marital_status,0
occupation,0
relationship,0
race,264
sex,47


In [None]:
X['education_num'].head(10)

Unnamed: 0,education_num
0,13.0
1,13.0
2,9.0
3,7.0
4,13.0
5,14.0
6,5.0
7,9.0
8,14.0
9,13.0


In [None]:

X['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', '?',
       'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'],
      dtype=object)

In [None]:
len(X['education'].unique())

17

In [None]:
pd.get_dummies(X['education']).head(50)

Unnamed: 0,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,?,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
6,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
9,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False


In [None]:
# Report the cardinality of each object-typed (categorical) feature.
# NaN is counted as its own category, so a column with missing values
# reports one more level than it has real labels.
for col_name in X.columns:
    if X[col_name].dtypes != 'object':
        continue
    n_categories = X[col_name].nunique(dropna=False)
    print(f"Feature '{col_name}' has {n_categories} unique categories")

Feature 'workclass' has 8 unique categories
Feature 'education' has 17 unique categories
Feature 'marital_status' has 7 unique categories
Feature 'occupation' has 15 unique categories
Feature 'relationship' has 6 unique categories
Feature 'race' has 6 unique categories
Feature 'sex' has 3 unique categories
Feature 'native_country' has 40 unique categories


In [None]:
X['native_country'].value_counts().sort_values(ascending=False)

Unnamed: 0_level_0,count
native_country,Unnamed: 1_level_1
United-States,4465
Mexico,104
?,97
Canada,28
Philippines,22
Germany,22
El-Salvador,16
Puerto-Rico,16
England,16
China,15


In [None]:
# Collapse the long tail of countries into two levels: 'United-States' and
# 'Other' (missing values also land in 'Other').
# The kept label is written without a trailing space so the resulting dummy
# column is named exactly 'native_country_United-States', and so re-running
# this cell leaves the column unchanged instead of turning every row into
# 'Other'.
X['native_country'] = X['native_country'].where(
    X['native_country'] == 'United-States', 'Other'
)

In [None]:
X['native_country'].value_counts().sort_values(ascending=False)

Unnamed: 0_level_0,count
native_country,Unnamed: 1_level_1
United-States,4465
Other,535


In [None]:
# Categorical features that will be one-hot encoded before modeling.
todummy_list = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

In [None]:
def dummy_df(df, todummy_list):
    """One-hot encode the listed categorical columns of a DataFrame.

    Each column named in ``todummy_list`` is removed and replaced by its
    indicator columns, named ``<column>_<level>`` and appended on the right
    in the order the columns are listed. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    todummy_list : iterable of str
        Names of the columns to replace with indicator columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame (``df`` itself when ``todummy_list`` is empty).
    """
    encoded = df
    for column in todummy_list:
        indicator_cols = pd.get_dummies(encoded[column], prefix=column)
        # Swap the raw column for its indicators, keeping the indicators last.
        encoded = pd.concat([encoded.drop(columns=column), indicator_cols], axis=1)
    return encoded

In [None]:
X = dummy_df(X, todummy_list)

In [None]:
X.shape

(5000, 68)

In [None]:
X.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native_country_Other,native_country_United-States
0,39.0,77516.0,13.0,2174,0,40,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
1,50.0,83311.0,13.0,0,0,13,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
2,38.0,215646.0,9.0,0,0,40,False,False,False,True,...,False,False,False,False,False,True,False,True,False,True
3,53.0,234721.0,7.0,0,0,40,False,False,False,True,...,False,False,False,True,False,False,False,False,False,True
4,28.0,338409.0,13.0,0,0,40,False,False,False,True,...,True,False,False,True,False,False,True,False,True,False


In [None]:
len(X.columns)

68

In [None]:
X.columns

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'workclass_?', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'education_10th', 'education_11th',
       'education_12th', 'education_1st-4th', 'education_5th-6th',
       'education_7th-8th', 'education_9th', 'education_?',
       'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Preschool', 'education_Prof-school',
       'education_Some-college', 'marital_status_Divorced',
       'marital_status_Married-AF-spouse', 'marital_status_Married-civ-spouse',
       'marital_status_Married-spouse-absent', 'marital_status_Never-married',
       'marital_status_Separated', 'marital_status_Widowed', 'occupation_?',
       'occupation_Adm-clerica

###  Handling missing data

In [None]:
X.isnull().sum().sort_values(ascending=False).head()

Unnamed: 0,0
fnlwgt,107
education_num,57
age,48
native_country_Other,0
sex_Male,0


In [None]:
from sklearn.impute import SimpleImputer

# Fill every NaN with the median of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# The imputer returns a bare array, so the frame is rebuilt around it.
# Passing the original row index as well as the column labels keeps each row
# aligned with its entry in `y`; without it the frame would be renumbered
# 0..n-1 regardless of the index X had before.
X = pd.DataFrame(
    data=imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [None]:
X.isnull().sum().sort_values(ascending=False).head()

Unnamed: 0,0
age,0
occupation_Farming-fishing,0
occupation_Protective-serv,0
occupation_Prof-specialty,0
occupation_Priv-house-serv,0


In [None]:
X.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native_country_Other,native_country_United-States
0,39.0,77516.0,13.0,2174.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,50.0,83311.0,13.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,38.0,215646.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,53.0,234721.0,7.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,28.0,338409.0,13.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


## Feature Engineering

In [None]:
from itertools import combinations

# Show the raw column labels three ways, one per output line: as a pandas
# Index, as a plain list, and as every unordered pair of labels.
print(
    df.columns,
    list(df.columns),
    list(combinations(list(df.columns), 2)),
    sep="\n",
)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')
['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
[('age', 'workclass'), ('age', 'fnlwgt'), ('age', 'education'), ('age', 'education_num'), ('age', 'marital_status'), ('age', 'occupation'), ('age', 'relationship'), ('age', 'race'), ('age', 'sex'), ('age', 'capital_gain'), ('age', 'capital_loss'), ('age', 'hours_per_week'), ('age', 'native_country'), ('age', 'income'), ('workclass', 'fnlwgt'), ('workclass', 'education'), ('workclass', 'education_num'), ('workclass', 'marital_status'), ('workclass', 'occupation'), ('workclass', 'relationship'), ('workclass', 'race'), ('workclass', 'sex'), ('

In [None]:
# Use PolynomialFeatures in sklearn.preprocessing to create two-way interactions for all features
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    """Append every two-way interaction (pairwise product) to a feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with string column labels (the labels are joined with
        ``'_'`` to name the interaction columns).
        NOTE(review): assumed to be fully numeric with no missing values,
        since it is handed straight to ``PolynomialFeatures`` - confirm.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one ``<a>_<b>`` column per pair of
        input columns, with every column that is zero in all rows removed
        (this applies to original columns as well as interaction columns).
        The row index of ``df`` is preserved.
    """
    base_names = list(df.columns)
    # Relies on PolynomialFeatures emitting the input features first and then
    # the pairwise products in the same order as itertools.combinations.
    pair_names = ['_'.join(pair) for pair in combinations(base_names, 2)]

    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    expanded = pd.DataFrame(
        poly.fit_transform(df),
        columns=base_names + pair_names,
        index=df.index,  # keep rows aligned with the caller's target vector
    )

    # Select columns by position rather than by label: if a generated name
    # happens to duplicate another column's name, a label-based drop would
    # remove both columns even when only one of them is all zeros.
    all_zero = (expanded == 0).all().to_numpy()
    return expanded.loc[:, ~all_zero]

In [None]:
print(X.shape)

(5000, 68)


In [None]:
X = add_interactions(X)

In [None]:
X

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,race_Other_native_country_Other,race_Other_native_country_United-States,race_White_sex_Female,race_White_sex_Male,race_White_native_country_Other,race_White_native_country_United-States,sex_Female_native_country_Other,sex_Female_native_country_United-States,sex_Male_native_country_Other,sex_Male_native_country_United-States
0,39.0,77516.0,13.0,2174.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,50.0,83311.0,13.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,38.0,215646.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,53.0,234721.0,7.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28.0,338409.0,13.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43.0,222971.0,3.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4996,31.0,259425.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4997,47.0,212120.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4998,37.0,245880.0,9.0,0.0,0.0,60.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [None]:
print(df.shape)
print(X.shape)

(5000, 15)
(5000, 1737)


In [None]:
print(df.ndim)
print(X.ndim)

2
2
