In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Default figure size applied through seaborn's rc settings.
sns.set(rc={'figure.figsize': (15, 10)})
# Show all columns when a wide frame is displayed.
pd.set_option('display.max_columns', None)

In [4]:
# Load the input frame (presumably the output of an earlier outlier-handling step).
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the file was saved together with its index — consider index_col=0
# after confirming how the file was written.
dataset = pd.read_csv(filepath_or_buffer='Datasets/outlier_handled')

In [5]:
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income,capital_income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,40.0,United-States,<=50K,2174.0
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,32.5,United-States,<=50K,0.0
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,40.0,United-States,<=50K,0.0
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40.0,United-States,<=50K,0.0
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40.0,Cuba,<=50K,0.0


- `education` and `education_num` describe the same feature; `education_num` expresses the education level numerically
- so we can drop the `education` column

In [6]:
# Build the working frame without the textual 'education' column.
# DataFrame.drop returns a new object, so `dataset` itself is left unchanged.
df1 = dataset.drop(columns=['education'])

In [7]:
df1.workclass.unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [8]:
### Work class is treated as an ordinal feature

# Every category gets its own integer code (0-7).
# 'Self-emp-inc' used to share code 4 with 'Federal-gov', which merged the two
# categories after encoding and left code 5 unused; it now maps to 5.
workclass_encoding = {
    'Private': 7,
    'Self-emp-not-inc': 6,
    'Self-emp-inc': 5,
    'Federal-gov': 4,
    'Local-gov': 3,
    'State-gov': 2,
    'Without-pay': 1,
    'Never-worked': 0,
}

In [9]:
# Assign the encoded column back explicitly. Calling replace(..., inplace=True)
# on the df1['workclass'] selection is a chained in-place update: pandas
# deprecates it, and with Copy-on-Write enabled it leaves df1 unchanged.
df1['workclass'] = df1['workclass'].replace(workclass_encoding)

In [10]:
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income,capital_income
0,39.0,2,77516.0,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,40.0,United-States,<=50K,2174.0
1,50.0,6,83311.0,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,32.5,United-States,<=50K,0.0
2,38.0,7,215646.0,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,40.0,United-States,<=50K,0.0
3,53.0,7,234721.0,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40.0,United-States,<=50K,0.0
4,28.0,7,338409.0,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40.0,Cuba,<=50K,0.0


In [11]:
df1.marital_status.unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [14]:
# Codes are the positions of the categories in the list below (0-6).
marital_status_encoding = {
    status: code
    for code, status in enumerate([
        'Never-married',
        'Married-civ-spouse',
        'Divorced',
        'Married-spouse-absent',
        'Separated',
        'Married-AF-spouse',
        'Widowed',
    ])
}

In [15]:
# Write the encoded values back through an explicit assignment. An in-place
# replace on the df1['marital_status'] selection is a chained update, which is
# deprecated in pandas and does not modify df1 when Copy-on-Write is enabled.
df1['marital_status'] = df1['marital_status'].replace(marital_status_encoding)

In [30]:
df1['income'].unique()

array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

In [31]:
# Both spellings of each label (with and without the trailing '.') collapse
# into a single binary target: 0 for '<=50K', 1 for '>50K'.
income_handling = {'<=50K': 0, '<=50K.': 0, '>50K': 1, '>50K.': 1}

# Explicit assignment instead of replace(..., inplace=True) on the column
# selection: the chained in-place form is deprecated in pandas and leaves df1
# untouched when Copy-on-Write is enabled.
df1['income'] = df1['income'].replace(income_handling)


In [19]:
df1['relationship'].unique()

array(['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)

In [20]:
# Integer code for each relationship category (0-5).
relationship_encoding = {
    'Not-in-family': 0,
    'Husband': 1,
    'Wife': 2,
    'Own-child': 3,
    'Unmarried': 4,
    'Other-relative': 5,
}
# Assign the result back rather than using replace(..., inplace=True) on the
# column selection; that chained in-place form is deprecated in pandas and has
# no effect on df1 under Copy-on-Write.
df1['relationship'] = df1['relationship'].replace(relationship_encoding)

In [21]:
df1['race'].unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)

In [23]:
# Integer code for each race category (0-4).
race_encoding = {
    'White': 0,
    'Black': 1,
    'Asian-Pac-Islander': 2,
    'Amer-Indian-Eskimo': 3,
    'Other': 4,
}
# Explicit assignment: replace(..., inplace=True) on df1['race'] is a chained
# in-place update, deprecated in pandas and ineffective under Copy-on-Write.
df1['race'] = df1['race'].replace(race_encoding)

In [24]:
df1['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [25]:
# Binary code for the two sex categories.
sex_encoding = {"Male": 0, "Female": 1}
# Assign back instead of an in-place replace on the column selection, which is
# a deprecated chained update that does not reach df1 under Copy-on-Write.
df1['sex'] = df1['sex'].replace(sex_encoding)

In [32]:
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income,capital_income
0,39.0,2,77516.0,13.0,0,Adm-clerical,0,0,0,40.0,United-States,0,2174.0
1,50.0,6,83311.0,13.0,1,Exec-managerial,1,0,0,32.5,United-States,0,0.0
2,38.0,7,215646.0,9.0,2,Handlers-cleaners,0,0,0,40.0,United-States,0,0.0
3,53.0,7,234721.0,7.0,1,Handlers-cleaners,1,1,0,40.0,United-States,0,0.0
4,28.0,7,338409.0,13.0,1,Prof-specialty,2,1,1,40.0,Cuba,0,0.0


In [35]:
df1['occupation'].unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

In [36]:
# Integer code for each occupation category (1-14; note this mapping starts at 1).
occupation_encoding = {
    'Adm-clerical': 1,
    'Exec-managerial': 2,
    'Handlers-cleaners': 3,
    'Prof-specialty': 4,
    'Other-service': 5,
    'Sales': 6,
    'Craft-repair': 7,
    'Transport-moving': 8,
    'Farming-fishing': 9,
    'Machine-op-inspct': 10,
    'Tech-support': 11,
    'Protective-serv': 12,
    'Armed-Forces': 13,
    'Priv-house-serv': 14,
}

# Explicit assignment replaces the chained replace(..., inplace=True) call on
# the column selection, which pandas deprecates and which leaves df1 unchanged
# when Copy-on-Write is enabled.
df1['occupation'] = df1['occupation'].replace(occupation_encoding)

In [48]:
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income,capital_income
0,39.0,2,77516.0,13.0,0,1,0,0,0,40.0,United-States,0,2174.0
1,50.0,6,83311.0,13.0,1,2,1,0,0,32.5,United-States,0,0.0
2,38.0,7,215646.0,9.0,2,3,0,0,0,40.0,United-States,0,0.0
3,53.0,7,234721.0,7.0,1,3,1,1,0,40.0,United-States,0,0.0
4,28.0,7,338409.0,13.0,1,4,2,1,1,40.0,Cuba,0,0.0


In [49]:
df1.native_country.value_counts().sort_values(ascending=False).head(20)

United-States         44570
Mexico                  969
Philippines             305
Germany                 211
Puerto-Rico             188
Canada                  185
El-Salvador             160
India                   154
Cuba                    141
England                 129
China                   122
South                   117
Jamaica                 108
Italy                   107
Dominican-Republic      106
Japan                    94
Guatemala                90
Poland                   88
Vietnam                  88
Columbia                 86
Name: native_country, dtype: int64

In [52]:
# Labels of the ten most frequent native_country values, most frequent first.
top_10 = list(
    df1['native_country']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
)
top_10

['United-States',
 'Mexico',
 'Philippines',
 'Germany',
 'Puerto-Rico',
 'Canada',
 'El-Salvador',
 'India',
 'Cuba',
 'England']

In [59]:
# Add one 0/1 indicator column per country in top_10; rows whose country is
# not in the list get 0 in every indicator column.
for category in top_10:
    df1[f'native_country{category}'] = np.where(
        df1['native_country'].eq(category), 1, 0
    )


In [60]:
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income,capital_income,native_countryUnited-States,native_countryMexico,native_countryPhilippines,native_countryGermany,native_countryPuerto-Rico,native_countryCanada,native_countryEl-Salvador,native_countryIndia,native_countryCuba,native_countryEngland
0,39.0,2,77516.0,13.0,0,1,0,0,0,40.0,United-States,0,2174.0,1,0,0,0,0,0,0,0,0,0
1,50.0,6,83311.0,13.0,1,2,1,0,0,32.5,United-States,0,0.0,1,0,0,0,0,0,0,0,0,0
2,38.0,7,215646.0,9.0,2,3,0,0,0,40.0,United-States,0,0.0,1,0,0,0,0,0,0,0,0,0
3,53.0,7,234721.0,7.0,1,3,1,1,0,40.0,United-States,0,0.0,1,0,0,0,0,0,0,0,0,0
4,28.0,7,338409.0,13.0,1,4,2,1,1,40.0,Cuba,0,0.0,0,0,0,0,0,0,0,0,1,0


In [63]:
# Remove the raw country column now that its indicator columns exist.
df1 = df1.drop(columns=['native_country'])

In [64]:
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,income,capital_income,native_countryUnited-States,native_countryMexico,native_countryPhilippines,native_countryGermany,native_countryPuerto-Rico,native_countryCanada,native_countryEl-Salvador,native_countryIndia,native_countryCuba,native_countryEngland
0,39.0,2,77516.0,13.0,0,1,0,0,0,40.0,0,2174.0,1,0,0,0,0,0,0,0,0,0
1,50.0,6,83311.0,13.0,1,2,1,0,0,32.5,0,0.0,1,0,0,0,0,0,0,0,0,0
2,38.0,7,215646.0,9.0,2,3,0,0,0,40.0,0,0.0,1,0,0,0,0,0,0,0,0,0
3,53.0,7,234721.0,7.0,1,3,1,1,0,40.0,0,0.0,1,0,0,0,0,0,0,0,0,0
4,28.0,7,338409.0,13.0,1,4,2,1,1,40.0,0,0.0,0,0,0,0,0,0,0,0,1,0


In [65]:
# Persist the encoded frame; index=False keeps the row index out of the file.
df1.to_csv(path_or_buf='encoded_csv', index=False)