In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from memory_profiler import profile

import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV


%matplotlib inline

  from pandas import MultiIndex, Int64Index


In [9]:
# Read the training split of the income dataset (path is relative to the
# notebook's working directory) and preview the first five rows.
df_train = pd.read_csv('incomes_datasets/train.csv')

df_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [11]:
# Show each column's dtype and non-null count to spot columns with missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43957 entries, 0 to 43956
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              43957 non-null  int64 
 1   workclass        41459 non-null  object
 2   fnlwgt           43957 non-null  int64 
 3   education        43957 non-null  object
 4   educational-num  43957 non-null  int64 
 5   marital-status   43957 non-null  object
 6   occupation       41451 non-null  object
 7   relationship     43957 non-null  object
 8   race             43957 non-null  object
 9   gender           43957 non-null  object
 10  capital-gain     43957 non-null  int64 
 11  capital-loss     43957 non-null  int64 
 12  hours-per-week   43957 non-null  int64 
 13  native-country   43194 non-null  object
 14  income_>50K      43957 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 5.0+ MB


In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_train.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income_>50K
count,43957.0,43957.0,43957.0,43957.0,43957.0,43957.0,43957.0
mean,38.617149,189673.0,10.074118,1093.559797,88.246491,40.407694,0.239279
std,13.734401,105821.5,2.575092,7570.536063,404.58841,12.400303,0.426648
min,17.0,13492.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117496.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178100.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237671.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0,1.0


: 

In [4]:
# convert the non-numeric dataframe columns to integer codes
def columns_encoder(dataframe):
    """Label-encode every non-numeric column of ``dataframe`` in place.

    Each non-numeric column is replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on that column alone. Numeric columns of
    any width (int32, int64, float64, ...) are left untouched.

    Args:
        dataframe: pandas DataFrame to encode. It is modified in place.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    # Select by "not numeric" rather than "not int64": comparing against
    # int64 only would also label-encode float/int32 features and replace
    # their real values with arbitrary category ids.
    categorical_columns = dataframe.select_dtypes(exclude="number").columns

    for column in categorical_columns:
        # One encoder per column: the codes are only meaningful within it.
        dataframe[column] = LabelEncoder().fit_transform(dataframe[column])
    return dataframe

# normalize function
def columns_normalize(dataframe, type):
    """Rescale every column of ``dataframe`` in place.

    Args:
        dataframe: pandas DataFrame whose columns are all numeric. It is
            modified in place.
        type: Scaling method. ``'absmax'`` divides each column by its
            maximum absolute value; ``'minmax'`` maps each column to
            ``(x - min) / (max - min)``. The name shadows the builtin but is
            kept because callers pass it by keyword.

    Returns:
        The same ``dataframe`` object that was passed in.

    Raises:
        ValueError: If ``type`` is not ``'absmax'`` or ``'minmax'``.
    """
    if type == 'absmax':
        for c in dataframe.columns:
            scale = dataframe[c].abs().max()
            if scale == 0:
                # All-zero column: dividing would give 0/0 = NaN for every
                # row, so keep it as (float) zeros instead.
                dataframe[c] = dataframe[c] * 0.0
            else:
                dataframe[c] = dataframe[c] / scale
    elif type == 'minmax':
        for c in dataframe.columns:
            low = dataframe[c].min()
            span = dataframe[c].max() - low
            if span == 0:
                # Constant column: there is no range to scale by, so map it
                # to 0.0 rather than producing NaN from 0/0.
                dataframe[c] = (dataframe[c] - low) * 0.0
            else:
                dataframe[c] = (dataframe[c] - low) / span
    else:
        # A misspelled method used to fall through and return the frame
        # unscaled without any signal; fail loudly instead.
        raise ValueError(
            "unknown normalization type %r; expected 'absmax' or 'minmax'" % (type,)
        )
    return dataframe

In [5]:
# Encode df_train's non-int64 columns as integer labels. The function assigns
# back into the frame it receives, so df_train itself is changed by this call;
# the returned frame is only displayed, not rebound.
columns_encoder(dataframe=df_train)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,3,366425,10,16,0,3,1,4,1,99999,0,60,38,1
1,17,3,244602,2,8,4,7,3,4,1,0,0,15,38,0
2,31,3,174201,9,13,2,3,0,4,1,0,0,40,38,1
3,58,6,110199,5,4,2,13,0,4,1,0,0,40,38,0
4,25,6,149248,15,10,4,7,1,2,1,0,0,40,38,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43952,52,3,68982,9,13,2,3,0,4,1,0,0,50,38,1
43953,19,3,116562,11,9,4,7,3,4,0,0,0,40,38,0
43954,30,3,197947,15,10,0,11,1,4,1,0,0,58,38,0
43955,46,3,97883,9,13,4,11,1,4,0,0,0,35,38,0


In [6]:
# Scale every column of df_train by its maximum absolute value. The function
# writes into the frame it receives, so df_train is changed by this call.
# Every column is rescaled, including the income_>50K target.
columns_normalize(dataframe = df_train,type='absmax')

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,0.744444,0.375,0.245857,0.666667,1.0000,0.000000,0.214286,0.2,1.0,1.0,1.0,0.0,0.606061,0.926829,1.0
1,0.188889,0.375,0.164118,0.133333,0.5000,0.666667,0.500000,0.6,1.0,1.0,0.0,0.0,0.151515,0.926829,0.0
2,0.344444,0.375,0.116882,0.600000,0.8125,0.333333,0.214286,0.0,1.0,1.0,0.0,0.0,0.404040,0.926829,1.0
3,0.644444,0.750,0.073939,0.333333,0.2500,0.333333,0.928571,0.0,1.0,1.0,0.0,0.0,0.404040,0.926829,0.0
4,0.277778,0.750,0.100140,1.000000,0.6250,0.666667,0.500000,0.2,0.5,1.0,0.0,0.0,0.404040,0.926829,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43952,0.577778,0.375,0.046284,0.600000,0.8125,0.333333,0.214286,0.0,1.0,1.0,0.0,0.0,0.505051,0.926829,1.0
43953,0.211111,0.375,0.078209,0.733333,0.5625,0.666667,0.500000,0.6,1.0,0.0,0.0,0.0,0.404040,0.926829,0.0
43954,0.333333,0.375,0.132815,1.000000,0.6250,0.000000,0.785714,0.2,1.0,1.0,0.0,0.0,0.585859,0.926829,0.0
43955,0.511111,0.375,0.065676,0.600000,0.8125,0.666667,0.785714,0.2,1.0,0.0,0.0,0.0,0.353535,0.926829,0.0


In [8]:
# Replace any NaN left in the frame with 0, then count remaining NaN per column.
# NOTE(review): this cell sits after the encoding and normalization cells;
# confirm whether any NaN can still be present at this point, or whether
# missing values should be handled before the columns are encoded.
df_train = df_train.fillna(0)
df_train.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income_>50K        0
dtype: int64

In [7]:
# Write the processed frame to a CSV file in the working directory.
# NOTE(review): to_csv also writes the row index as an extra leading column by
# default; confirm downstream readers expect it, otherwise pass index=False.
df_train.to_csv('income_train.csv')