In [1]:
import pandas as pd

In [36]:
columns = ['age', 'workClass', 'fnlwgt', 'education', 'education-num','marital-status', 'occupation', 'relationship',
          'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

cat_columns = ['workClass', 'education', 'marital-status', 'occupation', 
        'relationship', 'race', 'sex', 'native-country', 'income']
numeric_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

<h4> Load data and clean up </h4>

In [37]:
train_data = pd.read_csv('data/adult.data', names=columns, sep=' *, *', na_values='?')
# test data has NAN on first row so skip the first row
test_data  = pd.read_csv('data/adult.test', names=columns, skiprows=1, sep=' *, *', na_values='?')

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


So there are 32561 samples with both categorial and numerical columns. workClass, occupation and native-country has missing values

In [38]:
data = pd.concat([train_data, test_data], axis=0)
data.isnull().sum()

age                  0
workClass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

In [52]:
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(data[['workClass','occupation','native-country' ]].values)
imputed_cat_data = imputer.transform(data[['workClass','occupation','native-country']].values)
imputed_cat_data.shape

(48842, 3)

In [75]:
pd.options.mode.chained_assignment = None
data['workClass'] = imputed_cat_data[:,0]
data['occupation'] = imputed_cat_data[:,1]
data['native-country'] = imputed_cat_data[:,2]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 16280
Data columns (total 15 columns):
age               48842 non-null int64
workClass         48842 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        48842 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null int64
capital-loss      48842 non-null int64
hours-per-week    48842 non-null int64
native-country    48842 non-null object
income            48842 non-null object
dtypes: int64(6), object(9)
memory usage: 6.0+ MB


In [44]:
imputed_df = pd.DataFrame(imputed_cat_data)
imputed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 9 columns):
0    48842 non-null object
1    48842 non-null object
2    48842 non-null object
3    48842 non-null object
4    48842 non-null object
5    48842 non-null object
6    48842 non-null object
7    48842 non-null object
8    48842 non-null object
dtypes: object(9)
memory usage: 3.4+ MB


In [None]:
imputed_cat_data.r

In [30]:
data.set

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [60]:
data['income'] = data['income'].apply(lambda x: 0 if x == '<=50K' else 1)

Unnamed: 0,age,fnlwgt,education,education-num,marital-status,relationship,race,sex,capital-gain,capital-loss,hours-per-week,income
0,39,77516,Bachelors,13,Never-married,Not-in-family,White,Male,2174,0,40,0
1,50,83311,Bachelors,13,Married-civ-spouse,Husband,White,Male,0,0,13,0
2,38,215646,HS-grad,9,Divorced,Not-in-family,White,Male,0,0,40,0
3,53,234721,11th,7,Married-civ-spouse,Husband,Black,Male,0,0,40,0
4,28,338409,Bachelors,13,Married-civ-spouse,Wife,Black,Female,0,0,40,0


In [56]:

#replace all nan with mean
#data.fillna(data.mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age               48842 non-null object
workClass         48842 non-null object
fnlwgt            48842 non-null object
education         48842 non-null object
education-num     48842 non-null object
marital-status    48842 non-null object
occupation        48842 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null object
capital-loss      48842 non-null object
hours-per-week    48842 non-null object
native-country    48842 non-null object
income            48842 non-null object
dtypes: object(15)
memory usage: 5.6+ MB


In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 16280
Data columns (total 15 columns):
age               48842 non-null int64
workClass         46043 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        46033 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null int64
capital-loss      48842 non-null int64
hours-per-week    48842 non-null int64
native-country    47985 non-null object
income            48842 non-null int64
dtypes: int64(7), object(8)
memory usage: 6.0+ MB


In [42]:
data.isnull().sum()

age                  0
workClass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

In [46]:
from sklearn.impute import SimpleImputer
import numpy as np
imr = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data['workClass'] = imr.fit_transform(data['workClass'])

ValueError: Expected 2D array, got 1D array instead:
array=['State-gov' 'Self-emp-not-inc' 'Private' ... 'Private' 'Private'
 'Self-emp-inc'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [45]:
imputed_data.isnull().sum()

AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [39]:


def convertCatColumn(df, col):
    df = pd.concat([df, pd.get_dummies(df[col],prefix=col,prefix_sep=':')], axis=1)
    df.drop(col,axis=1,inplace=True)
    return df

df = convertCatColumn(train_data, 'workClass')
df.head()

Unnamed: 0,age,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,...,income,workClass: ?,workClass: Federal-gov,workClass: Local-gov,workClass: Never-worked,workClass: Private,workClass: Self-emp-inc,workClass: Self-emp-not-inc,workClass: State-gov,workClass: Without-pay
0,39,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,...,<=50K,0,0,0,0,0,0,0,1,0
1,50,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,...,<=50K,0,0,0,0,0,0,1,0,0
2,38,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,...,<=50K,0,0,0,0,1,0,0,0,0
3,53,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,...,<=50K,0,0,0,0,1,0,0,0,0
4,28,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,...,<=50K,0,0,0,0,1,0,0,0,0
