In [1]:
#initial data exploration

import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
from time import time
%matplotlib inline

# Load the records. `DataFrame.from_csv` was deprecated in pandas 0.21 and removed
# in 1.0; `read_csv(index_col=0, parse_dates=True)` is the documented replacement
# and reproduces its defaults (first column becomes the index, date-like index
# values are parsed).
records = pd.read_csv('records.csv', index_col=0, parse_dates=True)

# Split on the binary target so each income class can be inspected separately.
over_50 = records.loc[records['over_50k'] == 1]   # all rows where income > $50k
under_50 = records.loc[records['over_50k'] == 0]  # all rows where income <= $50k

# some general information about the dataset
# The denominator counts only rows labelled 0 or 1, so the two shares sum to 100%.
n_labelled = over_50.shape[0] + under_50.shape[0]
for label, subset in (('Earning >$50k: ', over_50), ('Earning <=$50k: ', under_50)):
    share = 100.0 * subset.shape[0] / n_labelled
    print(label + str(subset.shape[0]) + ' individuals (' + ('%.2f' % share) + '%)')

print('\nSummary of continuous variables:\n')
# summary statistics for each continuous variable (the target column is excluded)
print(records.drop(['over_50k'], axis=1).describe())

  if __name__ == '__main__':


Earning >$50k: 11687 individuals (23.93%)
Earning <=$50k: 37155 individuals (76.07%)

Summary of continuous variables:

                age  education_num  capital_gain  capital_loss    hours_week
count  48842.000000   48842.000000  48842.000000  48842.000000  48842.000000
mean      38.643585      10.078089   1079.067626     87.502314     40.422382
std       13.710510       2.570973   7452.019058    403.004552     12.391444
min       17.000000       1.000000      0.000000      0.000000      1.000000
25%       28.000000       9.000000      0.000000      0.000000     40.000000
50%       37.000000      10.000000      0.000000      0.000000     40.000000
75%       48.000000      12.000000      0.000000      0.000000     45.000000
max       90.000000      16.000000  99999.000000   4356.000000     99.000000


Roughly 24% of the dataset is made up of individuals earning more than \$50,000, and roughly 76% is made up of individuals earning \$50,000 or less. The means, standard deviations, and quartiles of the capital gain and capital loss variables suggest that both are heavily skewed (at least 75% of their values are zero, while their maxima are in the thousands), but I will verify this below. At least half the individuals in the dataset work 40-45 hours per week, so I doubt over_50k will be strongly correlated with hours worked.

In [2]:
# observations re: capital gain/loss
print('Skewness of continuous variables:\n')
# numeric_only=True makes the restriction to numeric columns explicit. Recent pandas
# versions (>= 2.0) no longer drop non-numeric columns silently and raise instead.
print(records.skew(numeric_only=True))

# Rows where BOTH capital gain and capital loss are non-zero; a count of zero means
# the two variables never occur together and can be merged into one net value.
both_nonzero = (records['capital_gain'] > 0) & (records['capital_loss'] > 0)
print('\nNumber of instances of both non-zero capital gain and capital loss: ' + str(int(both_nonzero.sum())))

Skewness of continuous variables:

age               0.557580
education_num    -0.316525
capital_gain     11.894659
capital_loss      4.569809
hours_week        0.238750
over_50k          1.222216
dtype: float64

Number of instances of both non-zero capital gain and capital loss: 0


Capital gain and capital loss are heavily skewed, whereas the other continuous variables are not particularly skewed. They are also mutually exclusive: there are zero instances in which both capital gain and capital loss are non-zero. For this reason, I think it would be a good idea to consolidate the two variables into a single variable representing the net capital gain/loss.

In [3]:
# create net capital gain/loss variable, then drop the two source columns
# (we don't need them anymore)
records['delta_capital'] = records['capital_gain'] - records['capital_loss']
records = records.drop(['capital_gain', 'capital_loss'], axis=1)

# now over_50k is no longer the rightmost column, so move it back to the end.
# Selecting it by name (rather than swapping hard-coded positions 11 and 12) keeps
# this correct even if the CSV gains, loses or reorders columns.
cols = [c for c in records.columns if c != 'over_50k'] + ['over_50k']
records = records.loc[:, cols]

print('\nSkewness of continuous variables:\n')
# numeric_only=True: skip non-numeric columns explicitly (required on pandas >= 2.0)
print(records.skew(numeric_only=True))



Skewness of continuous variables:

age               0.557580
education_num    -0.316525
hours_week        0.238750
delta_capital    11.814939
over_50k          1.222216
dtype: float64


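Predictably, the delta_capital variable is heavily skewed. I'd like to use a logarithmic transformation so that particularly high or low values don't interfere with training the model. Because delta_capital is negative for individuals with a net capital loss, and the logarithm is undefined for negative numbers, the transformation has to be applied to the magnitude with the sign restored afterwards, e.g. sign(x)·log(1 + |x|).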

In [92]:
# Signed log transform of the net capital change: sign(x) * log(1 + |x|).
# A plain np.log(x + 1) yields NaN for x < -1 and -inf for x == -1, and
# delta_capital is negative for every net capital loss. For x >= 0 this is
# identical to log(x + 1). Vectorised, so no per-element lambda is needed.
# NOTE: this overwrites the column in place, so run the cell exactly once per
# kernel session; re-running it would apply the transform a second time.
records['delta_capital'] = np.sign(records['delta_capital']) * np.log1p(records['delta_capital'].abs())

In [4]:
# Summary statistics for every numeric column of the consolidated frame.
# NOTE(review): the saved output for this cell shows delta_capital with min -4356
# and max 99999, i.e. untransformed values, so it appears to have been produced
# before the log-transform cell was run. Re-run top to bottom to refresh it.
numeric_summary = records.describe()
print(numeric_summary)

                age  education_num    hours_week  delta_capital      over_50k
count  48842.000000   48842.000000  48842.000000   48842.000000  48842.000000
mean      38.643585      10.078089     40.422382     991.565313      0.239282
std       13.710510       2.570973     12.391444    7475.549906      0.426649
min       17.000000       1.000000      1.000000   -4356.000000      0.000000
25%       28.000000       9.000000     40.000000       0.000000      0.000000
50%       37.000000      10.000000     40.000000       0.000000      0.000000
75%       48.000000      12.000000     45.000000       0.000000      0.000000
max       90.000000      16.000000     99.000000   99999.000000      1.000000
