In [3]:
# imports a library 'pandas', names it as 'pd'

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# enables inline plots, without it plots do not show up in the notebook.
%matplotlib inline


In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 3)

In [5]:
# download the data and name the columns
cols = ['age', 'workspace', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'ethnicity',
        'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
       'country_of_origin', 'income']

df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                       names = cols)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                  32561 non-null int64
workspace            32561 non-null object
fnlwgt               32561 non-null int64
education            32561 non-null object
education_num        32561 non-null int64
marital_status       32561 non-null object
occupation           32561 non-null object
relationship         32561 non-null object
ethnicity            32561 non-null object
gender               32561 non-null object
capital_gain         32561 non-null int64
capital_loss         32561 non-null int64
hours_per_week       32561 non-null int64
country_of_origin    32561 non-null object
income               32561 non-null object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB


In [7]:
# to view the first 5 or specify with ex: .head(10)
df.head()

Unnamed: 0,age,workspace,fnlwgt,education,education_num,marital_status,occupation,relationship,ethnicity,gender,capital_gain,capital_loss,hours_per_week,country_of_origin,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
# there's a space before each string in this data
df.education.unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [10]:
for col in df:
    if df[col].dtype == 'O':
        df[col] = df[col].map(lambda x:x.strip(' '))

In [11]:
# Here's a beak down of what that for loop is doing

In [12]:
# loops through df and get column names
for col in df:
    print col

age
workspace
fnlwgt
education
education_num
marital_status
occupation
relationship
ethnicity
gender
capital_gain
capital_loss
hours_per_week
country_of_origin
income


In [13]:
# gets the column type
df.education.dtype

dtype('O')

In [14]:
# strip function
x = ' string'
x.strip(' ')

'string'

In [15]:
strip_string = lambda x: x.strip(' ')
strip_string(' string')

'string'

In [18]:
# same as this
def strip_string2(x):
    x.strip(' ')
    return x

In [19]:
strip_string2(' string')

' string'

In [21]:
# map applies the function to each item in the data frame column so
df[col].map(lambda x: x.strip(' '))

# does the same thing as 
df['workspace'].map(strip_string2)

# but in the first case we don't have to define and name a functino

0               State-gov
1        Self-emp-not-inc
2                 Private
3                 Private
4                 Private
5                 Private
6                 Private
7        Self-emp-not-inc
8                 Private
9                 Private
               ...       
32551             Private
32552             Private
32553             Private
32554             Private
32555             Private
32556             Private
32557             Private
32558             Private
32559             Private
32560        Self-emp-inc
Name: workspace, dtype: object

In [23]:
df.education.value_counts()

HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

In [24]:
df.hours_per_week.mean()

40.437455852092995

In [25]:
df[['age', 'capital_gain', 'capital_loss', 'hours_per_week']].describe()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0
mean,38.582,1077.649,87.304,40.437
std,13.64,7385.292,402.96,12.347
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [26]:
df[df.age.isnull()]

Unnamed: 0,age,workspace,fnlwgt,education,education_num,marital_status,occupation,relationship,ethnicity,gender,capital_gain,capital_loss,hours_per_week,country_of_origin,income


In [27]:
df_no_nulls = df[df.age.notnull()]

In [28]:
null_df = pd.DataFrame([1,2,4,np.nan], columns = ['column1'])

In [29]:
null_df

Unnamed: 0,column1
0,1.0
1,2.0
2,4.0
3,
