In [None]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Pandas Review

## Set up Pandas default params

In [None]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline

In [None]:
# notebook-wide pandas display settings, applied one by one
display_options = {
    'display.max_columns': None,  # None = no limit on displayed columns
    'display.max_rows': 15,
    'display.precision': 3,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Build a Series from a plain list; np.nan is the one missing entry
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

s

## Load a data set -- read_*

### "Census Income" dataset

http://archive.ics.uci.edu/ml/

In [None]:
# column names to attach to the data (handed to read_csv through `names`)
cols = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'ethnicity',
    'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
    'country_of_origin', 'income',
]

# download the data straight from the UCI repository into a DataFrame
DATA_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(DATA_URL, names=cols)

**Q: What's happening in the above cell?**

## Viewing Data

* .info() 
* .head()
* .tail()
* .columns
* .values
* .dtypes

### info

Displays the Columns, Types, Rows and the memory used by the dataframe

In [None]:
# we can see there are no null values
# columns with numerical values are type int64, no need to set data type

df.info()

### Head

Displays the first few rows in the dataframe

In [None]:
# to view the first 5 or specify with ex: .head(10)
df.head()

### Tail

Displays the last few rows in the dataframe

In [None]:
df.tail()

### Sample

Displays a Sample of rows in the dataframe

In [None]:
# head and tail are good.  But sometimes we want to randomly sample data
df.sample(5, random_state=42)

### Columns

Returns the column labels of the dataframe (as a pandas `Index`)

In [None]:
# view all columns of the dataframe
df.columns

### Column Types

Returns the type of each column

In [None]:
df.dtypes

## Rename columns

In [None]:
df.columns

In [None]:
# give 'country_of_origin' a new name; rename() returns the renamed frame,
# which is assigned back to df
column_renames = {'country_of_origin' : 'native_country'}
df = df.rename(columns = column_renames)
df.head()

## Descriptives 

* .describe()
* .value_counts()
* .mean()
* .unique()

### Describe

Displays summary statistics for each numerical column

In [None]:
df.describe()

### value_counts

Counts the number of occurrences of each categorical value for the column

In [None]:
df.education.value_counts()

In [None]:
# horizontal bar chart of the education counts
# `kind` has to be given by keyword: Series.plot() rejects positional arguments on current pandas
df.education.value_counts().plot(kind = 'barh')

In [None]:
df.hours_per_week.mean()

In [None]:
df[['age', 'capital_gain', 'capital_loss', 'hours_per_week']].describe()

### Unique

Returns the unique values for the column

In [None]:
# there's a space before each string in this data
df.education.unique()

In [None]:
# looks like it's in every object column
df.workclass.unique()

In [None]:
df["education"] = df.education.str.strip()

In [None]:
# Hurray We removed the leading space
df.education.unique()

In [None]:
df.info()

#### Exercise - Now, can you remove the leading space for all string columns?

In [None]:
# Your Turn

## plotting with pandas

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html

In [None]:
df.age.hist(bins = 18);

In [None]:
# split the histograms by another column (with relatively few unique values)
df.hours_per_week.hist(by = df.income, sharey = True, bins = 25, figsize = (10,5));

In [None]:
# use value_counts() and a bar plot
df['workclass'].value_counts().plot(kind = 'barh')

In [None]:
df.boxplot(['age'], by = 'relationship', figsize=(10, 6));

In [None]:
# create the figure/axes explicitly, then let pandas draw the scatter onto that axes
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (10,5))
df.plot.scatter(x = 'age', y = 'hours_per_week', alpha = .25, ax = ax)

## Selecting rows and columns 

### .loc 

* Selects row and columns by Names
* **by label**             `.loc[]`

### .iloc

* Selects row and columns by Index Position
* **by integer position**  `.iloc[]`


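### .ix (removed)

* Used to redirect to `loc` or `iloc` based on input
* **for both**             `.ix[]` — deprecated in pandas 0.20 and removed in pandas 1.0; use `.loc[]` or `.iloc[]` explicitly instead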

http://pandas.pydata.org/pandas-docs/stable/indexing.html

In [None]:
# select the first three rows by position (0, 1 and 2 -- the stop position is excluded)
df.iloc[0:3]

In [None]:
# select a range of rows
df.iloc[10:15]

In [None]:
# last 2 rows
df.iloc[-2:]

In [None]:
# selecting every other row in columns 3-5
df.iloc[::2, 2:5].head()

In [None]:
# select rows by label: labels 0 through 3 (with loc the stop label is included)
df.loc[0:3]

In [None]:
df.loc[0:2, ['age', 'relationship']]

In [None]:
# .ix was removed in pandas 1.0; use .loc for label-based selection
# (row labels 0 through 2, both ends included, and two columns picked by name)
df.loc[0:2, ['age', 'relationship']]

## Selection - Exercise

In [None]:
# pd.DataFrame lets you turn series, arrays, lists, and more into data frame structures
# here: three rows of values, explicit row labels, explicit column labels

df_index = pd.DataFrame(
    data = [[1,2,3,4], [2,4,6,8], [3,5,7,9]],
    index = [11,13,12],
    columns = ['A', 'C', 'D', 'B'],
)

In [None]:
df_index

In [None]:
# iloc indexes by position, not by the labels in the index
df_index.iloc[0:1]

In [None]:
# with loc both the start and the stop are included
df_index.loc[11:12]

In [None]:
# select columns by position
df_index.iloc[:,0:1]

In [None]:
# or by label
df_index.loc[:,'A':'D']

In [None]:
# label-based column slice: 'A' through 'C', both ends included
# (this used to be written with .ix, which was removed in pandas 1.0)
df_index.loc[:,'A':'C']

In [None]:
# position-based column slice: the first two columns (stop position excluded)
# .ix used to fall back to positions for this input; with .ix removed, use .iloc explicitly
df_index.iloc[:,0:2]

## Filtering

In [None]:
df.age > 50

In [None]:
df[df.age > 50]

In [None]:
# Filter for only certain Columns
df.loc[df.age > 50, ['age', 'education', 'occupation', 'gender', 'income']]

## Now Filter on Gender

In [None]:
## Your Turn

## Now Filter on Gender and Age between 30 and 40

In [None]:
## Your Turn

## Sorting DataFrame

* **sort_index()** to sort by index
* **sort_values()** to sort by values

##### `sort()` has been removed from pandas; use `sort_index()` or `sort_values()` instead

In [None]:
# Sort by Age

## Pivot Table

* Excel Style Pivot Table

In [None]:
df.pivot_table?

In [None]:
# Pivot table: education on the rows, workclass on the columns,
# each cell holding the median hours per week (empty string where there is no data)
df.pivot_table(
    values = "hours_per_week",
    index = "education",
    columns = "workclass",
    aggfunc = 'median',
    fill_value = '',
)

## Groupby

In [None]:
# Ref: Python for Data Analysis by Wes McKinney

Image(filename='../images/wesm_book_groupby.png')

In [None]:
# How to groupby column and apply a function like sum, count, or mean
# numeric_only=True keeps the mean to the numeric columns; without it,
# pandas 2.0+ raises a TypeError when it reaches the string (object) columns
df.groupby(['education']).mean(numeric_only=True)

In [None]:
df.groupby(['education','age'])[['hours_per_week','capital_gain']].mean()

In [None]:
# To groupby multiple columns with multiple functions attached
df.groupby(['income', 'native_country']).age.agg(['count', 'mean'])
# grouped in order of which column is listed first

## Reset Index

In [None]:
# mean of the numeric columns per (income, gender) group;
# reset_index() moves the group keys out of the index and back into regular columns
(df
 .groupby(['income','gender'])
 .mean(numeric_only=True)  # without numeric_only, pandas 2.0+ raises on the string columns
 .reset_index()
)

## Combining Filter and Group

In [None]:
# Get Mean hours per week by Education where Country = 'United-States'
# (remember: string values in this data carry a leading space unless you stripped them earlier)
## Code goes here

## Combining it all together - Group, Aggregation, Sort

In [None]:
# groupby income and country, take the mean of the numeric columns for each group,
# then sort ALL the (income, country) groups by their mean age, youngest first
# (the sort is global -- it does not sort within each income block)
# numeric_only=True is needed on pandas 2.0+, where mean() otherwise raises on the string columns
df_grouped = df.groupby(['income','native_country']).mean(numeric_only=True).sort_values('age', ascending = True)
df_grouped

# Note: In this example, the groupby, mean, and sort functions are strung together in one line
# in the next example, we will show a different syntax so that you could write them on separate
# lines to make the code a little easier to read

In [None]:
# We want to group people by their income and country
# Then sort them by their income ascending, and then by average age within each income group descending
(df
 .groupby(['income','native_country'])
 .mean(numeric_only=True)  # without numeric_only, pandas 2.0+ raises on the string columns
 .reset_index()
 .sort_values(['income','age'], ascending=[True,False])
)

# Note: In this example, we sort by the SAME column which we grouped by earlier
# (eg. we first groupby 'income' and then sort by 'income')
# After the groupby, 'income' and 'native_country' live in the index rather than in regular columns;
# .reset_index() turns them back into ordinary columns, so 'income' can be used as a sort key
# right next to 'age'

## Dealing with Missing Data
* Find nulls
* Fill nulls
* Drop nulls

## Find Nulls

In [None]:
# as we saw with df.info() there are no nulls... 
# but if there were this would find the rows where age is null
df[df.age.isnull()]

## Fill nulls

In [None]:
# one-column frame whose last value is missing
null_df = pd.DataFrame({'column1': [1, 2, 4, np.nan]})

In [None]:
null_df

In [None]:
# you can also fill nulls with a value or string
null_df.column1.fillna(1000)

In [None]:
# fillna does not do it inplace unless you specify
null_df

In [None]:
# you can also fill null with the median or mean value of the column
null_df.fillna(null_df.column1.median())

In [None]:
null_df.fillna('random_string')

## Drop nulls

In [None]:
# start again from a fresh one-column frame with a single missing value
null_df = pd.DataFrame(data = {'column1': [1, 2, 4, np.nan]})

In [None]:
null_df

In [None]:
null_df.dropna(how = 'any')

In [None]:
# .isnull() and .notnull() do opposite things
null_df.isnull()

In [None]:
null_df.notnull()

## Practice Exercises

In [None]:
# how many males and females are in this data set

In [None]:
# plot the total number of people in each occupation

In [None]:
# what is the lowest average age of an occupation

In [None]:
# create a boxplot of hours per week by education level

In [None]:
# create a new column for income where >50K = 1 and <=50K = 0

# hint... http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.replace.html

In [None]:
# find which "native_country" has the highest percent of people earning >50K

In [None]:
# visualize what percent of people earn over 50K by education level