# Pandas Intro


In [181]:
# Load the two core libraries first, then report which versions are in use.
import numpy as np
import pandas as pd

for label, module in (("numpy version", np), ("pandas version", pd)):
    print(label, module.__version__)

numpy version 1.14.0
pandas version 0.22.0


## Fundamental Classes

### Series

A Series is roughly a single variable (column) of a data frame: a one-dimensional array of values together with an index.

In [None]:
# A Series built from a plain list; no index is given, so pandas supplies one.
s = pd.Series([1, 2, 5, 6])
# Show the Series itself, then its two components: the values and the index.
for piece in (s, s.values, s.index):
    print(piece)

In [None]:
# State populations in millions, labelled by state abbreviation.
state_codes = ['ca', 'tx', 'ny', 'fl']
millions = [38, 26, 19, 19]
pop = pd.Series(millions, index=state_codes)
print(pop, '\n')

# Elements are looked up by their index label.
tx_millions = pop['tx']
print('TX population:\n', tx_millions, '\n')

# Convert millions to a head count.
ny_people = pop['ny'] * 1000000
print(ny_people, '\n')

### DataFrame

The central data structure in pandas. It holds rectangular data:
rows are observations, columns are variables.

Each variable (column) is stored as a Series.

In [None]:
## create from a dict that maps column names to lists (or series):
df = {
    'ca': [35, 37, 38],
    'tx': [23, 24, 26],
    'md': [5, 5, 6],
}
# First without an explicit index ...
pop = pd.DataFrame(data=df)
print('population:\n', pop, '\n')
# ... then the same data with the rows labelled by year.
pop = pd.DataFrame(data=df, index=[2010, 2012, 2014])
print('population:\n', pop, '\n')

In [None]:
## read from file
# note: the .bz2 file is decompressed automatically by read_csv
titanic = pd.read_csv("../data/titanic.csv.bz2")
# Peek at both ends of the table.
first_rows = titanic.head(3)
last_rows = titanic.tail(3)
print("First 3 obs:\n", first_rows, '\n')
print('Last 3 obs:\n', last_rows, '\n')

## Indexing dataframes

In [None]:
## Select variables
# One column, addressed by key ...
print(titanic["name"].head(10), '\n')
# ... or, equivalently, as an attribute.
print(titanic.name.head(10), '\n')
# A list of column names selects several columns at once.
print(titanic[["name", "survived"]].head(10), '\n')

In [None]:
## select by logical operations
# Indexing with a boolean Series keeps only the rows where it is True.
first_class = titanic.pclass == 1
print('Enemies of the people:\n', titanic[first_class].head(), '\n')
## note: Python's 'and' is not a ufunc, so it cannot combine two Series
## element-wise; use np.logical_and (or the '&' operator) instead.
third_class_men = np.logical_and(titanic.pclass == 3,
                                 titanic.sex == 'male')
print('Male losers:\n', titanic[third_class_men].head(), '\n')

In [None]:
## First select a subset: passengers younger than 20
under_20 = titanic.age < 20
young = titanic[under_20]
## Note the indices in the output below
young.head()

In [None]:
## iloc: select rows by integer row number
# A single row number gives that row as a Series ...
fifth_row = young.iloc[4]
print(fifth_row, '\n')
# ... while a list of row numbers gives a DataFrame.
print('as df:\n', young.iloc[[4]], '\n')
# Slices work on row numbers as well.
print(young.iloc[0:3], '\n')
## select rows and columns
young.iloc[4, [3, 4]]

In [None]:
## loc: select by index
# A single index value gives a Series, a list of index values a DataFrame.
print("as series:\n", young.loc[27], '\n')
print("as DF:\n", young.loc[[27]], '\n')
wanted_rows = [11, 53]
print(young.loc[wanted_rows], '\n')
## select rows and columns
wanted_cols = ['pclass', 'survived']
print(young.loc[wanted_rows, wanted_cols], '\n')

## Statistics and data description

In [None]:
## compute means of data:
# numeric_only=True asks explicitly for the numeric columns only.  Without
# it, recent pandas versions raise a TypeError on the string column 'name'
# instead of silently dropping it.
print("Mean values:\n",
      titanic[['pclass', 'name', 'survived', 'age']].mean(numeric_only=True),
      '\n')
# note: 'name' removed
## find unique values
print("possible genders:\n",  titanic.sex.unique(), '\n')
# count() tallies the non-missing entries; isnull().sum() the missing ones
print("valid values:\n", titanic.age.count(), '\n')
print("missings:\n", titanic.age.isnull().sum(), '\n')

In [None]:
# Work with a subset of columns only.
numeric_cols = ['pclass', 'survived', 'age', 'fare']
tn = titanic[numeric_cols]
print(tn.head(), '\n')


def value_range(column):
    """Return the spread (max minus min) of one column."""
    return column.max() - column.min()


# apply() calls the given function once per column.
print("means:\n", tn.apply(np.mean), '\n')
print("ranges:\n", tn.apply(value_range), '\n')

### Exercise:
Compute the survival rate by age and gender: split the data into four
groups (young women, young men, old women, old men), and compute the
survival rate in each of the groups.  You may want to use the median
age for the young-old boundary.

In [188]:
# Split the passengers at the median age and compare the survival rates of
# the four age-by-sex groups.
ageb = titanic.age.median()
print('age boundary:', ageb, '\n')

# Boolean masks for the two splits.
male_mask = titanic.sex == 'male'
female_mask = titanic.sex != 'male'
young_mask = titanic.age < ageb
old_mask = titanic.age >= ageb

# The mean of the 'survived' column within a group is its survival rate.
ym = titanic[young_mask & male_mask].survived.mean()
print('young men', ym)
yf = titanic[young_mask & female_mask].survived.mean()
print('young women', yf)
om = titanic[old_mask & male_mask].survived.mean()
print('old men', om)
of = titanic[old_mask & female_mask].survived.mean()
print('old women', of)

age boundary: 28.0 

young men 0.23452768729641693
young women 0.7106598984771574
old men 0.1794871794871795
old women 0.7958115183246073


## Data Wrangling

### Database joins

In [None]:
# Two small tables that share the column 'key'.
df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': range(7)})
df2 = pd.DataFrame({'key': list('abd'), 'data2': range(3)})
## many-to-one inner join
inner = pd.merge(df1, df2)
print("inner join\n", inner)
## many-to-one outer join
outer = pd.merge(df1, df2, how='outer')
print("outer join\n", outer)

In [None]:
## Many-to-many joins
# Both tables contain repeated keys.
df3 = pd.DataFrame({'key': list('bbacab'), 'data3': range(6)})
df4 = pd.DataFrame({'key': list('ababd'), 'data4': range(5)})
left_joined = pd.merge(df3, df4, on='key', how='left')
print("many-to-many left join\n", left_joined)

In [None]:
## join method
## Always uses index for the second data frame, so 'key' is first made
## the index of df4
df4_by_key = df4.set_index('key')
df3.join(df4_by_key, on='key', how='left')

### Concatenation along axes

In [None]:
## concatenating numpy matrices
# A 3x4 matrix holding 0..11, glued to itself along each axis in turn.
arr = np.arange(12).reshape(3, 4)
pair = (arr, arr)
stacked_rows = np.concatenate(pair, axis=0)
print("along axis 0\n", stacked_rows)
stacked_cols = np.concatenate(pair, axis=1)
print("along axis 1\n", stacked_cols)


In [None]:
### Concatenation in pandas
# Three Series with non-overlapping index labels.
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6], index=list('fg'))
pieces = [s1, s2, s3]
c1 = pd.concat(pieces)
print("along axis0 (default)\n", c1)
print("note: this is of type", type(c1))

In [None]:
### concat in axis=1, get a DataFrame
# Each Series becomes one column of the result.
c2 = pd.concat((s1,s2,s3), axis=1)
# The label names axis 1, matching the call above (it used to say axis 0).
print("along axis 1\n", c2)
print("note: this is of type", type(c2))


In [None]:
# s4 shares the labels 'a' and 'b' with s1 and adds the labels of s3.
s4 = pd.concat((s1*5, s3))
print("outer join (default)\n", pd.concat([s1,s4], axis=1))
print("inner join\n", pd.concat([s1,s4], axis=1, join="inner"))
## give an axis:
# pd.concat no longer accepts `join_axes` (removed in pandas 1.0), so the
# result is reindexed to the wanted row labels instead.
print("given join axis\n", pd.concat([s1,s4], axis=1).reindex(['a', 'e', 'b', 'c']))


In [None]:
## identify by hierarchical key (axis = 0)
c5 = pd.concat([s1,s1,s3])
print("w/o keys\n", c5)
# `keys` adds an outer index level that tells the concatenated pieces apart.
c3 = pd.concat([s1,s1,s3], keys=['one', 'two', 'three'])
print("w/hierarchical key\n", c3)
# A bare expression in the middle of a cell is evaluated but never shown,
# so the unstacked form is printed explicitly.
print("unstacked\n", c3.unstack())
## get DataFrame columns (axis = 1)
c4 = pd.concat([s1,s2,s3], axis = 1, keys=['one', 'two', 'three'])
print("w/hierarchical key\n", c4)


In [None]:
## DataFrames behave in similar fashion:
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=list('ac'),
    columns=['three', 'four'],
)
print("df1:\n", df1)
print("df2:\n", df2)
# `keys` labels the columns coming from each input frame.
pd.concat([df1, df2], axis=1, keys=['L1', 'L2'])

In [None]:
## overlapping data
# Two Series over the same labels; `a` has gaps that `b` can fill.
labels = ['f', 'e', 'd', 'c', 'b', 'a']
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=labels)
b = pd.Series(np.arange(len(a), dtype=np.float64), index=labels)
# Blank out the last element by position.  Use .iloc: a plain b[-1] on a
# string-labelled Series relies on deprecated positional fallback.
b.iloc[-1] = np.nan
## select a if present, otherwise b
print("'np.where'\n", np.where(pd.isnull(a), b, a))
## do the same with 'combine_first'
print("'combine_first'\n", a.combine_first(b))

In [None]:
## works in a similar fashion for data frames
df1 = pd.DataFrame({
    'a': [1, np.nan, 5, np.nan],
    'b': [np.nan, 2, np.nan, 6],
    'c': range(2, 18, 4),
})
df2 = pd.DataFrame({
    'a': [5, 4, np.nan, 3, 7],
    'b': [np.nan, 3, 4, 6, 8],
})
# Values of df1 where present, otherwise the corresponding values of df2.
df1.combine_first(df2)

### Reshaping and pivoting

In [None]:
# A small table with named row and column indexes.
state_index = pd.Index(['OH', 'CO'], name='state')
number_index = pd.Index(['one', 'two', 'three'], name='number')
data = pd.DataFrame(np.arange(6).reshape(2, 3),
                    index=state_index, columns=number_index)
# stack() moves the columns into an inner index level (long form) ...
long = data.stack()
print("long form\n", long)
# ... and unstack() moves the inner level back into columns (wide form).
wide = long.unstack()
print('wide form\n', wide)
# Unstacking level 0 puts the states into the columns instead.
print('wide with 0-level in columns:\n', long.unstack(0))

## Chapter 9: Data Aggregation and Group Operations

In [None]:
# All columns of a DataFrame must have the same length.  The key columns
# used to carry a sixth (NaN) entry while the data columns have five values,
# which makes the constructor raise a ValueError.  Five rows are used
# throughout.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5), 'data2': np.random.randn(5)})
## group the data
# group by a single key
grouped = df['data1'].groupby(df['key1'])
print("means by key1:\n", grouped.mean())
# group by two keys
print("\nmeans by key1, key2:\n", df['data1'].groupby([df['key1'], df.key2]).mean() )


In [None]:
# The grouping keys here are separate arrays, not columns of df.
states = np.array(['OH', 'CA', 'CA', 'OH', 'OH'])
years = np.array([2005, 2005, 2006, 2005, 2006])
by_state_year = df.data1.groupby([states, years])
print("means by state, year:\n", by_state_year.mean())
## use the same data frame:
# Columns of the frame can be named directly as grouping keys.
by_both_keys = df.groupby(['key1', 'key2'])
print("\nkeys in the same df:\n", by_both_keys.mean())


### Exercise:
Repeat the previous exercise using grouping: compute the survival rate
for men, women, old and young using the groupby operator.