In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [14]:
# Build a small DataFrame whose rows carry a two-level (hierarchical) index.
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not imported a second time here.
data = {'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8]}

# One (Group, Item) tuple per row, in the same order as the values in `data`.
index = pd.MultiIndex.from_tuples(
    [
        ('Group 1', 'Item 1'),
        ('Group 1', 'Item 2'),
        ('Group 2', 'Item 1'),
        ('Group 2', 'Item 2'),
    ],
    names=['Group', 'Item'],
)
df = pd.DataFrame(data, index=index)


In [15]:
# Display the frame: 'Group' and 'Item' are the two levels of the row index.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Item,Unnamed: 2_level_1,Unnamed: 3_level_1
Group 1,Item 1,1,5
Group 1,Item 2,2,6
Group 2,Item 1,3,7
Group 2,Item 2,4,8


In [16]:
# Selecting data at specific levels
# A single outer-level label returns every row under it; the matched 'Group'
# level is dropped, leaving 'Item' as the index of the result.
df.loc['Group 1']  # Selects all rows under 'Group 1'


Unnamed: 0_level_0,A,B
Item,Unnamed: 1_level_1,Unnamed: 2_level_1
Item 1,1,5
Item 2,2,6


In [17]:

# Pick the single row keyed by the full (Group, Item) pair and keep every
# column; the result is a Series of that row's values.
df.loc[('Group 1', 'Item 1'), :]


A    1
B    5
Name: (Group 1, Item 1), dtype: int64

In [18]:

# Select column 'A' for every row under 'Group 1' with a single .loc call
# (row label, column label) instead of chaining two separate lookups.
df.loc['Group 1', 'A']


Item
Item 1    1
Item 2    2
Name: A, dtype: int64

In [19]:
# Slicing data within a level
# Label-based slices include both endpoints, so the rows under 'Group 2' are
# part of the result.
df.loc['Group 1':'Group 2']  # Slices from 'Group 1' through 'Group 2' (inclusive)


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Item,Unnamed: 2_level_1,Unnamed: 3_level_1
Group 1,Item 1,1,5
Group 1,Item 2,2,6
Group 2,Item 1,3,7
Group 2,Item 2,4,8


In [22]:
# Take 'Item 1' from every group: the bare ':' leaves the outer 'Group' level
# unconstrained while the inner 'Item' level is pinned to one label.
df.loc[pd.IndexSlice[:, 'Item 1'], :]


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Item,Unnamed: 2_level_1,Unnamed: 3_level_1
Group 1,Item 1,1,5
Group 2,Item 1,3,7


In [23]:
# Pivot the column labels ('A', 'B') into a new innermost index level; the
# result is a Series indexed by (Group, Item, column label).
df.stack()

Group    Item     
Group 1  Item 1  A    1
                 B    5
         Item 2  A    2
                 B    6
Group 2  Item 1  A    3
                 B    7
         Item 2  A    4
                 B    8
dtype: int64

In [24]:
# Pivot the innermost row level ('Item') into the columns, leaving 'Group'
# as the only row-index level.
df.unstack()

Unnamed: 0_level_0,A,A,B,B
Item,Item 1,Item 2,Item 1,Item 2
Group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Group 1,1,2,5,6
Group 2,3,4,7,8


In [25]:
# Turn the 'Group' and 'Item' index levels back into ordinary columns; the
# returned frame gets a plain integer row index.
df.reset_index()

Unnamed: 0,Group,Item,A,B
0,Group 1,Item 1,1,5
1,Group 1,Item 2,2,6
2,Group 2,Item 1,3,7
3,Group 2,Item 2,4,8


## Groupby

We often want to look at summary statistics broken down by a categorical column.

In [28]:
# Load the iris sample dataset through seaborn. `sns` is imported together
# with the other dependencies in the notebook's first cell rather than
# mid-notebook.
# NOTE(review): load_dataset presumably fetches the data over the network on
# first use -- confirm before running offline.
iris = sns.load_dataset('iris')

In [30]:
# groupby on its own yields a DataFrameGroupBy object rather than a table of
# numbers; checking the type makes that visible.
type(iris.groupby('species'))

pandas.core.groupby.generic.DataFrameGroupBy

In [33]:
# Per-species summary: total and average sepal width plus the largest petal
# length. Giving a list of functions for one column produces one output
# column per function, hence the two-level column header.
(
    iris
    .groupby('species')
    .agg({'sepal_width': ['sum', 'mean'], 'petal_length': 'max'})
)


Unnamed: 0_level_0,sepal_width,sepal_width,petal_length
Unnamed: 0_level_1,sum,mean,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
setosa,171.4,3.428,1.9
versicolor,138.5,2.77,5.1
virginica,148.7,2.974,6.9


In [35]:
# Load seaborn's 'penguins' sample dataset, used next for a groupby on two keys.
penguins = sns.load_dataset('penguins')

In [37]:
# Average bill length for each (species, island) pair; grouping on two keys
# gives a Series with a two-level index.
(
    penguins
    .groupby(['species', 'island'])['bill_length_mm']
    .mean()
)

species    island   
Adelie     Biscoe       38.975000
           Dream        38.501786
           Torgersen    38.950980
Chinstrap  Dream        48.833824
Gentoo     Biscoe       47.504878
Name: bill_length_mm, dtype: float64

## Combining Datasets

```pandas.merge``` connects rows in DataFrames based on one or more keys
~~~
pd.merge(dfl, dfr, on='key')
pd.merge(dfl, dfr, left_on='lkey', right_on='rkey', how='inner')
~~~

By default, ```pd.merge``` performs an "inner" join, so if the ```how``` argument is left blank it assumes inner join.  Other possibilities:

* ```inner```: Use only the key combinations that are in both tables
* ```left```: Use all the key combinations in the left table
* ```right```: Use all the key combinations in the right table
* ```outer```: Use all key combinations found in either table (the union of both)

In [None]:
# Left-hand table for the merge demo: a key column with repeated values plus
# a running counter.
dfl = pd.DataFrame(
    {
        'lkey': list('bbacaab'),
        'data1': range(7),
    }
)
dfl

In [None]:
# Right-hand table for the merge demo: one row per key. Key 'd' does not
# occur in dfl, and dfl's key 'c' does not occur here.
dfr = pd.DataFrame(
    {
        'rkey': list('abd'),
        'data2': range(3),
    }
)
dfr

In [None]:
# Inner join matching dfl.lkey to dfr.rkey. how='inner' is the default and is
# spelled out here only to make the join type visible.
pd.merge(
    dfl,
    dfr,
    how='inner',
    left_on='lkey',
    right_on='rkey',
)

```pandas.concat``` concatenates or "stacks" together objects along an axis

In [None]:
# First table for the concat demo: five states, one column per attribute.
df1 = pd.DataFrame(
    {
        'state': ['Utah', 'Ohio', 'Tennessee', 'Wyoming', 'Texas'],
        'order': [45, 17, 16, 44, 28],
        'min_elev': [2180, 455, 178, 3105, 0],
        'capital': ['Salt Lake City', 'Columbus', 'Nashville', 'Cheyenne', 'Austin'],
    }
)
df1

In [None]:
# Second table for the concat demo: four more states with the same columns
# as df1.
df2 = pd.DataFrame(
    {
        'state': ['Nebraska', 'Maryland', 'Arizona', 'North Carolina'],
        'order': [37, 7, 48, 12],
        'min_elev': [840, 0, 72, 0],
        'capital': ['Lincoln', 'Annapolis', 'Phoenix', 'Raleigh'],
    }
)
df2

In [None]:
# Stack df2's rows beneath df1's (axis=0 is the default, written out here).
# Each frame keeps its own row labels, so labels 0-3 appear twice in the
# result.
df_new = pd.concat([df1, df2], axis=0)
df_new

In [None]:
# reset_index returns a new frame; the result is not assigned here, so
# df_new itself is left untouched.
df_new.reset_index()

In [None]:
# Redisplay df_new to check its state after the preceding reset_index cell.
df_new