# Multindexing in pandas

In [2]:
import pandas as pd
import numpy as np

In [3]:
index = [('California', 2000), ('California', 2010), 
        ('New York', 2000), ('New York', 2010), ('Texas', 2000), ('Texas', 2010)]

populations = np.random.randint(10000000, 100000000, size=6)

In [4]:
# Create a Series 'pop' where the index is index and the values is populations

pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    95592463
(California, 2010)    38603960
(New York, 2000)      78644447
(New York, 2010)      22081561
(Texas, 2000)         72112103
(Texas, 2010)         53318191
dtype: int32

In [5]:
# Create MultiIndex object from index
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [6]:
# Update the index of pop with the new index
pop = pop.reindex(index)

In [7]:
# Access all data for which the second index is 2010
pop.loc[:, 2010]

California    38603960
New York      22081561
Texas         53318191
dtype: int32

In [8]:
# Make a single-indexed dataframe from the pop Series
pop_df = pop.unstack()

In [9]:
# Turn that dataframe back into a MultiIndexed Series
pop_df.stack()

California  2000    95592463
            2010    38603960
New York    2000    78644447
            2010    22081561
Texas       2000    72112103
            2010    53318191
dtype: int32

In [16]:
under18 = [9267089, 9284094,
          4687374, 4318033,
          5906301, 6879014]

# Make a MultiIndexed dataframe from the pop series. Result will be the MultiIndexed pop series with a column added.
pop_df = pd.DataFrame({'pop': pop, 'under18': under18})
pop_df

Unnamed: 0,Unnamed: 1,pop,under18
California,2000,95592463,9267089
California,2010,38603960,9284094
New York,2000,78644447,4687374
New York,2010,22081561,4318033
Texas,2000,72112103,5906301
Texas,2010,53318191,6879014


In [10]:
# Get unstacked df with the fraction of people under 18 by year
(pop_df['under18'] / pop_df['pop']).unstack()

Unnamed: 0,2000,2010
California,0.197614,0.228814
New York,0.193292,0.115281
Texas,0.068256,0.120775


## Methods of MultiIndex Creation

In [11]:
# Make df by passing a list of two or more index arrays to the constructor
# (Generate a random nxn matrix for the values, add column names / index names manually)

mat = np.random.randint(0, 10, size=(4, 4))

mdf = pd.DataFrame(mat, index=[[1, 1, 2, 2], ['a', 'b', 'c', 'd']], columns=[[1, 1, 2, 2],  ['a', 'b', 'c', 'd']])

mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,1,1,2,2
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,c,d
1,a,3,6,6,1
1,b,2,0,6,0
2,c,0,7,0,0
2,d,4,8,7,1


In [12]:
[{i: j} for i, j in enumerate(pop.values)]

[{0: 46894801},
 {1: 40574847},
 {2: 24250239},
 {3: 37456591},
 {4: 86531452},
 {5: 56957055}]

In [13]:
# Pass a dictionary with appropriate tuples as keys
data = {
    ('California', 2000): 33871648,
    ('California', 2010): 23452345,
    ('Texas', 2000): 43328938,
    ('Texas', 2010): 42345849,
}

pd.Series(data)

California  2000    33871648
            2010    23452345
Texas       2000    43328938
            2010    42345849
dtype: int64

### Explicit MultiIndex constructors

Using class method constructors in pd.MultiIndex

Levels: list of lists containing available index values for each level
labels: List of lists that reference these levels

In [14]:
# Construct a MultiIndex object from list of arrays
pd.MultiIndex.from_arrays([[1, 1, 2, 2], ['a', 'b', 'c', 'd']], names=['int', 'str'])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 1, 1], [0, 1, 2, 3]],
           names=['int', 'str'])

In [15]:
# Construct a MultiIndex object from list of tuples
pd.MultiIndex.from_tuples([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd')])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 1, 1], [0, 1, 2, 3]])

In [16]:
# Construct a MultiIndex object from Cartesian product of single indices
pd.MultiIndex.from_product([[1, 2], ['a', 'b', 'c', 'd']])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]])

In [17]:
# Construct a MultiIndex directly by passing it 'levels' and 'labels' arguments
pd.MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
              labels=[[0, 0, 1, 1], [0, 1, 2, 3]])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 1, 1], [0, 1, 2, 3]])

In [18]:
# Add names 'state' and 'year' to the index levels of pop
pop.index.names = ['state', 'year']
pop

state       year
California  2000    46894801
            2010    40574847
New York    2000    24250239
            2010    37456591
Texas       2000    86531452
            2010    56957055
dtype: int32

### MultiIndex for columns

In [19]:
# Create a random medical dataframe.
# Index levels: year, visit
# Column levels: name, visit type

index = pd.MultiIndex.from_product([[2017, 2018], [1, 2]])
col = pd.MultiIndex.from_product([['Alex', 'Devin', 'Savanna',], ['Blood Test', 'Donation']])

In [20]:
health_data = pd.DataFrame(data=np.random.randint(0, 10, (4, 6)), index=index, columns=col)

In [21]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,5,0,1,0,5,3
2017,2,6,5,3,4,1,8
2018,1,1,1,9,1,9,0
2018,2,5,9,5,9,0,1


In [22]:
health_data.loc[2017]

Unnamed: 0_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
1,5,0,1,0,5,3
2,6,5,3,4,1,8


In [23]:
# Get all health data for just donation

idx = pd.IndexSlice
health_data.loc[idx[:], idx[:, 'Donation']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Devin,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Donation,Donation,Donation
2017,1,0,0,3
2017,2,5,4,8
2018,1,1,1,0
2018,2,9,9,1


### Indexing and Slicing a MultiIndex

In [24]:
pop

state       year
California  2000    46894801
            2010    40574847
New York    2000    24250239
            2010    37456591
Texas       2000    86531452
            2010    56957055
dtype: int32

In [25]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,5,0,1,0,5,3
2017,2,6,5,3,4,1,8
2018,1,1,1,9,1,9,0
2018,2,5,9,5,9,0,1


In [26]:
rdf = pd.DataFrame(np.random.randint(0, 10, (5, 5)), index=[i for i in range(5)], columns=[i for i in 'abcde'])

health_data.iloc[3, 1]

9

In [27]:
# From pop, grab florida's population in the year 2000
pop['Texas', 2000]

health_data.loc[2017, 'Alex':'Devin'].iloc[0, 0:4]

Alex   Blood Test    5
       Donation      0
Devin  Blood Test    1
       Donation      0
Name: 1, dtype: int32

In [28]:
# From pop, grab all info for California
pop['California']

year
2000    46894801
2010    40574847
dtype: int32

In [29]:
# From pop, grab info for both California and New York
pop[['California', 'New York']]

state       year
California  2000    46894801
            2010    40574847
New York    2000    24250239
            2010    37456591
dtype: int32

In [30]:
# From pop, grab all info for the year 2000
pop[:, 2000]

state
California    46894801
New York      24250239
Texas         86531452
dtype: int32

In [31]:
# From pop, grab all info where the population is above a certain amount
pop[pop > 20000000]

state       year
California  2000    46894801
            2010    40574847
New York    2000    24250239
            2010    37456591
Texas       2000    86531452
            2010    56957055
dtype: int32

In [32]:
# From pop, grab the first two states
pop.loc['California': 'New York']

state       year
California  2000    46894801
            2010    40574847
New York    2000    24250239
            2010    37456591
dtype: int32

### Multiply Indexed DataFrames

In [33]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,5,0,1,0,5,3
2017,2,6,5,3,4,1,8
2018,1,1,1,9,1,9,0
2018,2,5,9,5,9,0,1


In [34]:
# Selection for Series works on df columns
# Get Alex's blood test data from health_data

health_data['Alex']['Blood Test']
health_data['Alex', 'Blood Test']

2017  1    5
      2    6
2018  1    1
      2    5
Name: (Alex, Blood Test), dtype: int32

In [35]:
# Get all data from the first visit of 2017
health_data.loc[2017]

Unnamed: 0_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
1,5,0,1,0,5,3
2,6,5,3,4,1,8


In [36]:
# Get the first 4x4 set of data from health_data by integer location (should be all of Alex's data)
health_data.iloc[:2, :2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation
2017,1,5,0
2017,2,6,5


In [37]:
health_data['Alex']['Blood Test'][2017] = pd.Series([1, 2])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [38]:
health_data.loc[:, 'Alex':'Devin']

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation
2017,1,1,0,1,0
2017,2,2,5,3,4
2018,1,1,1,9,1
2018,2,5,9,5,9


In [39]:
idx = pd.IndexSlice

# Using idx, get only blood test data from first visits
health_data
health_data.loc[idx[:, 1], idx[:, 'Blood Test']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Devin,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Blood Test,Blood Test
2017,1,1,1,5
2018,1,1,9,9


### Rearranging Multi-Indices

In [40]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,1,0,1,0,5,3
2017,2,2,5,3,4,1,8
2018,1,1,1,9,1,9,0
2018,2,5,9,5,9,0,1


In [41]:
# Test how unstack works with DataFrames

health_data.unstack()

Unnamed: 0_level_0,Alex,Alex,Alex,Alex,Devin,Devin,Devin,Devin,Savanna,Savanna,Savanna,Savanna
Unnamed: 0_level_1,Blood Test,Blood Test,Donation,Donation,Blood Test,Blood Test,Donation,Donation,Blood Test,Blood Test,Donation,Donation
Unnamed: 0_level_2,1,2,1,2,1,2,1,2,1,2,1,2
2017,1,2,0,5,1,3,0,4,5,1,3,8
2018,1,5,1,9,9,5,1,9,9,0,0,1


In [42]:
# Make new dataframe with 2019 data to be added to health_data later

ni = pd.MultiIndex.from_product([[2019], [1, 2]])
nc = pd.MultiIndex.from_product([['Alex', 'Devin', 'Savanna'], ['Blood Test', 'Donation']])

nd = pd.DataFrame(data=np.random.randint(0, 10, (2, 6)), index=ni, columns=nc)
nd

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2019,1,1,7,4,4,9,8
2019,2,8,5,2,0,3,0


#### Sorted and unsorted indices

In [67]:
# Make unsorted data series

index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names=['let', 'num']

data

let  num
a    1      0.067763
     2      0.891819
c    1      0.025713
     2      0.844037
b    1      0.661885
     2      0.850021
dtype: float64

In [72]:
# Show that you can't index a Series normally if it isn't sorted

#data['a':'b']  # gives an error

In [74]:
# Sort the index of data

data.sort_index()['a':'b']

let  num
a    1      0.067763
     2      0.891819
b    1      0.661885
     2      0.850021
dtype: float64

In [78]:
# Sort the levels of data
data = data.sort_index()

In [80]:
# Select [a through b] of the sorted Series 'data'
data['a':'b']

let  num
a    1      0.067763
     2      0.891819
b    1      0.661885
     2      0.850021
dtype: float64

In [83]:
# Unstack 'data', with num going to columns

data.unstack(level=1)

num,1,2
let,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.067763,0.891819
b,0.661885,0.850021
c,0.025713,0.844037


In [84]:
# Unstack 'data', with let going to columns
data.unstack(level=0)

let,a,b,c
num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.067763,0.661885,0.025713
2,0.891819,0.850021,0.844037


#### Index setting and resetting

In [86]:
# Turn 'pop' into a flattened single-indexed dataframe with the columns being the former indices, 'pop_flat'
pop_flat = pop.unstack(level=1)
pop_flat

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,46894801,40574847
New York,24250239,37456591
Texas,86531452,56957055


In [93]:
# Turn pop_flat back into pop (e.g. perform the inverse operation)
pop_flat.stack()

state       year
California  2000    46894801
            2010    40574847
New York    2000    24250239
            2010    37456591
Texas       2000    86531452
            2010    56957055
dtype: int32

### Data Aggregations on Multi-Indices

In [94]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,1,0,1,0,5,3
2017,2,2,5,3,4,1,8
2018,1,1,1,9,1,9,0
2018,2,5,9,5,9,0,1


In [113]:
# Get mean health measurements for each year across visits
health_data.mean(level=0)

# Get mean health measurements across people
#health_data.mean(axis=1, level=1)

Unnamed: 0_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1.5,2.5,2.0,2.0,3.0,5.5
2018,3.0,5.0,7.0,5.0,4.5,0.5


In [115]:
# Get mean health measurements across all people
health_data.mean(axis=1, level=1)

Unnamed: 0,Unnamed: 1,Blood Test,Donation
2017,1,2.333333,1.0
2017,2,2.0,5.666667
2018,1,6.333333,0.666667
2018,2,3.333333,6.333333


In [119]:
# Get mean health measurements for each year across all people
health_data.mean(level=0).mean(axis=1, level=1)

Unnamed: 0,Blood Test,Donation
2017,2.166667,3.333333
2018,4.833333,3.5
