# Multindexing in pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
index = [('California', 2000), ('California', 2010), 
        ('New York', 2000), ('New York', 2010), ('Texas', 2000), ('Texas', 2010)]

populations = np.random.randint(10000000, 100000000, size=6)

In [3]:
# Create a Series 'pop' where the index is index and the values is populations

pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    43930102
(California, 2010)    90286047
(New York, 2000)      46627823
(New York, 2010)      12032424
(Texas, 2000)         39398475
(Texas, 2010)         52072286
dtype: int32

In [4]:
# Create MultiIndex object from index
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [5]:
# Update the index of pop with the new index
pop = pop.reindex(index)

In [6]:
# Access all data for which the second index is 2010
pop.loc[:, 2010]

California    90286047
New York      12032424
Texas         52072286
dtype: int32

In [7]:
# Make a single-indexed dataframe from the pop Series
pop_df = pop.unstack()

In [8]:
# Turn that dataframe back into a MultiIndexed Series
pop_df.stack()

California  2000    43930102
            2010    90286047
New York    2000    46627823
            2010    12032424
Texas       2000    39398475
            2010    52072286
dtype: int32

In [9]:
under18 = [9267089, 9284094,
          4687374, 4318033,
          5906301, 6879014]

# Make a MultiIndexed dataframe from the pop series. Result will be the MultiIndexed pop series with a column added.
pop_df = pd.DataFrame({'pop': pop, 'under18': under18})
pop_df

Unnamed: 0,Unnamed: 1,pop,under18
California,2000,43930102,9267089
California,2010,90286047,9284094
New York,2000,46627823,4687374
New York,2010,12032424,4318033
Texas,2000,39398475,5906301
Texas,2010,52072286,6879014


In [10]:
# Get unstacked df with the fraction of people under 18 by year
(pop_df['under18'] / pop_df['pop']).unstack()

Unnamed: 0,2000,2010
California,0.210951,0.10283
New York,0.100527,0.358866
Texas,0.149912,0.132105


## Methods of MultiIndex Creation

In [11]:
# Make df by passing a list of two or more index arrays to the constructor
# (Generate a random nxn matrix for the values, add column names / index names manually)

mat = np.random.randint(0, 10, size=(4, 4))

mdf = pd.DataFrame(mat, index=[[1, 1, 2, 2], ['a', 'b', 'c', 'd']], columns=[[1, 1, 2, 2],  ['a', 'b', 'c', 'd']])

mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,1,1,2,2
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,c,d
1,a,3,3,7,7
1,b,4,6,4,2
2,c,5,8,0,4
2,d,6,9,3,0


In [12]:
[{i: j} for i, j in enumerate(pop.values)]

[{0: 43930102},
 {1: 90286047},
 {2: 46627823},
 {3: 12032424},
 {4: 39398475},
 {5: 52072286}]

In [13]:
# Pass a dictionary with appropriate tuples as keys
data = {
    ('California', 2000): 33871648,
    ('California', 2010): 23452345,
    ('Texas', 2000): 43328938,
    ('Texas', 2010): 42345849,
}

pd.Series(data)

California  2000    33871648
            2010    23452345
Texas       2000    43328938
            2010    42345849
dtype: int64

### Explicit MultiIndex constructors

Using class method constructors in pd.MultiIndex

Levels: list of lists containing available index values for each level
labels: List of lists that reference these levels

In [14]:
# Construct a MultiIndex object from list of arrays
pd.MultiIndex.from_arrays([[1, 1, 2, 2], ['a', 'b', 'c', 'd']], names=['int', 'str'])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 1, 1], [0, 1, 2, 3]],
           names=['int', 'str'])

In [15]:
# Construct a MultiIndex object from list of tuples
pd.MultiIndex.from_tuples([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd')])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 1, 1], [0, 1, 2, 3]])

In [16]:
# Construct a MultiIndex object from Cartesian product of single indices
pd.MultiIndex.from_product([[1, 2], ['a', 'b', 'c', 'd']])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]])

In [17]:
# Construct a MultiIndex directly by passing it 'levels' and 'labels' arguments
pd.MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
              labels=[[0, 0, 1, 1], [0, 1, 2, 3]])

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 1, 1], [0, 1, 2, 3]])

In [18]:
# Add names 'state' and 'year' to the index levels of pop
pop.index.names = ['state', 'year']
pop

state       year
California  2000    43930102
            2010    90286047
New York    2000    46627823
            2010    12032424
Texas       2000    39398475
            2010    52072286
dtype: int32

### MultiIndex for columns

In [19]:
# Create a random medical dataframe.
# Index levels: year, visit
# Column levels: name, visit type

index = pd.MultiIndex.from_product([[2017, 2018], [1, 2]])
col = pd.MultiIndex.from_product([['Alex', 'Devin', 'Savanna',], ['Blood Test', 'Donation']])

In [20]:
health_data = pd.DataFrame(data=np.random.randint(0, 10, (4, 6)), index=index, columns=col)

In [21]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,5,8,2,2,5,7
2017,2,3,5,4,3,7,4
2018,1,3,4,0,1,7,4
2018,2,5,0,6,4,3,2


In [22]:
# Get all health data for just donation

idx = pd.IndexSlice
health_data.loc[idx[:], idx[:, 'Donation']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Devin,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Donation,Donation,Donation
2017,1,8,2,7
2017,2,5,3,4
2018,1,4,1,4
2018,2,0,4,2


### Indexing and Slicing a MultiIndex

In [23]:
pop

state       year
California  2000    43930102
            2010    90286047
New York    2000    46627823
            2010    12032424
Texas       2000    39398475
            2010    52072286
dtype: int32

In [24]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,5,8,2,2,5,7
2017,2,3,5,4,3,7,4
2018,1,3,4,0,1,7,4
2018,2,5,0,6,4,3,2


In [25]:
rdf = pd.DataFrame(np.random.randint(0, 10, (5, 5)), index=[i for i in range(5)], columns=[i for i in 'abcde'])

health_data.iloc[3, 1]

0

In [26]:
# From pop, grab florida's population in the year 2000
pop['Texas', 2000]

health_data.loc[2017, 'Alex':'Devin'].iloc[0, 0:4]

Alex   Blood Test    5
       Donation      8
Devin  Blood Test    2
       Donation      2
Name: 1, dtype: int32

In [27]:
# From pop, grab all info for California
pop['California']

year
2000    43930102
2010    90286047
dtype: int32

In [28]:
# From pop, grab info for both California and New York
pop[['California', 'New York']]

state       year
California  2000    43930102
            2010    90286047
New York    2000    46627823
            2010    12032424
dtype: int32

In [29]:
# From pop, grab all info for the year 2000
pop[:, 2000]

state
California    43930102
New York      46627823
Texas         39398475
dtype: int32

In [30]:
# From pop, grab all info where the population is above a certain amount
pop[pop > 20000000]

state       year
California  2000    43930102
            2010    90286047
New York    2000    46627823
Texas       2000    39398475
            2010    52072286
dtype: int32

In [31]:
# From pop, grab the first two states
pop.loc['California': 'New York']

state       year
California  2000    43930102
            2010    90286047
New York    2000    46627823
            2010    12032424
dtype: int32

### Multiply Indexed DataFrames

In [58]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,1,8,2,2,5,7
2017,2,2,5,4,3,7,4
2018,1,3,4,0,1,7,4
2018,2,5,0,6,4,3,2


In [59]:
# Selection for Series works on df columns
# Get Alex's blood test data from health_data

health_data['Alex']['Blood Test']
health_data['Alex', 'Blood Test']

2017  1    1
      2    2
2018  1    3
      2    5
Name: (Alex, Blood Test), dtype: int32

In [74]:
# Get all data from the first visit of 2017
health_data.loc[2017]

Unnamed: 0_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
1,1,8,2,2,5,7
2,2,5,4,3,7,4


In [68]:
# Get the first 4x4 set of data from health_data by integer location (should be all of Alex's data)
health_data.iloc[:2, :2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation
2017,1,1,8
2017,2,2,5


In [36]:
health_data['Alex']['Blood Test'][2017] = pd.Series([1, 2])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [76]:
health_data.loc[:, 'Alex':'Devin']

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation
2017,1,1,8,2,2
2017,2,2,5,4,3
2018,1,3,4,0,1
2018,2,5,0,6,4


In [92]:
idx = pd.IndexSlice

# Using idx, get only blood test data from first visits
health_data
health_data.loc[idx[:, 1], idx[:, 'Blood Test']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Devin,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Blood Test,Blood Test
2017,1,1,2,5
2018,1,3,0,7


### Rearranging Multi-Indices

In [97]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,1,8,2,2,5,7
2017,2,2,5,4,3,7,4
2018,1,3,4,0,1,7,4
2018,2,5,0,6,4,3,2


In [96]:
# Test how unstack works with DataFrames

health_data.unstack()

Unnamed: 0_level_0,Alex,Alex,Alex,Alex,Devin,Devin,Devin,Devin,Savanna,Savanna,Savanna,Savanna
Unnamed: 0_level_1,Blood Test,Blood Test,Donation,Donation,Blood Test,Blood Test,Donation,Donation,Blood Test,Blood Test,Donation,Donation
Unnamed: 0_level_2,1,2,1,2,1,2,1,2,1,2,1,2
2017,1,2,8,5,2,4,2,3,5,7,7,4
2018,3,5,4,0,0,6,1,4,7,3,4,2


In [107]:
# Make new dataframe with 2019 data to be added to health_data later

ni = pd.MultiIndex.from_product([[2019], [1, 2]])
nc = pd.MultiIndex.from_product([['Alex', 'Devin', 'Savanna'], ['Blood Test', 'Donation']])

nd = pd.DataFrame(data=np.random.randint(0, 10, (2, 6)), index=ni, columns=nc)
nd

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,3,0,3,5,1,0
2017,2,9,1,6,7,3,1


#### Sorted and unsorted indices

In [120]:
# Make unsorted data series

index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names=['let', 'num']

data

let  num
a    1      0.151035
     2      0.727024
c    1      0.996214
     2      0.809756
b    1      0.902918
     2      0.337228
dtype: float64

In [122]:
# Show that you can't index a Series normally if it isn't sorted

In [123]:
# Sort the index of data

In [124]:
# Sort the levels of data

In [129]:
# Select [a through b] of the sorted Series 'data'

In [130]:
# Unstack 'data', with num going to columns

In [131]:
# Unstack 'data', with let going to columns

#### Index setting and resetting

In [133]:
# Turn 'pop' into a flattened single-indexed dataframe with the columns being the former indices, 'pop_flat'

In [134]:
# Turn pop_flat back into pop (e.g. perform the inverse operation)

### Data Aggregations on Multi-Indices

In [143]:
health_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Alex,Alex,Devin,Devin,Savanna,Savanna
Unnamed: 0_level_1,Unnamed: 1_level_1,Blood Test,Donation,Blood Test,Donation,Blood Test,Donation
2017,1,1,8,2,2,5,7
2017,2,2,5,4,3,7,4
2018,1,3,4,0,1,7,4
2018,2,5,0,6,4,3,2


In [144]:
# Get mean health measurements for each year across visits

In [None]:
# Get mean health measurements across all people

## Combining Datasets: Concat and Append

In [148]:
%reset

In [150]:
import pandas as pd
import numpy as np

In [160]:
# Convenience function

def make_dict(ind, cols):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return data
    #return pd.DataFrame(data, ind)
    
def make_df(ind, cols):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)

In [161]:
make_dict(range(3), 'ABC')

{'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2'], 'C': ['C0', 'C1', 'C2']}

In [163]:
make_df(range(3), 'ABC')

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
