In [72]:
# References:
# online docs (free):        https://pandas.pydata.org/pandas-docs/stable/
# book, old edition (free):  https://www.safaribooksonline.com/library/view/python-data-science/9781491912126/
# book, new edition (paid):  https://smile.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/

In [73]:
# pandas is a Python library used for data manipulation and analysis
# its two key data structures are Series objects and DataFrames

In [1]:
import numpy as np
import pandas as pd

In [2]:
##### indexing dataframes #####

In [3]:
# Seed NumPy's global RNG so the sampled values (and the outputs recorded below) are repeatable.
np.random.seed(0)

# 3x5 frame of normal draws: rows labelled X/Y/Z, columns labelled aa..ee.
df = pd.DataFrame(
    np.random.normal(size=(3, 5)),
    index=list('XYZ'),
    columns=['aa', 'bb', 'cc', 'dd', 'ee'],
)
df

Unnamed: 0,aa,bb,cc,dd,ee
X,1.764052,0.400157,0.978738,2.240893,1.867558
Y,-0.977278,0.950088,-0.151357,-0.103219,0.410599
Z,0.144044,1.454274,0.761038,0.121675,0.443863


In [37]:
# indexing columns: df[label] selects a single column by its label
df['aa']

X    1.764052
Y   -0.977278
Z    0.144044
Name: aa, dtype: float64

In [38]:
# getting multiple columns: pass a list of column labels inside the brackets
df[['aa','cc']]

Unnamed: 0,aa,cc
X,1.764052,0.978738
Y,-0.977278,-0.151357
Z,0.144044,0.761038


In [39]:
# indexing rows by label: .loc[label] selects the row whose index label is 'X'
df.loc['X']

aa    1.764052
bb    0.400157
cc    0.978738
dd    2.240893
ee    1.867558
Name: X, dtype: float64

In [40]:
# indexing rows by integer position (the implicit index): .iloc[0] is the first row,
# which is row 'X' here, as the Name in the output shows
df.iloc[0]

aa    1.764052
bb    0.400157
cc    0.978738
dd    2.240893
ee    1.867558
Name: X, dtype: float64

In [41]:
# getting multiple rows: pass a list of index labels to .loc
df.loc[['X','Z']]

Unnamed: 0,aa,bb,cc,dd,ee
X,1.764052,0.400157,0.978738,2.240893,1.867558
Z,0.144044,1.454274,0.761038,0.121675,0.443863


In [42]:
# indexing elements: select column 'aa' first, then the entry labelled 'X' within it
# NOTE: a chained lookup like this is fine for reading a value; for assignment use the
# single-step df.loc[row, col] form instead, to avoid SettingWithCopyWarning
df['aa']['X'] 

1.764052345967664

In [43]:
# same element: column first, then an explicit label lookup with .loc on that column
df['aa'].loc['X'] 

1.764052345967664

In [44]:
# same element: row first (via .loc), then the column label within that row
df.loc['X']['aa'] 

1.764052345967664

In [45]:
# same element in a single step: .loc[row_label, column_label]
df.loc['X', 'aa'] 

1.764052345967664

In [46]:
# getting multiple elements: pick the columns first, then the rows with .loc
df[['aa','cc']].loc[['X','Z']]

Unnamed: 0,aa,cc
X,1.764052,0.978738
Z,0.144044,0.761038


In [47]:
# same selection in a single .loc call: list of row labels, then list of column labels
# (alternative chained spelling: df.loc[['X','Z']][['aa','cc']])
df.loc[['X','Z'], ['aa','cc']]

Unnamed: 0,aa,cc
X,1.764052,0.978738
Z,0.144044,0.761038


In [48]:
# a single column selected from a DataFrame is a Series object
type(df['aa'])

pandas.core.series.Series

In [49]:
# a single row selected from a DataFrame is also a Series object
type(df.loc['X'])

pandas.core.series.Series

In [50]:
##### resetting/renaming index and columns #####

In [51]:
# Build a fresh frame whose index has no name yet; the following cells inspect
# the index name and then set one.
# No re-seeding here: the values continue the random stream started earlier.
df = pd.DataFrame(
    np.random.normal(size=(3, 5)),
    index=list('XYZ'),
    columns=['aa', 'bb', 'cc', 'dd', 'ee'],
)
df

Unnamed: 0,aa,bb,cc,dd,ee
X,0.333674,1.494079,-0.205158,0.313068,-0.854096
Y,-2.55299,0.653619,0.864436,-0.742165,2.269755
Z,-1.454366,0.045759,-0.187184,1.532779,1.469359


In [52]:
# inspect the index level names: a single level, unnamed (None) by default
df.index.names

FrozenList([None])

In [53]:
# give the single index level the name 'group' (one list entry per index level)
df.index.names = ['group']
df

Unnamed: 0_level_0,aa,bb,cc,dd,ee
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X,0.333674,1.494079,-0.205158,0.313068,-0.854096
Y,-2.55299,0.653619,0.864436,-0.742165,2.269755
Z,-1.454366,0.045759,-0.187184,1.532779,1.469359


In [54]:
# reset the index: start from a fresh frame labelled X/Y/Z for the reset_index demos below.
# No re-seeding here: the values continue the random stream started earlier.
df = pd.DataFrame(
    np.random.normal(size=(3, 5)),
    index=list('XYZ'),
    columns=['aa', 'bb', 'cc', 'dd', 'ee'],
)
df

Unnamed: 0,aa,bb,cc,dd,ee
X,0.154947,0.378163,-0.887786,-1.980796,-0.347912
Y,0.156349,1.230291,1.20238,-0.387327,-0.302303
Z,-1.048553,-1.420018,-1.70627,1.950775,-0.509652


In [55]:
# this resets the index to integers and keeps the old index as a new column ('index')
# the result is a new frame; df itself is not modified by this call
df.reset_index()

Unnamed: 0,index,aa,bb,cc,dd,ee
0,X,0.154947,0.378163,-0.887786,-1.980796,-0.347912
1,Y,0.156349,1.230291,1.20238,-0.387327,-0.302303
2,Z,-1.048553,-1.420018,-1.70627,1.950775,-0.509652


In [56]:
# this resets the index to integers and drops the old index instead of keeping it as a column
df.reset_index(drop=True)

Unnamed: 0,aa,bb,cc,dd,ee
0,0.154947,0.378163,-0.887786,-1.980796,-0.347912
1,0.156349,1.230291,1.20238,-0.387327,-0.302303
2,-1.048553,-1.420018,-1.70627,1.950775,-0.509652


In [57]:
# this resets the index to integers and drops the old index, modifying df itself
# ("in place") rather than returning a new frame
# (the assignment form df = df.reset_index(drop=True) is the non-mutating alternative)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,aa,bb,cc,dd,ee
0,0.154947,0.378163,-0.887786,-1.980796,-0.347912
1,0.156349,1.230291,1.20238,-0.387327,-0.302303
2,-1.048553,-1.420018,-1.70627,1.950775,-0.509652


In [58]:
# create a new column 'new' (one value per row); the next cell sets it as the index
df['new'] = ['P','Q','R']
df

Unnamed: 0,aa,bb,cc,dd,ee,new
0,0.154947,0.378163,-0.887786,-1.980796,-0.347912,P
1,0.156349,1.230291,1.20238,-0.387327,-0.302303,Q
2,-1.048553,-1.420018,-1.70627,1.950775,-0.509652,R


In [59]:
# Promote the 'new' column to be the row index. set_index returns a new frame,
# so rebind df to the result rather than mutating the existing object with inplace=True.
df = df.set_index('new')
df

Unnamed: 0_level_0,aa,bb,cc,dd,ee
new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P,0.154947,0.378163,-0.887786,-1.980796,-0.347912
Q,0.156349,1.230291,1.20238,-0.387327,-0.302303
R,-1.048553,-1.420018,-1.70627,1.950775,-0.509652


In [60]:
# rename index labels and column labels via old -> new mappings; labels that are
# not listed (cc, dd, ee) are left unchanged. rename returns a new frame, so rebind
# df to the result rather than mutating in place.
df = df.rename(
    index={'P': 'XX', 'Q': 'YY', 'R': 'ZZ'},
    columns={'aa': 'AA', 'bb': 'BB'},
)
df

Unnamed: 0_level_0,AA,BB,cc,dd,ee
new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XX,0.154947,0.378163,-0.887786,-1.980796,-0.347912
YY,0.156349,1.230291,1.20238,-0.387327,-0.302303
ZZ,-1.048553,-1.420018,-1.70627,1.950775,-0.509652


In [61]:
##### multi-indexing or index hierarchy #####

In [62]:
# Re-seed so the integer draws below are repeatable.
np.random.seed(0)

# 4x5 frame of random integers in [-100, 100) with a two-level row index:
# outer level X/Y crossed with inner level 1/2.
df = pd.DataFrame(
    np.random.randint(-100, 100, size=(4, 5)),
    index=pd.MultiIndex.from_product([list('XY'), [1, 2]]),
    columns=list('ABCDE'),
)
df

Unnamed: 0,Unnamed: 1,A,B,C,D,E
X,1,72,-53,17,92,-33
X,2,95,3,-91,-79,-64
Y,1,-13,-30,-12,40,-42
Y,2,93,-61,-13,74,-12


In [63]:
# select by the outer index level: all rows under 'X', now indexed by the inner level (1, 2)
df.loc['X']

Unnamed: 0,A,B,C,D,E
1,72,-53,17,92,-33
2,95,3,-91,-79,-64


In [64]:
# from the 'X' sub-frame, take column 'A' (a Series indexed by the inner level)
(df.loc['X'])['A']

1    72
2    95
Name: A, dtype: int32

In [65]:
# then pick the entry with inner label 1 from that column: a single value
(df.loc['X'])['A'].loc[1]

72

In [66]:
# alternatively go outer level -> inner level first: the full row for ('X', 1)
(df.loc['X']).loc[1]

A    72
B   -53
C    17
D    92
E   -33
Name: 1, dtype: int32

In [67]:
# and then take column 'A' from that row: the same single value as above
(df.loc['X']).loc[1]['A']

72

In [68]:
# cross-section indexing (.xs, used in the cells below) is helpful for grabbing rows
# by the lower-level index; those calls refer to index levels by name, so first
# check the current level names (both are None here)
df.index.names

FrozenList([None, None])

In [69]:
# name the two index levels: outer level -> 'group', inner level -> 'number'
df.index.names = ['group','number']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D,E
group,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
X,1,72,-53,17,92,-33
X,2,95,3,-91,-79,-64
Y,1,-13,-30,-12,40,-42
Y,2,93,-61,-13,74,-12


In [70]:
# grab all rows where the 'group' level equals 'X'; the result is indexed by the
# remaining 'number' level
df.xs('X', level='group')

Unnamed: 0_level_0,A,B,C,D,E
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,72,-53,17,92,-33
2,95,3,-91,-79,-64


In [71]:
# grab all rows where the 'number' level equals 1, across the different groups;
# the result is indexed by the remaining 'group' level
df.xs(1, level='number')

Unnamed: 0_level_0,A,B,C,D,E
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X,72,-53,17,92,-33
Y,-13,-30,-12,40,-42
