# Useful Pandas Snippets

In [46]:
import pandas as pd
import numpy as np

In [47]:
# Fix NumPy's global seed so the random sample data is identical on every run.
np.random.seed(0)

# 50 rows of random integers in [0, 50), one column per label A-D.
column_labels = list('ABCD')
sample_values = np.random.randint(0, 50, size=(50, len(column_labels)))
df = pd.DataFrame(sample_values, columns=column_labels)

In [48]:
# Peek at the first five rows of the freshly generated frame.
df.head(n=5)

Unnamed: 0,A,B,C,D
0,44,47,0,3
1,3,39,9,19
2,21,36,23,6
3,24,24,12,1
4,38,39,23,46


## Update column in Pandas

#### Update column D where columns A and B are equal. Multiply D by 8888

In [49]:
# Show only the rows whose A and B values match.
df.loc[df['A'] == df['B']]

Unnamed: 0,A,B,C,D
3,24,24,12,1


In [50]:
# Build the row mask once, then scale D in place for the matching rows only.
a_equals_b = df['A'] == df['B']
df.loc[a_equals_b, 'D'] *= 8888

In [51]:
# First five rows again: the row where A == B now carries the scaled D value.
df.head(n=5)

Unnamed: 0,A,B,C,D
0,44,47,0,3
1,3,39,9,19
2,21,36,23,6
3,24,24,12,8888
4,38,39,23,46


## Add column to a DataFrame

In [52]:
# New column of random integers in [0, 50). The number of values is taken from
# the frame itself rather than a literal 50, so the assignment stays valid if
# the number of rows ever changes.
df['E'] = np.random.randint(0, 50, size=len(df))

In [53]:
# First five rows, now including the new column E.
df.head(n=5)

Unnamed: 0,A,B,C,D,E
0,44,47,0,3,24
1,3,39,9,19,15
2,21,36,23,6,41
3,24,24,12,8888,18
4,38,39,23,46,40


## Add row to a DataFrame

In [54]:
# Last five rows before anything is appended.
df.tail(n=5)

Unnamed: 0,A,B,C,D,E
45,13,45,11,16,3
46,24,29,21,46,35
47,25,16,19,33,39
48,40,32,36,6,9
49,21,31,13,7,9


In [55]:
# One-row frame holding only columns A, C and D; it has no B or E values,
# so combining it with df leaves those two cells missing.
new_row_columns = list('ACD')
new_row_values = np.random.randint(0, 50, size=(1, len(new_row_columns)))
dfToAppend = pd.DataFrame(new_row_values, columns=new_row_columns)
dfToAppend

Unnamed: 0,A,C,D
0,41,23,3


In [56]:
# Add the new row at the bottom and renumber the index 0..n-1.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. Columns missing from the new row
# are filled with NaN.
df = pd.concat([df, dfToAppend], ignore_index=True)

In [57]:
# Last five rows: the new row is at the bottom, with NaN in the columns it lacked.
df.tail(n=5)

Unnamed: 0,A,B,C,D,E
46,24,29.0,21,46,35.0
47,25,16.0,19,33,39.0
48,40,32.0,36,6,9.0
49,21,31.0,13,7,9.0
50,41,,23,3,


## Change all NaNs to None (useful before loading into a db)
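<b>df.where(cond, other) keeps each value where cond is True and substitutes other elsewhere; with a not-null condition it behaves like Oracle's NVL function</b>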

In [58]:
# Replace every NaN with None. The frame is cast to object dtype first because
# a float column cannot hold None: without the cast, recent pandas versions
# coerce the replacement straight back to NaN and nothing changes.
df.astype(object).where(df.notnull(), None).tail()

Unnamed: 0,A,B,C,D,E
46,24,29.0,21,46,35.0
47,25,16.0,19,33,39.0
48,40,32.0,36,6,9.0
49,21,31.0,13,7,9.0
50,41,,23,3,


## Loop through rows in a DataFrame (if you must)

In [59]:
# Walk the first five rows one at a time. iterrows() yields (index label, row
# as a Series); the values print as floats because each row Series is upcast
# to a single common dtype.
first_rows = df.head()
for row_label, row_values in first_rows.iterrows():
    print(row_label, row_values['A'], row_values['E'])

0 44.0 24.0
1 3.0 15.0
2 21.0 41.0
3 24.0 18.0
4 38.0 40.0


## GroupBy capabilities of Pandas 

In [61]:
# Number of rows in the frame.
df.shape[0]

51

In [62]:
# Label the rows in blocks of ten (test1 ... test5); one extra 'test5' is
# added at the end so the labels cover all 51 rows.
group_names = np.array(['test1', 'test2', 'test3', 'test4', 'test5'])
labels_in_blocks_of_ten = np.repeat(group_names, 10)
df['F'] = np.append(labels_in_blocks_of_ten, ['test5'])

In [69]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 6 columns):
A    51 non-null int32
B    50 non-null float64
C    51 non-null int32
D    51 non-null int32
E    50 non-null float64
F    51 non-null object
dtypes: float64(2), int32(3), object(1)
memory usage: 2.0+ KB


In [68]:
# Last five rows, now with the group label column F.
df.tail(n=5)

Unnamed: 0,A,B,C,D,E,F
46,24,29.0,21,46,35.0,test5
47,25,16.0,19,33,39.0,test5
48,40,32.0,36,6,9.0,test5
49,21,31.0,13,7,9.0,test5
50,41,,23,3,,test5


In [93]:
# Aggregate several columns per group of F, each with its own function(s):
#   - a plain string gives a single result column,
#   - a list gives one result column per function,
#   - a (label, function) tuple also sets the label of the result column.
# Custom labels are written as tuples because the nested-dict renaming form
# ({'Max C': 'max'}) raises SpecificationError in pandas >= 1.0. The result
# still has two-level (column, aggregation) column labels.
dfGr = df.groupby('F').agg({
    'A': 'mean',
    'B': ['sum', 'count'],
    'C': [('Max C', 'max'), ('Min C', 'min')],
    'D': [('Special func', lambda x: 'Max: {}'.format(max(x)))],
})
dfGr

Unnamed: 0_level_0,B,B,C,C,D,A
Unnamed: 0_level_1,sum,count,Min C,Max C,Special func,mean
F,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
test1,262,10,0,37,Max: 8888,23.2
test2,237,10,0,41,Max: 42,23.2
test3,287,10,3,48,Max: 49,22.0
test4,170,10,1,43,Max: 46,19.8
test5,289,10,2,36,Max: 48,26.636364


## Flatten MultiIndex columns

In [98]:
# Collapse the two-level column labels into single strings such as 'B_sum'.
# to_flat_index() yields one tuple per column (Index.ravel() is deprecated for
# this purpose). Labels that are already plain strings are kept unchanged, so
# re-running this cell does not join the characters of flattened names.
dfGr.columns = [
    '_'.join(label) if isinstance(label, tuple) else label
    for label in dfGr.columns.to_flat_index()
]

In [99]:
# Display the aggregated frame with its flattened, single-level column names.
dfGr

Unnamed: 0_level_0,B_sum,B_count,C_Min C,C_Max C,D_Special func,A_mean
F,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
test1,262,10,0,37,Max: 8888,23.2
test2,237,10,0,41,Max: 42,23.2
test3,287,10,3,48,Max: 49,22.0
test4,170,10,1,43,Max: 46,19.8
test5,289,10,2,36,Max: 48,26.636364


## Unstack index level

In [127]:
# 51 single-character labels: 25 'T' followed by 26 'M'.
arr = np.array(['T'] * 25 + ['M'] * 26)

In [128]:
# Shuffle the labels in place so T and M end up in random order; the order
# drawn depends on the current state of NumPy's global random generator.
np.random.shuffle(arr)
arr

array(['T', 'T', 'M', 'T', 'M', 'T', 'M', 'T', 'T', 'T', 'M', 'M', 'T',
       'M', 'M', 'T', 'M', 'T', 'T', 'T', 'T', 'T', 'M', 'M', 'M', 'T',
       'T', 'M', 'M', 'M', 'M', 'M', 'M', 'T', 'M', 'M', 'T', 'T', 'M',
       'T', 'M', 'M', 'M', 'M', 'M', 'M', 'T', 'T', 'T', 'T', 'T'], 
      dtype='<U1')

In [129]:
# Attach the shuffled T/M labels as column G, one label per row.
df['G'] = arr

In [130]:
# Last five rows, now with the T/M label column G.
df.tail(n=5)

Unnamed: 0,A,B,C,D,E,F,G
46,24,29.0,21,46,35.0,test5,T
47,25,16.0,19,33,39.0,test5,T
48,40,32.0,36,6,9.0,test5,T
49,21,31.0,13,7,9.0,test5,T
50,41,,23,3,,test5,T


In [131]:
# Group on the (F, G) pair, then total A and count the rows in each group.
# 'size' counts every row in the group, including rows where B is missing.
grouped_by_f_and_g = df.groupby(['F', 'G'])
dfGr2 = grouped_by_f_and_g.agg({'A': 'sum', 'B': 'size'})
dfGr2

Unnamed: 0_level_0,Unnamed: 1_level_0,B,A
F,G,Unnamed: 2_level_1,Unnamed: 3_level_1
test1,M,3,72
test1,T,7,160
test2,M,5,93
test2,T,5,139
test3,M,6,160
test3,T,4,60
test4,M,6,128
test4,T,4,70
test5,M,6,142
test5,T,5,151


In [138]:
# Pivot the outer index level (F) into the columns, leaving G as the row index.
dfGr2.unstack(level='F')

Unnamed: 0_level_0,B,B,B,B,B,A,A,A,A,A
F,test1,test2,test3,test4,test5,test1,test2,test3,test4,test5
G,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
M,3,5,6,6,6,72,93,160,128,142
T,7,5,4,4,5,160,139,60,70,151


## Pivot table

In [143]:
# Sum A and B for every (F, G) combination, with row and column totals
# labelled 'Total'. The aggregation is named by string: passing the NumPy
# callable np.sum as aggfunc triggers a FutureWarning in pandas 2.1+, and
# 'sum' is the recommended spelling that keeps the same behaviour.
df.pivot_table(
    values=['A', 'B'],
    index=['F'],
    columns=['G'],
    aggfunc='sum',
    margins=True,
    margins_name='Total',
)

Unnamed: 0_level_0,A,A,A,B,B,B
G,M,T,Total,M,T,Total
F,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
test1,72,160,232,83,179,262
test2,93,139,232,110,127,237
test3,160,60,220,183,104,287
test4,128,70,198,108,62,170
test5,142,151,293,181,108,289
Total,595,580,1175,665,580,1245
