In [1]:
# region Import
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
import datetime as dt

sns.set()
pd.set_option('display.expand_frame_repr', False)
# endregion

## Groupby aggregation with relabeling


In [6]:
animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
                            'height': [9.1, 6.0, 9.5, 34.0],
                            'weight': [7.9, 7.5, 9.9, 198.0]})
animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [24]:
animals.groupby(by='kind').agg(
    minHeight = pd.NamedAgg(column='height', aggfunc=np.min),
    maxHeight = pd.NamedAgg(column='height', aggfunc=np.max),
    avgWeight = pd.NamedAgg(column='weight', aggfunc=np.mean)
)

Unnamed: 0_level_0,minHeight,maxHeight,avgWeight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


In [19]:
dates = pd.date_range(start='20190101', periods=6, freq='D')
dates

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
df = pd.DataFrame(data=np.random.randn(6, 4),
                 index=dates,
                 columns=['A', 'B', 'C', 'D'] )
df

Unnamed: 0,A,B,C,D
2019-01-01,0.501437,2.95776,0.355783,0.573872
2019-01-02,0.515532,-0.143581,0.255767,-0.022579
2019-01-03,1.971331,-1.723431,-0.800485,1.781385
2019-01-04,-1.597124,-1.216917,0.303841,-0.089252
2019-01-05,-1.11815,1.898717,-0.32258,1.893057
2019-01-06,0.895427,0.032905,1.181618,2.105195


In [8]:
list('AIJIOJAWD')

['A', 'I', 'J', 'I', 'O', 'J', 'A', 'W', 'D']

In [15]:
df.sort_values(by='A', axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2019-01-04,1.701021,-0.266332,0.782735,0.742868
2019-01-01,0.067524,-0.073968,1.460207,0.236756
2019-01-03,-0.307644,-0.047458,-0.200672,-1.867609
2019-01-06,-0.332514,-1.182792,-0.172614,0.034275
2019-01-02,-0.769239,0.283732,-0.169692,-0.148777
2019-01-05,-1.239346,0.932564,0.244171,1.669823


In [28]:
df.iloc[:-1, 0:2]

Unnamed: 0,A,B
2019-01-01,0.501437,2.95776
2019-01-02,0.515532,-0.143581
2019-01-03,1.971331,-1.723431
2019-01-04,-1.597124,-1.216917
2019-01-05,-1.11815,1.898717


In [44]:
A = df[df['A'] > 0]['A'].to_list()
A

[0.5014371721389133, 0.515532173892191, 1.971331303332622, 0.8954268362073656]

In [48]:
df[df['A'].isin(A)]

Unnamed: 0,A,B,C,D
2019-01-01,0.501437,2.95776,0.355783,0.573872
2019-01-02,0.515532,-0.143581,0.255767,-0.022579
2019-01-03,1.971331,-1.723431,-0.800485,1.781385
2019-01-06,0.895427,0.032905,1.181618,2.105195


In [50]:
df['F'] = np.nan
df

Unnamed: 0,A,B,C,D,F
2019-01-01,0.501437,2.95776,0.355783,0.573872,
2019-01-02,0.515532,-0.143581,0.255767,-0.022579,
2019-01-03,1.971331,-1.723431,-0.800485,1.781385,
2019-01-04,-1.597124,-1.216917,0.303841,-0.089252,
2019-01-05,-1.11815,1.898717,-0.32258,1.893057,
2019-01-06,0.895427,0.032905,1.181618,2.105195,


In [54]:
df.loc[dates[:], 'A']

2019-01-01    0.501437
2019-01-02    0.515532
2019-01-03    1.971331
2019-01-04   -1.597124
2019-01-05   -1.118150
2019-01-06    0.895427
Freq: D, Name: A, dtype: float64

In [80]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
   ....:                          'foo', 'bar', 'foo', 'foo'],
   ....:                    'B': ['one', 'one', 'two', 'three',
   ....:                          'two', 'two', 'one', 'three'],
   ....:                    'C': np.random.randn(8),
   ....:                    'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.964723,-1.323411
1,bar,one,1.365759,-0.186477
2,foo,two,-1.538509,-1.157301
3,bar,three,-0.573633,1.138667
4,foo,two,0.495198,0.403999
5,bar,two,1.840416,1.055534
6,foo,one,0.521211,1.801306
7,foo,three,-0.354146,0.84999


In [85]:
dfGroupedSUM = df.groupby(by=['A', 'B']).agg(
    sumC = pd.NamedAgg(column='C', aggfunc=np.sum),
    sumD = pd.NamedAgg(column='D', aggfunc=np.sum)
)
dfGroupedSUM


Unnamed: 0_level_0,Unnamed: 1_level_0,sumC,sumD
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.365759,-0.186477
bar,three,-0.573633,1.138667
bar,two,1.840416,1.055534
foo,one,-0.443512,0.477895
foo,three,-0.354146,0.84999
foo,two,-1.04331,-0.753302


In [87]:
dfGroupedTWICE = df.groupby(by=['A', 'B']).agg(
    twiceD = pd.NamedAgg(column='D', aggfunc=lambda d: len(d))
)
dfGroupedTWICE

Unnamed: 0_level_0,Unnamed: 1_level_0,twiceD
A,B,Unnamed: 2_level_1
bar,one,1.0
bar,three,1.0
bar,two,1.0
foo,one,2.0
foo,three,1.0
foo,two,2.0


In [89]:
dfGroupedSUM.merge(right=dfGroupedTWICE, on=['A', 'B'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sumC,sumD,twiceD
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1.365759,-0.186477,1.0
bar,three,-0.573633,1.138667,1.0
bar,two,1.840416,1.055534,1.0
foo,one,-0.443512,0.477895,2.0
foo,three,-0.354146,0.84999,1.0
foo,two,-1.04331,-0.753302,2.0


In [62]:
stacked = dfGrouped.stack()
stacked

A    B          
bar  one    sumC    0.172487
            sumD   -2.086903
     three  sumC   -0.038665
            sumD    0.840426
     two    sumC    0.188042
            sumD    1.075566
foo  one    sumC    2.640473
            sumD   -0.662213
     three  sumC    1.261198
            sumD   -2.467749
     two    sumC    0.309019
            sumD   -0.398328
dtype: float64

In [64]:
stacked.unstack(level=0)

Unnamed: 0_level_0,A,bar,foo
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,sumC,0.172487,2.640473
one,sumD,-2.086903,-0.662213
three,sumC,-0.038665,1.261198
three,sumD,0.840426,-2.467749
two,sumC,0.188042,0.309019
two,sumD,1.075566,-0.398328


In [69]:
df

Unnamed: 0,A,B,C,D
0,foo,one,0.28138,-0.532765
1,bar,one,0.172487,-2.086903
2,foo,two,-0.570639,0.282657
3,bar,three,-0.038665,0.840426
4,foo,two,0.879658,-0.680985
5,bar,two,0.188042,1.075566
6,foo,one,2.359093,-0.129448
7,foo,three,1.261198,-2.467749


In [71]:
df.pivot_table(values=['C', 'D'], index='A', columns='B', aggfunc=np.mean)

Unnamed: 0_level_0,C,C,C,D,D,D
B,one,three,two,one,three,two
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,0.172487,-0.038665,0.188042,-2.086903,0.840426,1.075566
foo,1.320237,1.261198,0.154509,-0.331106,-2.467749,-0.199164


In [72]:
df = pd.DataFrame({'AAA': [4, 5, 6, 7],
   ...:                    'BBB': [10, 20, 30, 40],
   ...:                    'CCC': [100, 50, -30, -50]})
df

Unnamed: 0,AAA,BBB,CCC
0,4,10,100
1,5,20,50
2,6,30,-30
3,7,40,-50


In [77]:
df.loc[df['AAA'] > 5, 'AAA'] = 10

In [78]:
df

Unnamed: 0,AAA,BBB,CCC
0,4,10,100
1,5,20,50
2,10,30,-30
3,10,40,-50
