In general, pandas has plenty going on for the split-apply-combine process of general data science.  While piping might be applicable, it may not be useful. I will bounce back and forth to demonstrate the examples, but likely won't demo all the ones in the tidyverse chapter.

### Preliminaries

In [1]:
import pandas as pd
import numpy as np

# note that doing much with R in anaconda notebooks will fail at some point
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()

In [2]:
## ----load_bball----------------------------------------------------------
# load('data/bball.RData')
# glimpse(bball[,1:5])

robjects.r['load']('../data/bball.RData')
bball = robjects.r.bball
# bball = pd.read_csv('../data/bball.csv')
bball.iloc[:,1:5].info()

<class 'pandas.core.frame.DataFrame'>
Index: 664 entries, 1 to 664
Data columns (total 4 columns):
Player    664 non-null object
Pos       664 non-null object
Age       664 non-null float64
Tm        664 non-null object
dtypes: float64(1), object(3)
memory usage: 25.9+ KB


In [3]:
## ----select1-------------------------------------------------------------
# bball %>% 
#   select(Player, Tm, Pos) %>% 
#   head

(bball
 .loc[:,['Player', 'Tm', 'Pos']]
 .head()
)

# or
(bball[['Player', 'Tm', 'Pos']]
 .head()
)


Unnamed: 0,Player,Tm,Pos
1,Alex Abrines,OKC,SG
2,Quincy Acy,BRK,PF
3,Steven Adams,OKC,C
4,Bam Adebayo,MIA,C
5,Arron Afflalo,ORL,SG


In [4]:
## ----select2-------------------------------------------------------------
# bball %>%     
#   select(-Player, -Tm, -Pos)  %>% 
#   head

(bball
 .drop(columns=['Player', 'Tm', 'Pos'])
 .head()
)

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG.,X3P,X3PA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1.0,24.0,75.0,8.0,1134.0,115.0,291.0,0.395,84.0,221.0,...,0.848,26.0,88.0,114.0,28.0,38.0,8.0,25.0,124.0,353.0
2,2.0,27.0,70.0,8.0,1359.0,130.0,365.0,0.356,102.0,292.0,...,0.817,40.0,216.0,256.0,57.0,33.0,29.0,60.0,149.0,411.0
3,3.0,24.0,76.0,76.0,2487.0,448.0,712.0,0.629,0.0,2.0,...,0.557,384.0,301.0,685.0,88.0,92.0,78.0,128.0,215.0,1056.0
4,4.0,20.0,69.0,19.0,1368.0,174.0,340.0,0.512,0.0,7.0,...,0.721,118.0,263.0,381.0,101.0,32.0,41.0,66.0,138.0,477.0
5,5.0,32.0,53.0,3.0,682.0,65.0,162.0,0.401,27.0,70.0,...,0.846,4.0,62.0,66.0,30.0,4.0,9.0,21.0,56.0,179.0


The following example uses tidyverse helper functions, which are available as basic string functions in Python (e.g. str.contains), but I haven't found how to implement them as cleanly in the pandaverse (e.g. using filter or query). 

In [5]:
## ----select3-------------------------------------------------------------
# bball %>% 
#   select(Player, contains("3P"), ends_with("RB")) %>% 
#   arrange(desc(TRB)) %>% 
#   head

(bball
 .filter(regex='3P|RB$', axis='columns')  # columns is the default
 .sort_values(by='TRB', ascending=False)
 .head()
)

# looks funny because we haven't filtered out the repeated headers yet

Unnamed: 0,X3P,X3PA,X3P.,ORB,DRB,TRB
169,0.0,11.0,0.0,399.0,848.0,1247.0
334,0.0,0.0,,329.0,842.0,1171.0
592,120.0,285.0,0.421,238.0,774.0,1012.0
281,1.0,7.0,0.143,255.0,757.0,1012.0
149,55.0,162.0,0.34,187.0,645.0,832.0


### Filtering Rows

In [6]:
## ----filter0-------------------------------------------------------------
# bball = bball %>% 
#   filter(Rk != "Rk")

bball = (bball
         .query('Rk != "Rk"')
         .apply(pd.to_numeric, errors='ignore')
        )

# redo previous
(bball
 .filter(regex='3P|RB$', axis='columns')  # columns is the default
 .sort_values(by='TRB', ascending=False)
 .head()
)

Unnamed: 0,X3P,X3PA,X3P.,ORB,DRB,TRB
169,0.0,11.0,0.0,399.0,848.0,1247.0
334,0.0,0.0,,329.0,842.0,1171.0
592,120.0,285.0,0.421,238.0,774.0,1012.0
281,1.0,7.0,0.143,255.0,757.0,1012.0
149,55.0,162.0,0.34,187.0,645.0,832.0


In [7]:
## ----filter1-------------------------------------------------------------
# bball %>% 
#   filter(Age > 35, Pos == "SF" | Pos == "PF") %>% 
#   distinct(Player, Pos, Age)     

(bball
 .query('Age > 35 & (Pos == "SF"| Pos == "PF")')
 .drop_duplicates(subset = ['Player', 'Pos', 'Age'])
)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
10,10.0,Tony Allen,SF,36.0,NOP,22.0,0.0,273.0,44.0,91.0,...,0.524,20.0,27.0,47.0,9.0,11.0,3.0,19.0,49.0,103.0
106,87.0,Vince Carter,SF,41.0,SAC,58.0,5.0,1026.0,114.0,283.0,...,0.757,18.0,130.0,148.0,69.0,42.0,26.0,36.0,96.0,313.0
128,105.0,Nick Collison,PF,37.0,OKC,15.0,0.0,75.0,13.0,19.0,...,0.385,7.0,13.0,20.0,4.0,0.0,0.0,7.0,7.0,31.0
309,251.0,Richard Jefferson,SF,37.0,DEN,20.0,0.0,163.0,12.0,27.0,...,0.571,2.0,15.0,17.0,15.0,2.0,1.0,5.0,11.0,30.0
318,258.0,Joe Johnson,SF,36.0,TOT,55.0,4.0,1205.0,146.0,360.0,...,0.889,14.0,158.0,172.0,83.0,18.0,7.0,49.0,67.0,372.0
518,421.0,Zach Randolph,PF,36.0,SAC,59.0,57.0,1508.0,361.0,763.0,...,0.785,97.0,300.0,397.0,127.0,42.0,10.0,116.0,119.0,857.0
634,516.0,Damien Wilkins,SF,38.0,IND,19.0,1.0,152.0,13.0,39.0,...,0.75,5.0,11.0,16.0,9.0,2.0,1.0,5.0,7.0,33.0


In [8]:
## ----filter2-------------------------------------------------------------
# bball %>% 
#   slice(1:10)


bball.iloc[:10]

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1.0,Alex Abrines,SG,24.0,OKC,75.0,8.0,1134.0,115.0,291.0,...,0.848,26.0,88.0,114.0,28.0,38.0,8.0,25.0,124.0,353.0
2,2.0,Quincy Acy,PF,27.0,BRK,70.0,8.0,1359.0,130.0,365.0,...,0.817,40.0,216.0,256.0,57.0,33.0,29.0,60.0,149.0,411.0
3,3.0,Steven Adams,C,24.0,OKC,76.0,76.0,2487.0,448.0,712.0,...,0.557,384.0,301.0,685.0,88.0,92.0,78.0,128.0,215.0,1056.0
4,4.0,Bam Adebayo,C,20.0,MIA,69.0,19.0,1368.0,174.0,340.0,...,0.721,118.0,263.0,381.0,101.0,32.0,41.0,66.0,138.0,477.0
5,5.0,Arron Afflalo,SG,32.0,ORL,53.0,3.0,682.0,65.0,162.0,...,0.846,4.0,62.0,66.0,30.0,4.0,9.0,21.0,56.0,179.0
6,6.0,Cole Aldrich,C,29.0,MIN,21.0,0.0,49.0,5.0,15.0,...,0.333,3.0,12.0,15.0,3.0,2.0,1.0,1.0,11.0,12.0
7,7.0,LaMarcus Aldridge,C,32.0,SAS,75.0,75.0,2509.0,687.0,1347.0,...,0.837,246.0,389.0,635.0,152.0,43.0,90.0,111.0,162.0,1735.0
8,8.0,Jarrett Allen,C,19.0,BRK,72.0,31.0,1441.0,234.0,397.0,...,0.776,144.0,244.0,388.0,49.0,28.0,88.0,82.0,147.0,587.0
9,9.0,Kadeem Allen,PG,25.0,BOS,18.0,1.0,107.0,6.0,22.0,...,0.778,4.0,7.0,11.0,12.0,3.0,2.0,9.0,15.0,19.0
10,10.0,Tony Allen,SF,36.0,NOP,22.0,0.0,273.0,44.0,91.0,...,0.524,20.0,27.0,47.0,9.0,11.0,3.0,19.0,49.0,103.0


In [9]:
## ----uniteFilterArrange--------------------------------------------------
# bball %>% 
#   unite("posTeam", Pos, Tm) %>%         # create a new variable
#   filter(posTeam == "PF_SAS") %>%       # use it for filtering
#   select(Player, posTeam, Age) %>%      # use it for selection
#   arrange(desc(Age))                    # order 

(bball
 .assign(posTeam = bball.Pos + '_' + bball.Tm)
 .query('posTeam == "PF_SAS"')
 .loc[:,['Player', 'posTeam', 'Age']]
 .sort_values(by='Age', ascending=False)
)

Unnamed: 0,Player,posTeam,Age
209,Rudy Gay,PF_SAS,31.0
50,Davis Bertans,PF_SAS,25.0


### Generating New Data

In [10]:
## ----mutateAt------------------------------------------------------------
# bball = bball %>% 
#   mutate_at(vars(-Player, -Pos, -Tm), funs(as.numeric))   

# glimpse(bball[,1:7])

# we already did this in the first 'filtering rows' example


In [10]:
## ----mutate--------------------------------------------------------------
# bball = bball %>% 
#   mutate(trueShooting = PTS / (2 * (FGA + (.44 * FTA))),
#          effectiveFG = (FG + (.5 * X3P)) / FGA, 
#          shootingDif = trueShooting - FG.)

# summary(select(bball, shootingDif))  # select and others don't have to be piped to use

# slight issue due to there being a dot in the column name; but really this is not very satisfactory,
# but using newly created variables is on the way https://github.com/pandas-dev/pandas/issues/14207
# see also https://stackoverflow.com/questions/42496102/how-to-use-created-variable-in-same-assign-function-with-pandas
bball = (bball
         .assign(
             trueShooting = bball.PTS / (2 * (bball.FGA + (.44 * bball.FTA))),
             effectiveFG = (bball.FG + .5*bball.X3P) / bball.FGA)
        )
bball = bball.assign(shootingDif  = bball.trueShooting - bball.loc[:,'FG.'])

bball.shootingDif.describe()

count    660.000000
mean       0.087329
std        0.059283
min       -0.056977
25%        0.049093
50%        0.088768
75%        0.117104
max        0.500000
Name: shootingDif, dtype: float64

### Groupby

In [11]:
## ----groupby-------------------------------------------------------------
# bball %>%   
#   mutate(trueShooting = PTS / (2 * (FGA + (.44 * FTA))),
#          effectiveFG = (FG + (.5 * X3P)) / FGA, 
#          shootingDif = trueShooting - FG.) %>%  
#   select(Player, Tm, Pos, MP, trueShooting, effectiveFG, PTS) %>% 
#   group_by(Pos) %>%                                                 
#   summarize(meanTrueShooting = mean(trueShooting, na.rm = TRUE)) 

(bball
 .assign(
     trueShooting = bball.PTS / (2 * (bball.FGA + (.44 * bball.FTA))),
     effectiveFG = (bball.FG + .5*bball.X3P) / bball.FGA,
     shootingDif  = bball.trueShooting - bball.loc[:,'FG.'])
 .loc[:,['Player', 'Tm', 'Pos', 'MP', 'trueShooting', 'effectiveFG', 'PTS']]
 .groupby('Pos')
 .agg({'trueShooting': {'meanTrueShooting' :'mean'}})
)

in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,trueShooting
Unnamed: 0_level_1,meanTrueShooting
Pos,Unnamed: 1_level_2
C,0.574687
PF,0.536896
PG,0.512306
PG-SG,0.539388
SF,0.507524
SF-SG,0.521916
SG,0.515585


In [13]:
# I'm not currently aware of a do operation in pandas (nor have a good way to look it up). 
# However, I also find it somewhat awkward in the R implementation and rarely useful compared 
# to other approaches.

## ----do------------------------------------------------------------------
# bball %>% 
#   mutate(Pos = if_else(Pos=='PF-C', 'C', Pos)) %>% 
#   group_by(Pos) %>%     
#   do(FgFt_Corr=cor(.$FG., .$FT., use='complete')) %>% 
#   unnest(FgFt_Corr)

## ----do2-----------------------------------------------------------------
# library(nycflights13)
# carriers = group_by(flights, carrier)
# group_size(carriers)

# mods = do(carriers, model = lm(arr_delay ~ dep_time, data = .)) # reminder that data frames are lists
# mods %>% 
#   summarize(rsq = summary(model)$r.squared) %>% 

#   head

### Merge by id

In [12]:
## ----merge_demo
# band_members = data_frame(Name = c('Seth', 'Francis', 'Bubba'),
#                           Band = c('Com Truise', 'Pixies', 'The New Year'))
# band_instruments = data_frame(Name = c('Seth', 'Francis', 'Bubba'),
#                               Instrument = c('Synthesizer', 'Guitar', 'Guitar'))

# band_members
# band_instruments

# left_join(band_members, band_instruments)

band_members = pd.DataFrame({'Name' : ['Seth', 'Francis', 'Bubba'],
                             'Band' : ['Com Truise', 'Pixies', 'The New Year']
                            })
band_instruments = pd.DataFrame({'Name' : ['Seth', 'Francis', 'Bubba'],
                               'Instrument' : ['Synthesizer', 'Guitar', 'Guitar']
                            })

band_members
band_instruments


band_members.merge(band_instruments)

# alternative
# band_members = pd.DataFrame({'Band' : ['Com Truise', 'Pixies', 'The New Year']
#                             }, index = ['Seth', 'Francis', 'Bubba'])
# band_instruments = pd.DataFrame({'Instrument' : ['Synthesizer', 'Guitar', 'Guitar']}, 
#                                 index = ['Seth', 'Francis', 'Bubba'])
# band_members.join(band_instruments, how='left')


Unnamed: 0,Name,Band,Instrument
0,Seth,Com Truise,Synthesizer
1,Francis,Pixies,Guitar
2,Bubba,The New Year,Guitar


In [13]:
## ----gather_spread-------------------------------------------------------
# library(tidyr)
# stocks <- data.frame( time = as.Date('2009-01-01') + 0:9,
#                       X = rnorm(10, 0, 1),
#                       Y = rnorm(10, 0, 2),
#                       Z = rnorm(10, 0, 4) )
# stocks %>% head
# stocks %>% 
#   gather(stock, price, -time) %>% 
#   head

## ----tidyrSpread---------------------------------------------------------
# bball %>% 
#   separate(Player, into=c('firstName', 'lastName'), sep=' ') %>% 
#   select(1:5) %>% 
#   head

stocks = pd.DataFrame({'time' : pd.date_range('2009-01-01', periods=10),
                       'X' : np.random.randn(10),
                       'Y' : np.random.normal(0, 2, 10),
                       'Z' : np.random.normal(0, 4, 10)} )
stocks.head()

Unnamed: 0,time,X,Y,Z
0,2009-01-01,-0.847213,2.47902,-3.975186
1,2009-01-02,-0.285002,1.07401,-4.609405
2,2009-01-03,-1.400255,0.598027,2.494775
3,2009-01-04,1.552341,2.181804,5.435343
4,2009-01-05,1.735045,-0.067005,-1.72825


In [14]:
stocks_melt = stocks.melt(id_vars='time') 
stocks_melt

Unnamed: 0,time,variable,value
0,2009-01-01,X,-0.847213
1,2009-01-02,X,-0.285002
2,2009-01-03,X,-1.400255
3,2009-01-04,X,1.552341
4,2009-01-05,X,1.735045
5,2009-01-06,X,0.197499
6,2009-01-07,X,0.96443
7,2009-01-08,X,0.396378
8,2009-01-09,X,-0.023351
9,2009-01-10,X,-0.434503


In [15]:
stocks_melt.pivot(index='time', columns='variable')

Unnamed: 0_level_0,value,value,value
variable,X,Y,Z
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2009-01-01,-0.847213,2.47902,-3.975186
2009-01-02,-0.285002,1.07401,-4.609405
2009-01-03,-1.400255,0.598027,2.494775
2009-01-04,1.552341,2.181804,5.435343
2009-01-05,1.735045,-0.067005,-1.72825
2009-01-06,0.197499,-1.749032,4.716749
2009-01-07,0.96443,0.39818,7.878169
2009-01-08,0.396378,-0.046286,-2.487165
2009-01-09,-0.023351,-0.255405,-0.605492
2009-01-10,-0.434503,-0.107056,-2.376451
