In general, pandas has plenty going on for the split-apply-combine process of general data science.  While piping might be applicable, it may not be useful. I will bounce back and forth to demonstrate the examples, but likely won't demo all the ones in the tidyverse chapter.

### Preliminaries

In [1]:
import pandas as pd
import numpy as np

# note that doing much with R in anaconda notebooks will fail at some point
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()

In [2]:
## ----load_bball----------------------------------------------------------
# load('data/bball.RData')
# glimpse(bball[,1:5])

robjects.r['load']('../data/bball.RData')
bball = robjects.r.bball
# bball = pd.read_csv('../data/bball.csv')
bball.iloc[:,1:5].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 619 entries, 1 to 619
Data columns (total 4 columns):
Player    619 non-null object
Pos       619 non-null object
Age       619 non-null object
Tm        619 non-null object
dtypes: object(4)
memory usage: 24.2+ KB


In [3]:
## ----select1-------------------------------------------------------------
# bball %>% 
#   select(Player, Tm, Pos) %>% 
#   head

(bball
 .loc[:,['Player', 'Tm', 'Pos']]
 .head()
)

# or
(bball[['Player', 'Tm', 'Pos']]
 .head()
)


Unnamed: 0,Player,Tm,Pos
1,Alex Abrines,OKC,SG
2,Quincy Acy,TOT,PF
3,Quincy Acy,DAL,PF
4,Quincy Acy,BRK,PF
5,Steven Adams,OKC,C


In [4]:
## ----select2-------------------------------------------------------------
# bball %>%     
#   select(-Player, -Tm, -Pos)  %>% 
#   head

(bball
 .drop(columns=['Player', 'Tm', 'Pos'])
 .head()
)

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG.,X3P,X3PA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1,23,68,6,1055,134,341,0.393,94,247,...,0.898,18,68,86,40,37,8,33,114,406
2,2,26,38,1,558,70,170,0.412,37,90,...,0.75,20,95,115,18,14,15,21,67,222
3,2,26,6,0,48,5,17,0.294,1,7,...,0.667,2,6,8,0,0,0,2,9,13
4,2,26,32,1,510,65,153,0.425,36,83,...,0.754,18,89,107,18,14,15,19,58,209
5,3,23,80,80,2389,374,655,0.571,0,1,...,0.611,281,332,613,86,89,78,146,195,905


The following example uses tidyverse helper functions, which are available as basic string functions in Python (e.g. str.contains), but I haven't found how to implement them as cleanly in the pandaverse (e.g. using filter or query). 

In [5]:
## ----select3-------------------------------------------------------------
# bball %>% 
#   select(Player, contains("3P"), ends_with("RB")) %>% 
#   arrange(desc(TRB)) %>% 
#   head

(bball
 .filter(regex='3P|RB$', axis='columns')  # columns is the default
 .sort_values(by='TRB', ascending=False)
 .head()
)

# looks funny because we haven't filtered out the repeated headers yet

Unnamed: 0,X3P,X3PA,X3P.,ORB,DRB,TRB
584,3P,3PA,3P%,ORB,DRB,TRB
508,3P,3PA,3P%,ORB,DRB,TRB
354,3P,3PA,3P%,ORB,DRB,TRB
48,3P,3PA,3P%,ORB,DRB,TRB
77,3P,3PA,3P%,ORB,DRB,TRB


### Filtering Rows

In [6]:
## ----filter0-------------------------------------------------------------
# bball = bball %>% 
#   filter(Rk != "Rk")

bball = (bball
         .query('Rk != "Rk"')
         .apply(pd.to_numeric, errors='ignore')
        )

# redo previous
(bball
 .filter(regex='3P|RB$', axis='columns')  # columns is the default
 .sort_values(by='TRB', ascending=False)
 .head()
)

Unnamed: 0,X3P,X3PA,X3P.,ORB,DRB,TRB
143,2,7,0.286,345,770,1115
305,0,2,0.0,298,816,1114
585,0,0,,293,795,1088
197,0,1,0.0,314,721,1035
551,101,275,0.367,296,711,1007


In [7]:
## ----filter1-------------------------------------------------------------
# bball %>% 
#   filter(Age > 35, Pos == "SF" | Pos == "PF") %>% 
#   distinct(Player, Pos, Age)     

(bball
 .query('Age > 35 & (Pos == "SF"| Pos == "PF")')
 .drop_duplicates(subset = ['Player', 'Pos', 'Age'])
)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
34,29,Matt Barnes,SF,36,TOT,74,18,1777,185,473,...,0.787,70,314,384,195,48,26,107,185,527
92,73,Vince Carter,SF,40,MEM,73,15,1799,193,490,...,0.765,36,191,227,133,60,36,50,163,586
109,86,Nick Collison,PF,36,OKC,20,0,128,14,23,...,0.625,9,22,31,12,2,2,4,17,33
145,117,Mike Dunleavy,SF,36,TOT,53,2,841,93,220,...,0.8,18,97,115,50,15,7,28,67,275
283,222,Richard Jefferson,SF,36,CLE,79,13,1614,153,343,...,0.741,28,175,203,78,26,10,52,153,448
297,234,Dahntay Jones,SF,36,CLE,1,0,12,3,8,...,0.75,1,1,2,1,0,0,1,1,9
300,237,James Jones,SF,36,CLE,48,2,381,44,92,...,0.65,3,34,37,14,6,10,10,37,132
384,303,Mike Miller,SF,36,DEN,20,0,151,9,23,...,1.0,2,36,38,22,2,0,13,9,28
420,332,Dirk Nowitzki,PF,38,DAL,54,54,1424,296,678,...,0.875,23,330,353,82,30,38,51,113,769
451,356,Paul Pierce,SF,39,LAC,25,7,277,28,70,...,0.769,1,47,48,10,4,5,16,40,81


In [8]:
## ----filter2-------------------------------------------------------------
# bball %>% 
#   slice(1:10)


bball.iloc[:10]

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1,Alex Abrines,SG,23,OKC,68,6,1055,134,341,...,0.898,18,68,86,40,37,8,33,114,406
2,2,Quincy Acy,PF,26,TOT,38,1,558,70,170,...,0.75,20,95,115,18,14,15,21,67,222
3,2,Quincy Acy,PF,26,DAL,6,0,48,5,17,...,0.667,2,6,8,0,0,0,2,9,13
4,2,Quincy Acy,PF,26,BRK,32,1,510,65,153,...,0.754,18,89,107,18,14,15,19,58,209
5,3,Steven Adams,C,23,OKC,80,80,2389,374,655,...,0.611,281,332,613,86,89,78,146,195,905
6,4,Arron Afflalo,SG,31,SAC,61,45,1580,185,420,...,0.892,9,116,125,78,21,6,42,104,515
7,5,Alexis Ajinca,C,28,NOP,39,15,584,89,178,...,0.725,46,131,177,12,20,22,31,77,207
8,6,Cole Aldrich,C,28,MIN,62,0,531,45,86,...,0.682,51,107,158,25,25,23,17,85,105
9,7,LaMarcus Aldridge,PF,31,SAS,72,72,2335,500,1049,...,0.812,172,351,523,139,46,88,98,158,1243
10,8,Lavoy Allen,PF,27,IND,61,5,871,77,168,...,0.697,105,114,219,57,18,24,29,78,177


In [9]:
## ----uniteFilterArrange--------------------------------------------------
# bball %>% 
#   unite("posTeam", Pos, Tm) %>%         # create a new variable
#   filter(posTeam == "PF_SAS") %>%       # use it for filtering
#   select(Player, posTeam, Age) %>%      # use it for selection
#   arrange(desc(Age))                    # order 

(bball
 .assign(posTeam = bball.Pos + '_' + bball.Tm)
 .query('posTeam == "PF_SAS"')
 .loc[:,['Player', 'posTeam', 'Age']]
 .sort_values(by='Age', ascending=False)
)

Unnamed: 0,Player,posTeam,Age
329,David Lee,PF_SAS,33
9,LaMarcus Aldridge,PF_SAS,31
52,Davis Bertans,PF_SAS,24


### Generating New Data

In [10]:
## ----mutateAt------------------------------------------------------------
# bball = bball %>% 
#   mutate_at(vars(-Player, -Pos, -Tm), funs(as.numeric))   

# glimpse(bball[,1:7])

# we already did this in the first 'filtering rows' example


In [11]:
## ----mutate--------------------------------------------------------------
# bball = bball %>% 
#   mutate(trueShooting = PTS / (2 * (FGA + (.44 * FTA))),
#          effectiveFG = (FG + (.5 * X3P)) / FGA, 
#          shootingDif = trueShooting - FG.)

# summary(select(bball, shootingDif))  # select and others don't have to be piped to use

# slight issue due to there being a dot in the column name; but really this is not very satisfactory,
# but using newly created variables is on the way https://github.com/pandas-dev/pandas/issues/14207
# see also https://stackoverflow.com/questions/42496102/how-to-use-created-variable-in-same-assign-function-with-pandas
bball = (bball
         .assign(
             trueShooting = bball.PTS / (2 * (bball.FGA + (.44 * bball.FTA))),
             effectiveFG = (bball.FG + .5*bball.X3P) / bball.FGA)
        )
bball = bball.assign(shootingDif  = bball.trueShooting - bball.loc[:,'FG.'])

bball.shootingDif.describe()

count    593.000000
mean       0.085550
std        0.056424
min       -0.468085
25%        0.052019
50%        0.090717
75%        0.117596
max        0.397872
Name: shootingDif, dtype: float64

### Groupby

In [12]:
## ----groupby-------------------------------------------------------------
# bball %>%   
#   mutate(trueShooting = PTS / (2 * (FGA + (.44 * FTA))),
#          effectiveFG = (FG + (.5 * X3P)) / FGA, 
#          shootingDif = trueShooting - FG.) %>%  
#   select(Player, Tm, Pos, MP, trueShooting, effectiveFG, PTS) %>% 
#   group_by(Pos) %>%                                                 
#   summarize(meanTrueShooting = mean(trueShooting, na.rm = TRUE)) 

(bball
 .assign(
     trueShooting = bball.PTS / (2 * (bball.FGA + (.44 * bball.FTA))),
     effectiveFG = (bball.FG + .5*bball.X3P) / bball.FGA,
     shootingDif  = bball.trueShooting - bball.loc[:,'FG.'])
 .loc[:,['Player', 'Tm', 'Pos', 'MP', 'trueShooting', 'effectiveFG', 'PTS']]
 .groupby('Pos')
 .agg({'trueShooting': {'meanTrueShooting' :'mean'}})
)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,trueShooting
Unnamed: 0_level_1,meanTrueShooting
Pos,Unnamed: 1_level_2
C,0.564639
PF,0.516403
PF-C,0.509345
PG,0.510535
SF,0.529564
SG,0.515449


In [13]:
# I'm not currently aware of a do operation in pandas (nor have a good way to look it up). 
# However, I also find it somewhat awkward in the R implementation and rarely useful compared 
# to other approaches.

## ----do------------------------------------------------------------------
# bball %>% 
#   mutate(Pos = if_else(Pos=='PF-C', 'C', Pos)) %>% 
#   group_by(Pos) %>%     
#   do(FgFt_Corr=cor(.$FG., .$FT., use='complete')) %>% 
#   unnest(FgFt_Corr)

## ----do2-----------------------------------------------------------------
# library(nycflights13)
# carriers = group_by(flights, carrier)
# group_size(carriers)

# mods = do(carriers, model = lm(arr_delay ~ dep_time, data = .)) # reminder that data frames are lists
# mods %>% 
#   summarize(rsq = summary(model)$r.squared) %>% 

#   head

### Merge by id

In [14]:
## ----merge_demo
# band_members = data_frame(Name = c('Seth', 'Francis', 'Bubba'),
#                           Band = c('Com Truise', 'Pixies', 'The New Year'))
# band_instruments = data_frame(Name = c('Seth', 'Francis', 'Bubba'),
#                               Instrument = c('Synthesizer', 'Guitar', 'Guitar'))

# band_members
# band_instruments

# left_join(band_members, band_instruments)

band_members = pd.DataFrame({'Name' : ['Seth', 'Francis', 'Bubba'],
                             'Band' : ['Com Truise', 'Pixies', 'The New Year']
                            })
band_instruments = pd.DataFrame({'Name' : ['Seth', 'Francis', 'Bubba'],
                               'Instrument' : ['Synthesizer', 'Guitar', 'Guitar']
                            })

band_members
band_instruments


band_members.merge(band_instruments)

# alternative
# band_members = pd.DataFrame({'Band' : ['Com Truise', 'Pixies', 'The New Year']
#                             }, index = ['Seth', 'Francis', 'Bubba'])
# band_instruments = pd.DataFrame({'Instrument' : ['Synthesizer', 'Guitar', 'Guitar']}, 
#                                 index = ['Seth', 'Francis', 'Bubba'])
# band_members.join(band_instruments, how='left')


Unnamed: 0,Band,Name,Instrument
0,Com Truise,Seth,Synthesizer
1,Pixies,Francis,Guitar
2,The New Year,Bubba,Guitar


In [15]:
## ----gather_spread-------------------------------------------------------
# library(tidyr)
# stocks <- data.frame( time = as.Date('2009-01-01') + 0:9,
#                       X = rnorm(10, 0, 1),
#                       Y = rnorm(10, 0, 2),
#                       Z = rnorm(10, 0, 4) )
# stocks %>% head
# stocks %>% 
#   gather(stock, price, -time) %>% 
#   head

## ----tidyrSpread---------------------------------------------------------
# bball %>% 
#   separate(Player, into=c('firstName', 'lastName'), sep=' ') %>% 
#   select(1:5) %>% 
#   head

stocks = pd.DataFrame({'time' : pd.date_range('2009-01-01', periods=10),
                       'X' : np.random.randn(10),
                       'Y' : np.random.normal(0, 2, 10),
                       'Z' : np.random.normal(0, 4, 10)} )
stocks.head()

Unnamed: 0,X,Y,Z,time
0,-0.194377,-0.467984,-0.820087,2009-01-01
1,0.294429,-2.366306,0.411116,2009-01-02
2,-0.099478,2.23971,1.507877,2009-01-03
3,3.101763,2.160017,-3.038893,2009-01-04
4,-0.642435,-2.402669,3.677759,2009-01-05


In [16]:
stocks_melt = stocks.melt(id_vars='time') 
stocks_melt

Unnamed: 0,time,variable,value
0,2009-01-01,X,-0.194377
1,2009-01-02,X,0.294429
2,2009-01-03,X,-0.099478
3,2009-01-04,X,3.101763
4,2009-01-05,X,-0.642435
5,2009-01-06,X,1.092986
6,2009-01-07,X,-0.194165
7,2009-01-08,X,-1.456005
8,2009-01-09,X,-1.043897
9,2009-01-10,X,0.739556


In [17]:
stocks_melt.pivot(index='time', columns='variable')

Unnamed: 0_level_0,value,value,value
variable,X,Y,Z
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2009-01-01,-0.194377,-0.467984,-0.820087
2009-01-02,0.294429,-2.366306,0.411116
2009-01-03,-0.099478,2.23971,1.507877
2009-01-04,3.101763,2.160017,-3.038893
2009-01-05,-0.642435,-2.402669,3.677759
2009-01-06,1.092986,-2.06052,-2.210833
2009-01-07,-0.194165,4.606805,0.036478
2009-01-08,-1.456005,-2.00164,-4.680342
2009-01-09,-1.043897,1.173318,2.840138
2009-01-10,0.739556,-0.405245,2.088565
