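In general, pandas offers plenty of functionality for the split-apply-combine workflow that is common in data science. While explicit piping is possible, it is often unnecessary because pandas methods can be chained directly. I will go back and forth between the original R code (shown in comments) and its pandas equivalent to demonstrate the examples, but likely won't reproduce every example from the tidyverse chapter.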

### Preliminaries

In [1]:
import pandas as pd
import numpy as np

In [43]:
## ----load_bball----------------------------------------------------------
# R original:
#   load('data/bball.RData')
#   glimpse(bball[,1:5])

# Read the basketball data from CSV, then print a dtype / non-null summary
# for a handful of leading columns (a rough stand-in for glimpse()).
# NOTE(review): iloc[:, 1:5] is 0-based and end-exclusive, so it covers four
# columns and skips the very first one, whereas R's bball[,1:5] covers the
# first five -- confirm whether `.iloc[:, :5]` was intended.
bball = pd.read_csv('../data/bball.csv')
bball.iloc[:, 1:5].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619 entries, 0 to 618
Data columns (total 4 columns):
Player    619 non-null object
Pos       619 non-null object
Age       619 non-null object
Tm        619 non-null object
dtypes: object(4)
memory usage: 19.4+ KB


In [32]:
## ----select1-------------------------------------------------------------
# R original:
#   bball %>%
#     select(Player, Tm, Pos) %>%
#     head

# Label-based selection: every row, three named columns, first five rows.
bball.loc[:, ['Player', 'Tm', 'Pos']].head(n=5)

# Equivalent shorthand: index the frame with a list of column names.
bball[['Player', 'Tm', 'Pos']].head(n=5)


Unnamed: 0,Player,Tm,Pos
0,Alex Abrines,OKC,SG
1,Quincy Acy,TOT,PF
2,Quincy Acy,DAL,PF
3,Quincy Acy,BRK,PF
4,Steven Adams,OKC,C


In [11]:
## ----select2-------------------------------------------------------------
# R original:
#   bball %>%
#     select(-Player, -Tm, -Pos) %>%
#     head

# Negative selection: drop the three named columns, keep everything else,
# and show the first five rows.
bball.drop(['Player', 'Tm', 'Pos'], axis='columns').head(n=5)

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG.,X3P,X3PA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,23,68,6,1055,134,341,0.393,94,247,...,0.898,18,68,86,40,37,8,33,114,406
1,2,26,38,1,558,70,170,0.412,37,90,...,0.75,20,95,115,18,14,15,21,67,222
2,2,26,6,0,48,5,17,0.294,1,7,...,0.667,2,6,8,0,0,0,2,9,13
3,2,26,32,1,510,65,153,0.425,36,83,...,0.754,18,89,107,18,14,15,19,58,209
4,3,23,80,80,2389,374,655,0.571,0,1,...,0.611,281,332,613,86,89,78,146,195,905


The R version of the following example uses tidyverse selection helpers (`contains`, `ends_with`). The pandas counterparts are not as extensive, but the functionality that is there (here, `filter` with a regular expression) will get you pretty far.

In [39]:
## ----select3-------------------------------------------------------------
# R original:
#   bball %>%
#     select(Player, contains("3P"), ends_with("RB")) %>%
#     arrange(desc(TRB)) %>%
#     head

# A single regex stands in for both helpers: '3P' anywhere in the column name
# (contains) or 'RB' at the end of it (ends_with).
# NOTE: `Player` is not matched by this regex, so unlike the R version it is
# absent from the result.
(
    bball
    .filter(regex='3P|RB$', axis='columns')   # 'columns' is already the default
    .sort_values('TRB', ascending=False)      # arrange(desc(TRB))
    .head(n=5)
)

# The result looks funny because the header rows repeated inside the data
# have not been filtered out yet.

Unnamed: 0,X3P,X3PA,X3P.,ORB,DRB,TRB
583,3P,3PA,3P%,ORB,DRB,TRB
507,3P,3PA,3P%,ORB,DRB,TRB
353,3P,3PA,3P%,ORB,DRB,TRB
47,3P,3PA,3P%,ORB,DRB,TRB
76,3P,3PA,3P%,ORB,DRB,TRB


### Filtering Rows

In [55]:
## ----filter0-------------------------------------------------------------
# R original:
#   bball = bball %>%
#     filter(Rk != "Rk")

def _to_numeric_if_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged on failure.

    Stands in for `pd.to_numeric(column, errors='ignore')`: the 'ignore'
    option is deprecated since pandas 2.2, and catching the exception
    explicitly gives the same "leave it alone if it cannot be parsed" result.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Not parseable as numbers (e.g. free-text columns): keep as-is.
        return column


# Remove the rows that merely repeat the header (their Rk value is the
# literal string "Rk"), then convert every column that can be parsed as
# numeric; columns that cannot be parsed are left untouched.
bball = (bball
         .query('Rk != "Rk"')
         .apply(_to_numeric_if_possible)
        )

# Redo the previous selection now that the repeated header rows are gone.
(bball
 .filter(regex='3P|RB$', axis='columns')  # columns is the default
 .sort_values(by='TRB', ascending=False)
 .head()
)

Unnamed: 0,X3P,X3PA,X3P.,ORB,DRB,TRB
142,2,7,0.286,345,770,1115
304,0,2,0.0,298,816,1114
584,0,0,,293,795,1088
196,0,1,0.0,314,721,1035
550,101,275,0.367,296,711,1007


In [60]:
## ----filter1-------------------------------------------------------------
# R original:
#   bball %>%
#     filter(Age > 35, Pos == "SF" | Pos == "PF") %>%
#     distinct(Player, Pos, Age)

# Keep rows with Age above 35 whose position is SF or PF, then retain only
# the first row for each Player/Pos/Age combination.
# Note that drop_duplicates() keeps every column of the surviving rows; it
# only uses `subset` to decide what counts as a duplicate.
(
    bball
    .query('Age > 35 & (Pos == "SF"| Pos == "PF")')
    .drop_duplicates(subset=['Player', 'Pos', 'Age'], keep='first')
)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
33,29,Matt Barnes,SF,36,TOT,74,18,1777,185,473,...,0.787,70,314,384,195,48,26,107,185,527
91,73,Vince Carter,SF,40,MEM,73,15,1799,193,490,...,0.765,36,191,227,133,60,36,50,163,586
108,86,Nick Collison,PF,36,OKC,20,0,128,14,23,...,0.625,9,22,31,12,2,2,4,17,33
144,117,Mike Dunleavy,SF,36,TOT,53,2,841,93,220,...,0.8,18,97,115,50,15,7,28,67,275
282,222,Richard Jefferson,SF,36,CLE,79,13,1614,153,343,...,0.741,28,175,203,78,26,10,52,153,448
296,234,Dahntay Jones,SF,36,CLE,1,0,12,3,8,...,0.75,1,1,2,1,0,0,1,1,9
299,237,James Jones,SF,36,CLE,48,2,381,44,92,...,0.65,3,34,37,14,6,10,10,37,132
383,303,Mike Miller,SF,36,DEN,20,0,151,9,23,...,1.0,2,36,38,22,2,0,13,9,28
419,332,Dirk Nowitzki,PF,38,DAL,54,54,1424,296,678,...,0.875,23,330,353,82,30,38,51,113,769
450,356,Paul Pierce,SF,39,LAC,25,7,277,28,70,...,0.769,1,47,48,10,4,5,16,40,81


In [64]:
## ----filter2-------------------------------------------------------------
# R original:
#   bball %>%
#     slice(1:10)

# First ten rows by position -- the counterpart of slice(1:10).
bball.head(10)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Alex Abrines,SG,23,OKC,68,6,1055,134,341,...,0.898,18,68,86,40,37,8,33,114,406
1,2,Quincy Acy,PF,26,TOT,38,1,558,70,170,...,0.75,20,95,115,18,14,15,21,67,222
2,2,Quincy Acy,PF,26,DAL,6,0,48,5,17,...,0.667,2,6,8,0,0,0,2,9,13
3,2,Quincy Acy,PF,26,BRK,32,1,510,65,153,...,0.754,18,89,107,18,14,15,19,58,209
4,3,Steven Adams,C,23,OKC,80,80,2389,374,655,...,0.611,281,332,613,86,89,78,146,195,905
5,4,Arron Afflalo,SG,31,SAC,61,45,1580,185,420,...,0.892,9,116,125,78,21,6,42,104,515
6,5,Alexis Ajinca,C,28,NOP,39,15,584,89,178,...,0.725,46,131,177,12,20,22,31,77,207
7,6,Cole Aldrich,C,28,MIN,62,0,531,45,86,...,0.682,51,107,158,25,25,23,17,85,105
8,7,LaMarcus Aldridge,PF,31,SAS,72,72,2335,500,1049,...,0.812,172,351,523,139,46,88,98,158,1243
9,8,Lavoy Allen,PF,27,IND,61,5,871,77,168,...,0.697,105,114,219,57,18,24,29,78,177


In [72]:
## ----uniteFilterArrange--------------------------------------------------
# R original:
#   bball %>%
#     unite("posTeam", Pos, Tm) %>%         # create a new variable
#     filter(posTeam == "PF_SAS") %>%       # use it for filtering
#     select(Player, posTeam, Age) %>%      # use it for selection
#     arrange(desc(Age))                    # order

# Same four steps as one method chain. The lambda receives the frame flowing
# through the chain, so the new column is built from it directly.
(
    bball
    .assign(posTeam=lambda df: df['Pos'] + '_' + df['Tm'])   # create a new variable
    .query('posTeam == "PF_SAS"')                            # use it for filtering
    .loc[:, ['Player', 'posTeam', 'Age']]                    # use it for selection
    .sort_values('Age', ascending=False)                     # order
)

Unnamed: 0,Player,posTeam,Age
328,David Lee,PF_SAS,33
8,LaMarcus Aldridge,PF_SAS,31
51,Davis Bertans,PF_SAS,24


### Generating New Data