# Setup

In [1]:
from piper.factory import *
from piper.verbs import *
from piper.defaults import *
from pathlib import Path

piper v0.1.0: Monday, 29 March 2021 19:08:16


# Importing data into a dataframe

## read_csv()

Here's an example where the result of a pipeline is assigned to a variable using `<-` (as in R).

In [2]:
%%piper
# Read the bestsellers CSV, pipe it through clean_columns(), and bind the
# result to `df` with piper's R-style `<-` assignment. The resulting column
# names (see df.info() in the next cell) are lower-case snake_case.
df <- pd.read_csv('inputs/2018 Bestsellers.csv') 
>> clean_columns()

## info()

In [3]:
# Plain pandas summary: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          1300 non-null   object 
 1   week           1300 non-null   object 
 2   list           1300 non-null   object 
 3   rank           1300 non-null   int64  
 4   author         1300 non-null   object 
 5   price          1300 non-null   float64
 6   isbn           1300 non-null   int64  
 7   previous_rank  786 non-null    float64
 8   weeks_on_list  910 non-null    float64
dtypes: float64(3), int64(2), object(4)
memory usage: 91.5+ KB


In [4]:
# piper's info() verb piped into pandas' transpose: the output has one column
# per source column, with rows for type, n, isna, isnull and unique.
%piper df >> info() >> pd.DataFrame.transpose()

Dataframe consumes 0.41 Mb


Unnamed: 0,0,1,2,3,4,5,6,7,8
columns,title,week,list,rank,author,price,isbn,previous_rank,weeks_on_list
type,object,object,object,int64,object,float64,int64,float64,float64
n,1300,1300,1300,1300,1300,1300,1300,1300,1300
isna,0,0,0,0,0,0,0,514,390
isnull,0,0,0,0,0,0,0,514,390
unique,220,13,7,15,195,46,229,15,95


## select()

### ALL columns

select() lets you choose columns in several ways; with no arguments it returns ALL columns.

In [5]:
# Function-call form of select(), currently disabled.
# NOTE(review): if enabled, this would rebind `df` to just two columns, so the
# later cells that expect all nine columns would no longer work. Confirm the
# intent, or delete this cell.
# df = select(df, ['week', 'title'])
# df

In [6]:
%%piper
# select() with no arguments keeps every column (the output reports 9 columns).
df >> select() >> head()

1300 rows, 9 columns


Unnamed: 0,title,week,list,rank,author,price,isbn,previous_rank,weeks_on_list
0,A Wrinkle in Time,2/14/2018,Early & Middle,1,Madeleine L'Engle,8.99,9781250153272,,
1,Wonder,2/14/2018,Early & Middle,2,R.J. Palacio,16.99,9781524720193,,
2,Auggie & Me,2/14/2018,Early & Middle,3,R.J. Palacio,16.99,9781101934852,,
3,The Girl Who Drank the Moon,2/14/2018,Early & Middle,4,Kelly Barnhill,16.95,9781616205676,,


### specific column(s)

Specify a column name to select just that column.

In [7]:
%%piper
# A single column name keeps just that column (the output reports 1 column).
df >> select('title') >> head()

1300 rows, 1 columns


Unnamed: 0,title
0,A Wrinkle in Time
1,Wonder
2,Auggie & Me
3,The Girl Who Drank the Moon


Passing a list retrieves all of the listed columns in one go.

In [8]:
%%piper
# Function-call form: the dataframe is passed to select() explicitly, and the
# trailing `>>` continues the pipeline onto the next line.
select(df, ['title', 'author', 'price']) >>
head()

1300 rows, 3 columns


Unnamed: 0,title,author,price
0,A Wrinkle in Time,Madeleine L'Engle,8.99
1,Wonder,R.J. Palacio,16.99
2,Auggie & Me,R.J. Palacio,16.99
3,The Girl Who Drank the Moon,Kelly Barnhill,16.95


### excluding column(s)

How about all fields EXCEPT certain columns?<br> No problem: prefix each column name you want to drop with a minus sign.

In [9]:
%%piper
# A '-' prefix excludes a column. Exclusions can be given as a list or one at a
# time, and successive select() calls compose (9 columns -> 6 in the output).
df
>> select(['-isbn', '-previous_rank'])
>> select('-weeks_on_list')
>> head()

1300 rows, 6 columns


Unnamed: 0,title,week,list,rank,author,price
0,A Wrinkle in Time,2/14/2018,Early & Middle,1,Madeleine L'Engle,8.99
1,Wonder,2/14/2018,Early & Middle,2,R.J. Palacio,16.99
2,Auggie & Me,2/14/2018,Early & Middle,3,R.J. Palacio,16.99
3,The Girl Who Drank the Moon,2/14/2018,Early & Middle,4,Kelly Barnhill,16.95


### selecting from one column to another column (column range/slice)

If you want to select a subset (range) of columns - pass a slice object with the from and to columns.

In [10]:
# NOTE(review): the recorded run of this cell fails with
# "'NoneType' object has no attribute 'pipe'", i.e. select() appears to return
# None when handed a slice, leaving the following `>> head()` nothing to pipe.
# TODO confirm slice support in the installed piper version.
%piper df >> select(slice('title', 'isbn')) >> head()

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


### combining rows containing header information

In [11]:
# Read the workbook with header=None so that the header rows arrive as data
# rows; the preview shows the column titles split across rows 0 and 1.
xl_file = Path('inputs/Data/sales_2017.xlsx')
temp_df = pd.read_excel(xl_file, header=None)
head(temp_df)

2155 rows, 12 columns


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Order,Customer,Sales,Order,Ship,Order,SKU,Order,Unit Sell,Discount,Shipping,Ship Mode
1,ID,ID,Person,Date,Date,Priority,Code,Quantity,Price,Percent,Amount,Container
2,13729,C508,Mr Robert Carlton,2017-01-01 00:00:00,2017-01-03 00:00:00,Not Specified,SKU947,9,95.99,0.08,35,Express Air - Large Box
3,28774,C372,Miss Roseanna Marr,2017-01-01 00:00:00,2017-01-02 00:00:00,High,SKU937,32,5.98,0.1,4.69,Regular Air - Small Box


In [12]:
%%piper
# combine_header_rows() joins the two header rows into single column names
# (e.g. 'Order' + 'ID' -> 'Order Id') and removes them from the data
# (2155 -> 2153 rows in the recorded output).
pd.read_excel(xl_file, header=None)
>> combine_header_rows()
>> head()

2153 rows, 12 columns


Unnamed: 0,Order Id,Customer Id,Sales Person,Order Date,Ship Date,Order Priority,Sku Code,Order Quantity,Unit Sell Price,Discount Percent,Shipping Amount,Ship Mode Container
2,13729,C508,Mr Robert Carlton,2017-01-01,2017-01-03,Not Specified,SKU947,9,95.99,0.08,35.0,Express Air - Large Box
3,28774,C372,Miss Roseanna Marr,2017-01-01,2017-01-02,High,SKU937,32,5.98,0.1,4.69,Regular Air - Small Box
4,9285,C212,Mr Robert Carlton,2017-01-02,2017-01-04,Critical,SKU363,3,40.98,0.06,2.99,Regular Air - Small Box
5,37537,C015,Mr Robert Carlton,2017-01-02,2017-01-02,Low,SKU052,4,291.73,0.0,48.8,Delivery Truck - Jumbo Drum


## count() / counts()

In [13]:
# With no column argument, count() reports one row per dataframe column with
# its count (n), % and cumulative %.
%piper df >> count() >> head()

9 rows, 3 columns


Unnamed: 0,n,%,cum %
title,1300,12.04,12.04
week,1300,12.04,24.08
list,1300,12.04,36.12
rank,1300,12.04,48.17


In [14]:
# Frequency table for the 'list' column, passed to count() as a Series.
count(df['list'])

Unnamed: 0_level_0,n,%,cum %
list,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Early & Middle,195,15.0,15.0
Hardcover Fiction,195,15.0,30.0
Hardcover Nonfiction,195,15.0,45.0
Trade Paperback Fiction,195,15.0,60.0
Trade Paperback Nonfiction,195,15.0,75.0
Young Adult,195,15.0,90.0
Mass Market,130,10.0,100.0


In [15]:
# totals=True appends a 'Total' row to the frequency table.
%piper df >> count('list', totals=True)

Unnamed: 0,n,%,cum %
Early & Middle,195,15.0,15.0
Hardcover Fiction,195,15.0,30.0
Hardcover Nonfiction,195,15.0,45.0
Trade Paperback Fiction,195,15.0,60.0
Trade Paperback Nonfiction,195,15.0,75.0
Young Adult,195,15.0,90.0
Mass Market,130,10.0,100.0
Total,1300,100.0,


In [16]:
%%piper 
# Function-call form inside the magic. The output lists all nine columns with
# their counts, largest n first.
count(df)

Unnamed: 0,n,%,cum %
title,1300,12.04,12.04
week,1300,12.04,24.08
list,1300,12.04,36.12
rank,1300,12.04,48.17
author,1300,12.04,60.21
price,1300,12.04,72.25
isbn,1300,12.04,84.29
weeks_on_list,910,8.43,92.72
previous_rank,786,7.28,100.0


In [17]:
# reset_index=True returns 'list' as an ordinary column alongside n, % and
# cum %, with a 0..n-1 integer index.
%piper df >> count(['list'], percent=True, cum_percent=True, reset_index=True)

Unnamed: 0,list,n,%,cum %
0,Early & Middle,195,15.0,15.0
1,Hardcover Fiction,195,15.0,30.0
2,Hardcover Nonfiction,195,15.0,45.0
3,Trade Paperback Fiction,195,15.0,60.0
4,Trade Paperback Nonfiction,195,15.0,75.0
5,Young Adult,195,15.0,90.0
6,Mass Market,130,10.0,100.0


## assign()

In [18]:
%%piper
df 
>> select(slice('title', 'isbn'))
>> assign(week=lambda x: pd.to_datetime(x.week),
          price=lambda x: x.price.astype(str),
          concatenated_field = lambda x: x.title + ' ' + x.list, 
          price_premium = lambda x: (x.price.astype(float) * 4).round(2)) 
>> head(4)

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


## to_date()

Alternative to using assign for date conversion 

In [19]:
# NOTE(review): the recorded run fails with "name 'to_date' is not defined",
# so to_date is not provided by the star imports at the top of this notebook.
# TODO confirm the verb's name/module in the installed piper version.
%piper df >> to_date('week', format='%m/%d/%Y') >> head()

Use %piper/%%piper --info to see rendered pandas pipe statement


name 'to_date' is not defined


## relocate()

In [20]:
%%piper
df >>
select(slice('title', 'isbn')) >>
assign(week=lambda x: pd.to_datetime(x.week),
       price=lambda x: x.price.astype(str),
       concatenated_field = lambda x: x.title + ' ' + x.list, 
       price_premium = lambda x: (x.price.astype(float) * 4).round(2))
>> relocate('author', loc='first')
>> relocate('isbn', loc='after', ref_column='week')
>> relocate(['concatenated_field', 'price_premium'], loc='before', ref_column='isbn')
>> head(4)

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


## where()

### ==, between, isin, startswith, endswith, contains etc.

Note how you can mix and match where clauses with assignments and add additional where clauses when needed.

In [21]:
%%piper
df 
>> select(slice('title', 'isbn')) 
>> where(" author.str.strip() == 'R.J. Palacio' ")
>> assign(week=lambda x: pd.to_datetime(x.week),
          price=lambda x: x.price.astype(str))
>> where(" rank.between(2, 4) & price.astype('float').between(14, 17) ")
>> head(2)

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


In [22]:
%%piper
df >> select(slice('title', 'isbn'))
>> rename(columns={'list': 'list_'}) 
>> where(" list_.isin(['Early & Middle', 'Trade Paperback Nonfiction']) ") 
>> head(4)

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


In [23]:
%%piper
df >> select(slice('title', 'isbn'))
>> rename(columns={'list': 'list_'}) 
>> where(" list_.str.startswith('Early') | list_.str.endswith('ion') ") 
>> head(2)

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


In [24]:
%%piper
df >> select(slice('title', 'isbn'))
>> rename(columns={'list': 'list_'}) 
>> where(" list_.str.lower().str.contains('young') ")
>> head(4)

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


## distinct()

In [25]:
# distinct('isbn') keeps one row per ISBN: 1300 rows -> 229, matching the
# unique isbn count reported by info() earlier in the notebook.
%piper df >> distinct('isbn') >> head()

229 rows, 9 columns


Unnamed: 0,title,week,list,rank,author,price,isbn,previous_rank,weeks_on_list
0,A Wrinkle in Time,2/14/2018,Early & Middle,1,Madeleine L'Engle,8.99,9781250153272,,
1,Wonder,2/14/2018,Early & Middle,2,R.J. Palacio,16.99,9781524720193,,
2,Auggie & Me,2/14/2018,Early & Middle,3,R.J. Palacio,16.99,9781101934852,,
3,The Girl Who Drank the Moon,2/14/2018,Early & Middle,4,Kelly Barnhill,16.95,9781616205676,,


## group_by()

In [26]:
%%piper
# Mean price per (title, author) plus a derived column, ordered by author and
# then title, both descending. reset_index() is chained directly onto head()
# so the group keys come back as ordinary columns in the displayed result.
df 
>> group_by(['title', 'author'])
>> summarise(mean_price=('price', 'mean')) 
>> assign(mean_price_times_something=lambda x: x.mean_price * 3) 
>> order_by(['author', 'title'], ascending=[False, False])
>> head(5).reset_index()

225 rows, 2 columns


Unnamed: 0,title,author,mean_price,mean_price_times_something
0,Feel Free,Zadie Smith,28.0,84.0
1,Leonardo da Vinci,Walter Isaacson,35.0,105.0
2,The Sympathizer,Viet Thanh Nguyen,16.0,48.0
3,Roller Girl,Victoria Jamieson,12.99,38.97
4,All's Faire in Middle School,Victoria Jamieson,12.99,38.97


### compress rows to a list / explode list

In [27]:
%%piper
# Collapse every author in each 'list' group into a single Python list
# (one row per list, as shown in the output).
df >> group_by('list')
>> summarise({'author': lambda x: x.tolist()})
# Disabled follow-up step; presumably expands the author lists back into rows.
# >> explode('author')  
>> head(6)

7 rows, 1 columns


Unnamed: 0_level_0,author
list,Unnamed: 1_level_1
Early & Middle,"[Madeleine L'Engle, R.J. Palacio, R.J. Palacio, Kelly Barnhill, Raina Telgemeier, Kimberly Brubaker Bradley, Raina T..."
Hardcover Fiction,"[Kristin Hannah, Tayari Jones, A.J. Finn, Amor Towles, Chloe Benjamin, Celeste Ng, Jesmyn Ward, Jojo Moyes, Dan Brow..."
Hardcover Nonfiction,"[Michael Wolff, Neil deGrasse Tyson, Mark Manson, Walter Isaacson, Tiffany Haddish, Dave Eggers, David Grann, Pete S..."
Mass Market,"[Lee Child, Ernest Cline, John Grisham, James Patterson, Michael Crichton, Liane Moriarty, Danielle Steel, Brian Kil..."
Trade Paperback Fiction,"[Colson Whitehead, George Saunders, Min Jin Lee, Jessica Shattuck, Ruth Ware, Georgia Hunter, Fredrik Backman, Antho..."
Trade Paperback Nonfiction,"[Jen Sincero, Peter Frankopan, Doug Stanton, Jonah Berger, Atul Gawande, Douglas J. Preston, Harvard Business School..."


## summarise()

In [28]:
%%piper 
# Sum values_1 and values_2 over the whole sample dataset (dates excluded).
# NOTE(review): the recorded run fails with
# "name 'get_sample_data' is not defined", so this helper is not provided by
# the star imports at the top of the notebook. TODO confirm its name/module in
# the installed piper version.
get_sample_data() 
>> select('-dates')
>> summarise({'values_1': 'sum', 'values_2': 'sum'})

Use %piper/%%piper --info to see rendered pandas pipe statement


name 'get_sample_data' is not defined


In [29]:
%%piper 
# Grouped summary for the North and South regions:
#   - totval1 / totval2: sums of values_1 / values_2 per (regions, countries)
#   - pcent_val1: each group's totval1 as a % of the overall totval1 total
#   - grp_pcent:  each group's totval2 as a % of its region's totval2 total
# NOTE(review): the recorded run fails with
# "name 'get_sample_data' is not defined", so this helper is not provided by
# the star imports at the top of the notebook. TODO confirm its name/module in
# the installed piper version.
get_sample_data() 
>> select('-dates')
>> where("regions.isin(['North', 'South'])")
>> group_by(['regions', 'countries'])
>> summarise({'values_1': 'sum', 'values_2': 'sum'})
>> rename(columns={'values_1': 'totval1', 'values_2': 'totval2'})
>> assign(pcent_val1 = lambda x: (x.totval1 * 100/x.totval1.sum()).round(2),
           grp_pcent = lambda x: x.groupby(['regions'])['totval2'].transform(lambda x: (x*100/x.sum()).round(2))
         )
>> head(10)

Use %piper/%%piper --info to see rendered pandas pipe statement


name 'get_sample_data' is not defined


## pivot_table()

In [30]:
# Preview the source frame before pivoting; head() also prints the frame's
# overall shape (1300 rows, 9 columns).
head(df)

1300 rows, 9 columns


Unnamed: 0,title,week,list,rank,author,price,isbn,previous_rank,weeks_on_list
0,A Wrinkle in Time,2/14/2018,Early & Middle,1,Madeleine L'Engle,8.99,9781250153272,,
1,Wonder,2/14/2018,Early & Middle,2,R.J. Palacio,16.99,9781524720193,,
2,Auggie & Me,2/14/2018,Early & Middle,3,R.J. Palacio,16.99,9781101934852,,
3,The Girl Who Drank the Moon,2/14/2018,Early & Middle,4,Kelly Barnhill,16.95,9781616205676,,


In [31]:
# Arguments for the pivot table in the next cell: group rows by `list`,
# aggregate the `rank` and `price` columns.
index, values = ['list'], ['rank', 'price']

In [32]:
%%piper

df
>> pivot_table(index=index, values=values, margins=True, fill_value=0,
               aggfunc={'rank': 'count', 'price': np.mean})
>> rename(columns={'price': 'mean_price', 'rank': 'total_rows'})
>> assign(mean_price=lambda x: x.mean_price.round(2)).reset_index()
>> head(4)

Use %piper/%%piper --info to see rendered pandas pipe statement


name 'pivot_table' is not defined
