## Swifter – A package which efficiently applies any function to a pandas dataframe or series in the fastest available manner.

### Imports and Data 

In [2]:
import pandas as pd
import numpy as np
import swifter

In [3]:
trips = pd.read_csv('trip.csv')

In [4]:
print(trips.shape)
trips.head()

(669959, 11)


Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127
1,4607,70,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138
2,4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214
3,4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060
4,4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103


### Apply any function in the fastest available manner¶

In [5]:
def bikes_start_end(end_id):
    if end_id > 50:
        return True
    else:
        return False

In [6]:
%time trips['bike_station_class'] = trips['end_station_id'].swifter.apply(bikes_start_end)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=669959.0, style=ProgressStyle(descript…


Wall time: 1.8 s


#### write code in a vectorized format

In [7]:
def bikes_start_end_vectorized(end_id):
    return np.where(end_id > 5, True, False)

In [8]:
%time trips['bike_station_class_vec'] = trips['end_station_id'].swifter.apply(bikes_start_end_vectorized)

Wall time: 15.6 ms


In [9]:
trips.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code,bike_station_class,bike_station_class_vec
0,4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127,True,True
1,4607,70,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138,False,True
2,4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214,False,True
3,4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060,False,True
4,4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103,True,True


#### When you can't write code in a vectorized format, swifter still makes parallel processing easy

In [10]:
%time trips['date'] = trips['start_date'].swifter.apply(pd.to_datetime)

Wall time: 45.8 s


In [11]:
trips.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code,bike_station_class,bike_station_class_vec,date
0,4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127,True,True,2013-08-29 14:13:00
1,4607,70,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138,False,True,2013-08-29 14:42:00
2,4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214,False,True,2013-08-29 10:16:00
3,4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060,False,True,2013-08-29 11:29:00
4,4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103,True,True,2013-08-29 12:02:00


### Multiple columns apply example 

In [12]:
#def bikes_start_end(start_id,end_id):
#     return end_id-start_id
# %time trips["bikes_start_end_diff"] = trips[['start_station_id', 'end_station_id']].swifter.apply(lambda row: bikes_start_end(row["start_station_id"], row["end_station_id"]))

### Rolling objects apply example 

In [13]:
%time trips["rolling_sum_duration"] = trips['duration'].swifter.rolling(10).apply(sum)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Pandas Apply', max=1.0, style=ProgressS…


Wall time: 2min 32s


In [16]:
trips.iloc[10:15,:]

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code,bike_station_class,bike_station_class_vec,date,rolling_sum_duration
10,4549,125,8/29/2013 13:52,Spear at Folsom,49,8/29/2013 13:55,Embarcadero at Bryant,54,368,Subscriber,94109,True,True,2013-08-29 13:52:00,976.0
11,4498,126,8/29/2013 13:23,San Pedro Square,6,8/29/2013 13:25,Santa Clara at Almaden,4,26,Subscriber,95112,False,False,2013-08-29 13:23:00,1032.0
12,4965,129,8/29/2013 19:32,Mountain View Caltrain Station,28,8/29/2013 19:35,Mountain View Caltrain Station,28,140,Subscriber,94041,False,True,2013-08-29 19:32:00,1090.0
13,4557,130,8/29/2013 13:57,2nd at South Park,64,8/29/2013 13:59,2nd at South Park,64,371,Subscriber,94122,True,True,2013-08-29 13:57:00,1143.0
14,4386,134,8/29/2013 12:31,Clay at Battery,41,8/29/2013 12:33,Beale at Market,56,503,Subscriber,94109,True,True,2013-08-29 12:31:00,1194.0
