# Pandas

In [4]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None # display all columns in notebook

## Download Sample Data

In [5]:
import os
import urllib.request

In [6]:
# NOTE(review): confirm this URL still resolves; the download fails if the bucket is no longer public.
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2018-01.csv' # 70 MB
csvfile = '../example_files/test_data.csv'
if not os.path.exists(csvfile):
    print('downloading sample data')
    # urlretrieve does not create missing parent directories, so make sure
    # the target folder exists before downloading.
    os.makedirs(os.path.dirname(csvfile), exist_ok=True)
    # Download under a temporary name and rename afterwards: an interrupted
    # download then never leaves a truncated file at `csvfile` that the
    # existence check above would accept on the next run.
    partial_file = csvfile + '.part'
    urllib.request.urlretrieve(url, partial_file)
    os.replace(partial_file, csvfile)

## Series

In [7]:
# 1000 samples from the standard normal distribution, wrapped in a Series
s = pd.Series(np.random.standard_normal(1000))
s.head() # shows the first n (default: 5) elements of the Series

0    0.946169
1    1.158735
2    0.421197
3   -0.320976
4    0.124651
dtype: float64

In [8]:
type(s.index), type(s.values)

(pandas.core.indexes.range.RangeIndex, numpy.ndarray)

A Pandas Series is actually a numpy array with an index.

## DataFrames

In [9]:
# 10000 x 5 matrix of standard-normal samples, one named column per matrix column
df_rdn = pd.DataFrame(np.random.standard_normal((10000, 5)), columns=list('abcde'))
df_rdn.head() # shows the first n (default: 5) rows of the DataFrame

Unnamed: 0,a,b,c,d,e
0,0.593598,0.857968,0.079722,-0.565566,-0.384126
1,-1.208424,0.561354,-0.893922,-0.077008,-0.404015
2,-1.490283,1.006293,0.913702,-2.230726,-0.66394
3,1.478132,-0.472758,0.409065,-0.179187,1.031576
4,1.834077,0.873852,0.188734,-0.598767,0.72387


Construct a Pandas DataFrame from numpy random numbers

In [10]:
type(df_rdn.index), type(df_rdn.a), type(df_rdn.a.values), type(df_rdn['b'])

(pandas.core.indexes.range.RangeIndex,
 pandas.core.series.Series,
 numpy.ndarray,
 pandas.core.series.Series)

A DataFrame is an index combined with multiple columns. 

Each column can be extracted as a Pandas Series (the column values together with the index).

Access to columns (as Series) is either possible with the syntax df['col'] or df.col (the latter only if the column is a valid Python name, i.e. it does not contain invalid characters like spaces, +, -, etc.).

In [11]:
df_taxi = pd.read_csv(csvfile)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [12]:
len(df_taxi)

793529

Construct DataFrame from csv file

### Column Types

In [13]:
df_taxi.dtypes # show data types of columns

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                float64
dtype: object

Numerical data types (integers, floats) are usually detected automatically during csv import.
Date(-time) columns are not identified automatically but imported as *object* (analogous to strings).

In [14]:
# Convert both timestamp columns from strings (dtype object) to datetime64
for datetime_col in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
    df_taxi[datetime_col] = pd.to_datetime(df_taxi[datetime_col])

In [15]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Ex-post conversion to Pandas Datetime using *pd.to_datetime()*. An explicit datetime format can be passed if Pandas cannot identify it automatically.

In [16]:
# Parse the two timestamp columns while reading the file; these are the
# columns at positions 1 and 2 of the csv header.
df_taxi = pd.read_csv(
    csvfile,
    parse_dates=['lpep_pickup_datetime', 'lpep_dropoff_datetime'],
)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [17]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Datetime conversion during csv import by specifying datetime columns.

### Filtering

In [18]:
# Keep only the rows whose total amount is strictly positive and report
# how many rows were dropped.
n_before = len(df_taxi)
df_taxi_clean = df_taxi[df_taxi['total_amount'] > 0]
print(f'{n_before - len(df_taxi_clean)} lines with non-positive payments removed')

3646 lines with non-positive payments removed


Meaning of this syntax: take all elements of df where the filter condition inside the brackets is True. The filter condition is actually a Pandas Series:

In [19]:
filter_ = df_taxi.total_amount > 0
filter_.head()

0     True
1     True
2     True
3    False
4     True
Name: total_amount, dtype: bool

In [20]:
# Combine two element-wise conditions with & (logical AND of two boolean Series);
# each comparison is wrapped in its own parentheses.
df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)]
print(f'{len(df_taxi) - len(df_taxi_clean)} '
      f'lines with non-positive payments and passengers removed')

3817 lines with non-positive payments and passengers removed


Multiple conditions can be combined for filtering using the operators & (and), | (or), ~ (not). Each comparison must be wrapped in parentheses, because these operators bind more tightly than comparison operators such as > or ==.

The syntax above is the most common one.

In [21]:
%timeit df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)] 
# filtering of 800k lines

124 ms ± 226 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
# Same boolean mask as before, passed as the row indexer of .loc; ':' selects all columns
df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0),:]

In [23]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean2)
# raises AssertionError if DataFrames are different

In [24]:
%timeit df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0),:]

154 ms ± 108 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Alternative syntax using df.loc[row, col] gives identical results in comparable time (154 ms vs. 124 ms in the measurements above).

In [25]:
# Same filter expressed as a condition string that refers to the column names
df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

In [26]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean3)

In [27]:
%timeit df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

157 ms ± 450 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Yet another (rather exotic) filtering syntax, where the condition is given as a string of Python code.

In [28]:
del df_taxi_clean2, df_taxi_clean3

### Vectorized Calculations on Columns

In [29]:
# f = a + b * c * exp(d) / e, evaluated element-wise on whole columns
scaled_product = df_rdn['b'] * df_rdn['c'] * np.exp(df_rdn['d']) / df_rdn['e']
df_rdn['f'] = df_rdn['a'] + scaled_product
# type conversion to integer; the displayed rows show the fractional part
# being dropped (e.g. -1.639085 -> -1)
df_rdn['g'] = df_rdn['f'].astype(np.int64)
df_rdn.head()

Unnamed: 0,a,b,c,d,e,f,g
0,0.593598,0.857968,0.079722,-0.565566,-0.384126,0.492451,0
1,-1.208424,0.561354,-0.893922,-0.077008,-0.404015,-0.058434,0
2,-1.490283,1.006293,0.913702,-2.230726,-0.66394,-1.639085,-1
3,1.478132,-0.472758,0.409065,-0.179187,1.031576,1.321418,1
4,1.834077,0.873852,0.188734,-0.598767,0.72387,1.959272,1


Definition of a new column using vectorized calculations. Numpy functions can be used here (as described above, Pandas columns are actually numpy arrays).

The index makes sure that for operations with Series the rows are matched correctly. 

In [30]:
# Trip duration as a Timedelta column (dropoff minus pickup)
df_taxi['drive_time'] = df_taxi.lpep_dropoff_datetime - df_taxi.lpep_pickup_datetime
# Use total_seconds() for the full duration. `.dt.seconds` is only the seconds
# *component* of each Timedelta (0..86399): it silently drops whole days and
# does not represent the elapsed time for negative durations.
drive_hours = df_taxi.drive_time.dt.total_seconds() / 3600
# Distance per hour; presumably miles per hour -- TODO confirm the unit of trip_distance.
# Trips with a zero duration divide by zero and yield inf (NaN for 0 / 0).
df_taxi['avg_speed'] = df_taxi.trip_distance / drive_hours
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,00:05:49,7.22063
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,00:16:16,12.909836
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,00:12:20,10.410811
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,00:01:01,1.770492
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,00:01:01,1.770492


Operations on Pandas Datetime and Timedelta objects.

### Apply Syntax

In [31]:
df_sample = df_taxi.sample(n=1000)

In [32]:
# Row-wise apply (axis=1): the lambda is called once per row and receives
# that row as a Series.
df_sample['useless_string'] = df_sample.apply(
    lambda row: '_'.join([str(row['passenger_count']), str(row['payment_type'])]),
    axis=1,
)
df_sample.useless_string.head()

337365    1_2
785729    1_2
784485    1_1
77173     1_1
44510     1_2
Name: useless_string, dtype: object

In [33]:
%timeit df_sample['useless_string']=df_sample.apply(lambda x: str(x['passenger_count'])+'_'+str(x['payment_type']),axis=1)

115 ms ± 66.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Calculations which cannot be executed on whole columns at once can be applied element-wise (usually slower but more versatile than applying on whole column).

In [34]:
# Same string as above, built with whole-column (vectorized) operations
passenger_str = df_sample['passenger_count'].astype('str')
payment_str = df_sample['payment_type'].astype('str')
df_sample['useless_string2'] = passenger_str + '_' + payment_str
df_sample.useless_string2.head()

337365    1_2
785729    1_2
784485    1_1
77173     1_1
44510     1_2
Name: useless_string2, dtype: object

In [35]:
%timeit df_sample['useless_string2']=df_sample.passenger_count.astype('str')+'_'+df_sample.payment_type.astype('str')

6.76 ms ± 22 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
df_taxi.payment_type.unique()

array([2, 3, 1, 4, 5])

The same operation written in vectorized syntax is more than 10 times faster.
However, the apply syntax is more flexible.

In [37]:
# Lookup table: payment_type code -> label.
# NOTE(review): the labels look illustrative; confirm against the official
# data dictionary before using them for real analysis.
payments = {
    1: 'cash',
    2: 'credit card',
    3: 'debit card',
    4: 'gold',
    5: 'bill',
}
# Element-wise apply on a Series; codes missing from the table map to None
df_sample.payment_type.apply(lambda code: payments.get(code)).head()

337365    credit card
785729    credit card
784485           cash
77173            cash
44510     credit card
Name: payment_type, dtype: object

In [38]:
%timeit df_sample.payment_type.apply(lambda x: payments.get(x, None))

1.1 ms ± 7.41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


The apply function can be used both on Pandas Series and DataFrames.

### Mapping

In [39]:
df_sample.payment_type.map(payments).head()

337365    credit card
785729    credit card
784485           cash
77173            cash
44510     credit card
Name: payment_type, dtype: object

In [40]:
%timeit df_sample.payment_type.map(payments)

1.47 ms ± 2.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Alternatively in this example, the map function can be used to apply a dictionary to a Pandas Series.

### Iteration over Rows

#### iterrows()

In [77]:
for index, row in df_sample.head(2).iterrows():
    print(index, row)

337365 VendorID                                   2
lpep_pickup_datetime     2018-01-14 16:13:37
lpep_dropoff_datetime    2018-01-14 16:19:14
store_and_fwd_flag                         N
RatecodeID                                 1
PULocationID                               7
DOLocationID                               7
passenger_count                            1
trip_distance                            0.8
fare_amount                              5.5
extra                                      0
mta_tax                                  0.5
tip_amount                                 0
tolls_amount                               0
ehail_fee                                NaN
improvement_surcharge                    0.3
total_amount                             6.3
payment_type                               2
trip_type                                  1
drive_time                   0 days 00:05:37
avg_speed                            8.54599
useless_string                           1_2
use

In [78]:
type(row) # iterrows returns Pandas Series

pandas.core.series.Series

In [62]:
# Deliberate demonstration: the assignment below only changes the loop
# variable `row`; the df_sample.head() output that follows shows that no
# 'useless_string_3' column was added to the DataFrame.
for index, row in df_sample.iterrows():
    row['useless_string_3'] = (str(row['passenger_count']) + '_' 
                               + str(row['payment_type']))

In [63]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2
337365,2,2018-01-14 16:13:37,2018-01-14 16:19:14,N,1,7,7,1,0.8,5.5,0.0,0.5,0.0,0.0,,0.3,6.3,2,1.0,00:05:37,8.545994,1_2,1_2
785729,2,2018-01-31 18:20:10,2018-01-31 18:48:28,N,1,256,226,1,4.55,20.5,1.0,0.5,0.0,0.0,,0.3,22.3,2,1.0,00:28:18,9.646643,1_2,1_2
784485,2,2018-01-31 17:08:37,2018-01-31 17:15:24,N,1,41,166,1,1.3,7.0,1.0,0.5,1.76,0.0,,0.3,10.56,1,1.0,00:06:47,11.498771,1_1,1_1
77173,1,2018-01-04 15:11:28,2018-01-04 15:26:28,N,1,145,36,1,3.7,14.0,0.5,0.5,3.05,0.0,,0.3,18.35,1,1.0,00:15:00,14.8,1_1,1_1
44510,2,2018-01-02 21:32:46,2018-01-02 21:38:41,N,1,223,223,1,1.29,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,00:05:55,13.08169,1_2,1_2


Adding new "columns" to the row objects returned by *iterrows()* does not change the original DataFrame.

In [64]:
# Build one string per row via iterrows(), then attach the complete list
# to the DataFrame as a new column in a single assignment.
res = [
    '_'.join((str(row['passenger_count']), str(row['payment_type'])))
    for _idx, row in df_sample.iterrows()
]
df_sample['useless_string_3'] = res

In [76]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2,useless_string_3
337365,2,2018-01-14 16:13:37,2018-01-14 16:19:14,N,1,7,7,1,0.8,5.5,0.0,0.5,0.0,0.0,,0.3,6.3,2,1.0,00:05:37,8.545994,1_2,1_2,1_2
785729,2,2018-01-31 18:20:10,2018-01-31 18:48:28,N,1,256,226,1,4.55,20.5,1.0,0.5,0.0,0.0,,0.3,22.3,2,1.0,00:28:18,9.646643,1_2,1_2,1_2
784485,2,2018-01-31 17:08:37,2018-01-31 17:15:24,N,1,41,166,1,1.3,7.0,1.0,0.5,1.76,0.0,,0.3,10.56,1,1.0,00:06:47,11.498771,1_1,1_1,1_1
77173,1,2018-01-04 15:11:28,2018-01-04 15:26:28,N,1,145,36,1,3.7,14.0,0.5,0.5,3.05,0.0,,0.3,18.35,1,1.0,00:15:00,14.8,1_1,1_1,1_1
44510,2,2018-01-02 21:32:46,2018-01-02 21:38:41,N,1,223,223,1,1.29,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,00:05:55,13.08169,1_2,1_2,1_2


In [89]:
%%timeit
res = []
for index, row in df_sample.iterrows():
    res.append(str(row['passenger_count']) + '_' 
                               + str(row['payment_type']))
df_sample['useless_string_3'] = res

277 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


This version now adds the new column to the DataFrame, but takes about 2.5 times as long as the row-wise apply function and about 40 times as long as the vectorized version.

In [114]:
df_mixed_type = pd.DataFrame([{'int': 1, 'float': 2.5}, {'int': 3, 'float': 3.14}])
df_mixed_type

Unnamed: 0,float,int
0,2.5,1
1,3.14,3


In [115]:
df_mixed_type.dtypes

float    float64
int        int64
dtype: object

In [117]:
for idx, row in df_mixed_type.iterrows():
    print(row['int'], type(row['int']))
    print(row['float'], type(row['float']))

1.0 <class 'numpy.float64'>
2.5 <class 'numpy.float64'>
3.0 <class 'numpy.float64'>
3.14 <class 'numpy.float64'>


*iterrows()* does not preserve data types. In the example above, the values in the integer column are converted into floats.

#### itertuples()

In [79]:
for row in df_sample.head(2).itertuples():
    print(row)

Pandas(Index=337365, VendorID=2, lpep_pickup_datetime=Timestamp('2018-01-14 16:13:37'), lpep_dropoff_datetime=Timestamp('2018-01-14 16:19:14'), store_and_fwd_flag='N', RatecodeID=1, PULocationID=7, DOLocationID=7, passenger_count=1, trip_distance=0.8, fare_amount=5.5, extra=0.0, mta_tax=0.5, tip_amount=0.0, tolls_amount=0.0, ehail_fee=nan, improvement_surcharge=0.3, total_amount=6.3, payment_type=2, trip_type=1.0, drive_time=Timedelta('0 days 00:05:37'), avg_speed=8.5459940652819, useless_string='1_2', useless_string2='1_2', useless_string_3='1_2')
Pandas(Index=785729, VendorID=2, lpep_pickup_datetime=Timestamp('2018-01-31 18:20:10'), lpep_dropoff_datetime=Timestamp('2018-01-31 18:48:28'), store_and_fwd_flag='N', RatecodeID=1, PULocationID=256, DOLocationID=226, passenger_count=1, trip_distance=4.55, fare_amount=20.5, extra=1.0, mta_tax=0.5, tip_amount=0.0, tolls_amount=0.0, ehail_fee=nan, improvement_surcharge=0.3, total_amount=22.3, payment_type=2, trip_type=1.0, drive_time=Timedelta

In [85]:
type(row), row[5], row.passenger_count, row.trip_distance 
# data structure similar to NamedTuple

(pandas.core.frame.Pandas, 1, 1, 4.55)

In [92]:
# Same column as before, but iterating with itertuples(): fields are read
# as attributes of each row tuple.
res = [
    '_'.join((str(row.passenger_count), str(row.payment_type)))
    for row in df_sample.itertuples()
]
df_sample['useless_string_4'] = res

In [87]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2,useless_string_3,useless_string_4
337365,2,2018-01-14 16:13:37,2018-01-14 16:19:14,N,1,7,7,1,0.8,5.5,0.0,0.5,0.0,0.0,,0.3,6.3,2,1.0,00:05:37,8.545994,1_2,1_2,1_2,1_2
785729,2,2018-01-31 18:20:10,2018-01-31 18:48:28,N,1,256,226,1,4.55,20.5,1.0,0.5,0.0,0.0,,0.3,22.3,2,1.0,00:28:18,9.646643,1_2,1_2,1_2,1_2
784485,2,2018-01-31 17:08:37,2018-01-31 17:15:24,N,1,41,166,1,1.3,7.0,1.0,0.5,1.76,0.0,,0.3,10.56,1,1.0,00:06:47,11.498771,1_1,1_1,1_1,1_1
77173,1,2018-01-04 15:11:28,2018-01-04 15:26:28,N,1,145,36,1,3.7,14.0,0.5,0.5,3.05,0.0,,0.3,18.35,1,1.0,00:15:00,14.8,1_1,1_1,1_1,1_1
44510,2,2018-01-02 21:32:46,2018-01-02 21:38:41,N,1,223,223,1,1.29,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,00:05:55,13.08169,1_2,1_2,1_2,1_2


In [93]:
%%timeit
res = []
for row in df_sample.itertuples():
    res.append(str(row.passenger_count) + '_' 
                               + str(row.payment_type))
df_sample['useless_string_4'] = res

63.6 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Itertuples is about 4 times faster than iterrows, even though it provides practically the same functionality.
In this example, *itertuples* is even faster than *apply*, however this may not always be the case.

In [119]:
for row in df_mixed_type.itertuples():
    print(row.int, type(row.int))
    print(row.float, type(row.float))

1 <class 'int'>
2.5 <class 'float'>
3 <class 'int'>
3.14 <class 'float'>


In contrast to *iterrows()*, *itertuples()* preserves data types.

### Summary DataFrame Calculations

1. Use vectorized calculations on Numpy arrays wherever possible. This is by far the fastest way. Note that in addition to simple calculations, practically all Numpy functions work on Numpy arrays and therefore also on Pandas Series / DataFrames.
2. If it is not possible to vectorize a calculation, use either the *apply* syntax or iterate over the DataFrame using *itertuples()*. If required, parallelization could be used for speed-up, see [Multithreading and Multiprocessing](parallel_computing.ipynb).
3. Never use *iterrows()*. It is much slower than *itertuples()* and does not preserve data types.

## Indices and Multi-Indices

## Group-By and Pivot Tables

## Gotchas

### Setting With Copy Error

When a DataFrame is first sliced and the result is modified afterwards, it is not guaranteed whether the modification affects the original DataFrame or only a copy of it. Therefore, Pandas raises a SettingWithCopy warning.

https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-view-versus-copy

In [41]:
# Minimal reproduction of the SettingWithCopy warning
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
df2 = df[df.a == 0]  # row selection from df
df2['c'] = df2.a + df2.b  # modifying the selection emits the warning shown below

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


If required (e.g. for easier identification of the line throwing the warning), this can be changed into an exception with the following setting

It is recommended to use this setting for production code (put this line directly after Pandas import in each module).

In [42]:
pd.options.mode.chained_assignment = 'raise'
#raises pandas.core.common.SettingWithCopyError

In [43]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
# The exception class is exposed as pandas.errors.SettingWithCopyError in newer
# pandas releases and as pandas.core.common.SettingWithCopyError in older ones.
# Resolve it once so the except clause below works with either location.
try:
    SettingWithCopyError = pd.errors.SettingWithCopyError
except AttributeError:
    SettingWithCopyError = pd.core.common.SettingWithCopyError
try:
    df2 = df[df.a ==0 ]
    df2['c'] = df2.a + df2.b
except SettingWithCopyError as e:
    # With chained_assignment set to 'raise', the former warning arrives as an exception
    print(e)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


To get rid of this error, use one of the following methods:

In [44]:
# Remedy 1: add the column to the full DataFrame first, then select rows
df = pd.DataFrame({'a': [0, 1], 'b': [1, 2]})
df['c'] = df['a'] + df['b']
df2 = df[df['a'] == 0]
df2.head()

Unnamed: 0,a,b,c
0,0,1,1


Modification before slicing - may be slow for large data sets. Use this if the calculated column is required outside of the selected rows, too.

In [45]:
# Remedy 2: take an explicit copy of the selection, then modify the copy
df = pd.DataFrame({'a': [0, 1], 'b': [1, 2]})
df2 = df[df['a'] == 0].copy()
df2['c'] = df2['a'] + df2['b']
df2.head()

Unnamed: 0,a,b,c
0,0,1,1


Copy DataFrame - potentially large memory consumption, but it is ensured that the original DataFrame is not changed.

In [46]:
# Remedy 3: assign through .loc on the original DataFrame, restricted to the
# selected rows; the right-hand side is aligned on the index.
df = pd.DataFrame({'a': [0, 1], 'b': [1, 2]})
selected = df['a'] == 0
df.loc[selected, 'c'] = df['a'] + df['b']
df.head()

Unnamed: 0,a,b,c
0,0,1,1.0
1,1,2,


Modify explicitly only the selected rows of the original dataframe. The new column for rows which are not selected is filled with NaN.

## Performance

### Pandas vs. List of Dictionaries

In [47]:
df_taxi = pd.read_csv(csvfile, parse_dates=[1, 2]).head(1000)

#### Pandas

In [48]:
def process_pd(df):
    """Add an 'amt_per_mile' column (total_amount / trip_distance) to df.

    The column is written into the DataFrame that was passed in, and that same
    object is returned. The division is evaluated on whole columns and does not
    raise for a zero trip_distance: float division by zero gives inf / -inf,
    and NaN only for 0 / 0.
    """
    amount, distance = df['total_amount'], df['trip_distance']
    df['amt_per_mile'] = amount / distance
    return df

In [49]:
df_taxi2 = process_pd(df_taxi)
df_taxi2.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,amt_per_mile
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,10.428571
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,4.514286
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,5.280374
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,-143.333333
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,143.333333


In [50]:
%timeit process_pd(df_taxi)

8.4 ms ± 107 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### DictList

In [51]:
dict_list = df_taxi.to_dict('records')  # create list of dictionaries from Pandas DataFrame

In [52]:
dict_list[:2]

[{'VendorID': 2,
  'lpep_pickup_datetime': Timestamp('2018-01-01 00:18:50'),
  'lpep_dropoff_datetime': Timestamp('2018-01-01 00:24:39'),
  'store_and_fwd_flag': 'N',
  'RatecodeID': 1,
  'PULocationID': 236,
  'DOLocationID': 236,
  'passenger_count': 5,
  'trip_distance': 0.7,
  'fare_amount': 6.0,
  'extra': 0.5,
  'mta_tax': 0.5,
  'tip_amount': 0.0,
  'tolls_amount': 0.0,
  'ehail_fee': nan,
  'improvement_surcharge': 0.3,
  'total_amount': 7.3,
  'payment_type': 2,
  'trip_type': 1.0,
  'amt_per_mile': 10.428571428571429},
 {'VendorID': 2,
  'lpep_pickup_datetime': Timestamp('2018-01-01 00:30:26'),
  'lpep_dropoff_datetime': Timestamp('2018-01-01 00:46:42'),
  'store_and_fwd_flag': 'N',
  'RatecodeID': 1,
  'PULocationID': 43,
  'DOLocationID': 42,
  'passenger_count': 5,
  'trip_distance': 3.5,
  'fare_amount': 14.5,
  'extra': 0.5,
  'mta_tax': 0.5,
  'tip_amount': 0.0,
  'tolls_amount': 0.0,
  'ehail_fee': nan,
  'improvement_surcharge': 0.3,
  'total_amount': 15.8,
  'payment

In [53]:
def process_dl(dl):
    """Add an 'amt_per_mile' entry (total_amount / trip_distance) to every dict in dl.

    Each dictionary is updated in place; the return value is a new list that
    holds those same dictionary objects in their original order. When the
    division raises ZeroDivisionError, the entry is set to NaN instead.
    """
    nan = float('nan')
    processed = []
    for record in dl:
        try:
            ratio = record['total_amount'] / record['trip_distance']
        except ZeroDivisionError:
            ratio = nan
        record['amt_per_mile'] = ratio
        processed.append(record)
    return processed

In [54]:
dict_list = process_dl(dict_list)

In [55]:
%timeit process_dl(dict_list)

425 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Processing in Pandas is at least a factor 50 faster than for a list of dictionaries. Furthermore, code is simpler and memory requirements smaller.

In [56]:
del dict_list # free up memory