# Pandas

In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None # display all columns in notebook

## Download Sample Data

In [1]:
import os
import urllib.request

In [2]:
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2018-01.csv' # 70 MB
csvfile = '../example_files/test_data.csv'
if not os.path.exists(csvfile):
    print('downloading sample data')
    urllib.request.urlretrieve(url, csvfile)

downloading sample data


## Series

In [4]:
s = pd.Series(np.random.randn(1000))
s.head() # shows the first n (standard: 5) rows of a dataframe

0   -0.229725
1    0.248501
2    1.827603
3   -0.643470
4   -1.330106
dtype: float64

In [5]:
type(s.index), type(s.values)

(pandas.core.indexes.range.RangeIndex, numpy.ndarray)

A Pandas Series is actually a numpy array with an index.

## DataFrames

In [6]:
df_rdn = pd.DataFrame(np.random.randn(10000, 5), columns=['a', 'b', 'c', 'd', 'e'])
df_rdn.head() # shows the first n (standard: 5) rows of a dataframe

Unnamed: 0,a,b,c,d,e
0,0.168629,-1.396017,1.007475,-0.555405,0.504369
1,0.056246,-0.465622,0.480935,1.049594,-0.322133
2,0.398782,0.443204,0.221876,-0.101678,-0.113244
3,1.455685,0.080307,-0.223034,1.046918,-1.266838
4,-1.552577,0.181166,-2.392206,0.654899,0.503213


Construct a Pandas DataFrame from numpy random numbers

In [7]:
type(df_rdn.index), type(df_rdn.a), type(df_rdn.a.values), type(df_rdn['b'])

(pandas.core.indexes.range.RangeIndex,
 pandas.core.series.Series,
 numpy.ndarray,
 pandas.core.series.Series)

A DataFrame is an index combined with multiple columns. 

Each column can be extracted as a Pandas Series (the column values together with the index).

Access to columns (as Series) is either possible with the syntax df['col'] or df.col (the latter only if the column is a valid Python name, i.e. it does not contain invalid characters like spaces, +, -, etc.).

In [8]:
df_taxi = pd.read_csv(csvfile)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [9]:
len(df_taxi)

793529

Construct DataFrame from csv file

### Column Types

In [10]:
df_taxi.dtypes # show data types of columns

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                float64
dtype: object

Numerical data types (integers, floats) are usually detected automatically during csv import.
Date (-time) columns are not identified automatically but imported as *object* (analogue to strings).

In [11]:
df_taxi.lpep_pickup_datetime = pd.to_datetime(df_taxi.lpep_pickup_datetime)
df_taxi.lpep_dropoff_datetime = pd.to_datetime(df_taxi.lpep_dropoff_datetime)

In [12]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Ex-post conversion to Pandas Datetime using *pd.to_datetime()*. An explicit datetime format can be passed if Pandas cannot identify it automatically.

In [5]:
df_taxi = pd.read_csv(csvfile, parse_dates=[1, 2])
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [14]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Datetime conversion during csv import by specifying datetime columns.

### Filtering

In [15]:
df_taxi_clean = df_taxi[df_taxi.total_amount > 0]
print(f'{len(df_taxi) - len(df_taxi_clean)} lines with non-positive payments removed')

3646 lines with non-positive payments removed


Meaning of this syntax: take all elements of df where the filter condition inside the brackets is True. The filter condition is actually a Pandas Series:

In [16]:
filter_ = df_taxi.total_amount > 0
filter_.head()

0     True
1     True
2     True
3    False
4     True
Name: total_amount, dtype: bool

In [17]:
df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)]
print(f'{len(df_taxi) - len(df_taxi_clean)} '
      f'lines with non-positive payments and passengers removed')

3817 lines with non-positive payments and passengers removed


Multiple conditions can be used for filtering using the operators & (and), | (or), ~ (not). Brackets must be made around the comparisons.

The syntax above is the most common one.

In [18]:
%timeit df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)] 
# filtering of 800k lines

130 ms ± 356 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0),:]

In [20]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean2)
# raises AssertionError if DataFrames are different

In [21]:
%timeit df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0),:]

146 ms ± 159 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Alternative syntax using df.loc[row, col] gives identical results in same time.

In [22]:
df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

In [23]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean3)

In [24]:
%timeit df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

156 ms ± 294 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Yet another (rather exotic) filtering syntax, where the condition is given as a sting of Python code.

In [25]:
del df_taxi_clean2, df_taxi_clean3

### Vectorized Calculations on Columns

In [26]:
df_rdn['f'] = df_rdn['a'] + df_rdn.b * df_rdn.c * np.exp(df_rdn.d) / df_rdn.e
df_rdn['g'] = df_rdn.f.astype(np.int64) # type conversion
df_rdn.head()

Unnamed: 0,a,b,c,d,e,f,g
0,0.168629,-1.396017,1.007475,-0.555405,0.504369,-1.431547,-1
1,0.056246,-0.465622,0.480935,1.049594,-0.322133,2.041966,2
2,0.398782,0.443204,0.221876,-0.101678,-0.113244,-0.385627,0
3,1.455685,0.080307,-0.223034,1.046918,-1.266838,1.495963,1
4,-1.552577,0.181166,-2.392206,0.654899,0.503213,-3.210413,-3


Definition of a new column using vectorized calculations. Numpy functions can be used here (as described above, Pandas columns are actually numpy arrays).

The index makes sure that for operations with Series the rows are matched correctly. 

In [27]:
df_taxi['drive_time'] = df_taxi.lpep_dropoff_datetime - df_taxi.lpep_pickup_datetime
df_taxi['avg_speed'] = df_taxi.trip_distance / (df_taxi.drive_time.dt.seconds / 3600)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,00:05:49,7.22063
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,00:16:16,12.909836
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,00:12:20,10.410811
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,00:01:01,1.770492
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,00:01:01,1.770492


Operations on Pandas Datetime and Timedelta objects.

### Apply Syntax

In [23]:
df_sample = df_taxi.sample(n=1000)

In [29]:
df_sample['useless_string']=df_sample.apply(lambda x: str(x['passenger_count'])+'_'+str(x['payment_type']),axis=1)
df_sample.useless_string.head()

355692    1_2
430185    1_1
491288    1_1
434187    2_1
73867     1_2
Name: useless_string, dtype: object

In [30]:
%timeit df_sample['useless_string']=df_sample.apply(lambda x: str(x['passenger_count'])+'_'+str(x['payment_type']),axis=1)

115 ms ± 133 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Calculations which cannot be executed on whole columns at once can be applied element-wise (usually slower but more versatile than applying on whole column).

In [31]:
df_sample['useless_string2']=df_sample.passenger_count.astype('str')+'_'+df_sample.payment_type.astype('str')
df_sample.useless_string2.head()

355692    1_2
430185    1_1
491288    1_1
434187    2_1
73867     1_2
Name: useless_string2, dtype: object

In [32]:
%timeit df_sample['useless_string2']=df_sample.passenger_count.astype('str')+'_'+df_sample.payment_type.astype('str')

6.85 ms ± 44 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
df_taxi.payment_type.unique()

array([2, 3, 1, 4, 5])

The same as vectorized syntax is more than 10 times faster.
However, the apply syntax is more flexible.

In [24]:
payments = {1: 'cash',
    2: 'credit card',
    3: 'debit card',
    4: 'gold',
    5: 'bill'}

In [34]:
df_sample.payment_type.apply(lambda x: payments.get(x, None)).head()

355692    credit card
430185           cash
491288           cash
434187           cash
73867     credit card
Name: payment_type, dtype: object

In [35]:
%timeit df_sample.payment_type.apply(lambda x: payments.get(x, None))

1.09 ms ± 6.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


The apply function can be used both on Pandas Series and DataFrames.

### Mapping

In [36]:
df_sample.payment_type.map(payments).head()

355692    credit card
430185           cash
491288           cash
434187           cash
73867     credit card
Name: payment_type, dtype: object

In [37]:
%timeit df_sample.payment_type.map(payments)

1.49 ms ± 5.65 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Alternatively in this example, the map function can be used to apply a dictionary to a Pandas Series.

### Iteration over Rows

#### iterrows()

In [38]:
for index, row in df_sample.head(2).iterrows():
    print(index, row)

355692 VendorID                                   2
lpep_pickup_datetime     2018-01-15 13:13:33
lpep_dropoff_datetime    2018-01-15 13:33:28
store_and_fwd_flag                         N
RatecodeID                                 1
PULocationID                              95
DOLocationID                             198
passenger_count                            1
trip_distance                           5.57
fare_amount                               19
extra                                      0
mta_tax                                  0.5
tip_amount                                 0
tolls_amount                               0
ehail_fee                                NaN
improvement_surcharge                    0.3
total_amount                            19.8
payment_type                               2
trip_type                                  1
drive_time                   0 days 00:19:55
avg_speed                            16.7799
useless_string                           1_2
use

In [39]:
type(row) # iterrows returns Pandas Series

pandas.core.series.Series

In [40]:
for index, row in df_sample.iterrows():
    row['useless_string_3'] = (str(row['passenger_count']) + '_' 
                               + str(row['payment_type']))

In [41]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2
355692,2,2018-01-15 13:13:33,2018-01-15 13:33:28,N,1,95,198,1,5.57,19.0,0.0,0.5,0.0,0.0,,0.3,19.8,2,1.0,00:19:55,16.779916,1_2,1_2
430185,2,2018-01-18 15:04:51,2018-01-18 15:54:32,N,1,55,39,1,9.69,35.0,0.0,0.5,0.0,0.0,,0.3,35.8,1,1.0,00:49:41,11.702113,1_1,1_1
491288,1,2018-01-20 15:58:29,2018-01-20 16:24:46,N,1,25,188,1,4.1,19.5,0.0,0.5,3.05,0.0,,0.3,23.35,1,1.0,00:26:17,9.359543,1_1,1_1
434187,2,2018-01-18 17:05:23,2018-01-18 17:09:37,N,1,75,236,2,0.43,5.0,1.0,0.5,1.36,0.0,,0.3,8.16,1,1.0,00:04:14,6.094488,2_1,2_1
73867,1,2018-01-04 01:26:15,2018-01-04 01:36:28,N,1,146,162,1,3.5,13.0,0.5,0.5,0.0,0.0,,0.3,14.3,2,1.0,00:10:13,20.554649,1_2,1_2


Adding new "colums" to the row dictionaries does not change the original DataFrame.

In [42]:
res = []
for index, row in df_sample.iterrows():
    res.append(str(row['passenger_count']) + '_' 
                               + str(row['payment_type']))
df_sample['useless_string_3'] = res

In [43]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2,useless_string_3
355692,2,2018-01-15 13:13:33,2018-01-15 13:33:28,N,1,95,198,1,5.57,19.0,0.0,0.5,0.0,0.0,,0.3,19.8,2,1.0,00:19:55,16.779916,1_2,1_2,1_2
430185,2,2018-01-18 15:04:51,2018-01-18 15:54:32,N,1,55,39,1,9.69,35.0,0.0,0.5,0.0,0.0,,0.3,35.8,1,1.0,00:49:41,11.702113,1_1,1_1,1_1
491288,1,2018-01-20 15:58:29,2018-01-20 16:24:46,N,1,25,188,1,4.1,19.5,0.0,0.5,3.05,0.0,,0.3,23.35,1,1.0,00:26:17,9.359543,1_1,1_1,1_1
434187,2,2018-01-18 17:05:23,2018-01-18 17:09:37,N,1,75,236,2,0.43,5.0,1.0,0.5,1.36,0.0,,0.3,8.16,1,1.0,00:04:14,6.094488,2_1,2_1,2_1
73867,1,2018-01-04 01:26:15,2018-01-04 01:36:28,N,1,146,162,1,3.5,13.0,0.5,0.5,0.0,0.0,,0.3,14.3,2,1.0,00:10:13,20.554649,1_2,1_2,1_2


In [44]:
%%timeit
res = []
for index, row in df_sample.iterrows():
    res.append(str(row['passenger_count']) + '_' 
                               + str(row['payment_type']))
df_sample['useless_string_3'] = res

279 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


This version now adds the new column to the DataFrame, but takes about 2.5 times the time as using the row-wise apply function and about 30 times the time as the vectorized version.

In [45]:
df_mixed_type = pd.DataFrame([{'int': 1, 'float': 2.5}, {'int': 3, 'float': 3.14}])
df_mixed_type

Unnamed: 0,float,int
0,2.5,1
1,3.14,3


In [46]:
df_mixed_type.dtypes

float    float64
int        int64
dtype: object

In [47]:
for idx, row in df_mixed_type.iterrows():
    print(row['int'], type(row['int']))
    print(row['float'], type(row['float']))

1.0 <class 'numpy.float64'>
2.5 <class 'numpy.float64'>
3.0 <class 'numpy.float64'>
3.14 <class 'numpy.float64'>


*iterrows()* does not preserve data types. In the example above, the values in the integer column are converted into floats.

#### itertuples()

In [48]:
for row in df_sample.head(2).itertuples():
    print(row)

Pandas(Index=355692, VendorID=2, lpep_pickup_datetime=Timestamp('2018-01-15 13:13:33'), lpep_dropoff_datetime=Timestamp('2018-01-15 13:33:28'), store_and_fwd_flag='N', RatecodeID=1, PULocationID=95, DOLocationID=198, passenger_count=1, trip_distance=5.57, fare_amount=19.0, extra=0.0, mta_tax=0.5, tip_amount=0.0, tolls_amount=0.0, ehail_fee=nan, improvement_surcharge=0.3, total_amount=19.8, payment_type=2, trip_type=1.0, drive_time=Timedelta('0 days 00:19:55'), avg_speed=16.779916317991635, useless_string='1_2', useless_string2='1_2', useless_string_3='1_2')
Pandas(Index=430185, VendorID=2, lpep_pickup_datetime=Timestamp('2018-01-18 15:04:51'), lpep_dropoff_datetime=Timestamp('2018-01-18 15:54:32'), store_and_fwd_flag='N', RatecodeID=1, PULocationID=55, DOLocationID=39, passenger_count=1, trip_distance=9.69, fare_amount=35.0, extra=0.0, mta_tax=0.5, tip_amount=0.0, tolls_amount=0.0, ehail_fee=nan, improvement_surcharge=0.3, total_amount=35.8, payment_type=1, trip_type=1.0, drive_time=Ti

In [49]:
type(row), row[5], row.passenger_count, row.trip_distance 
# data structure similar to NamedTuple

(pandas.core.frame.Pandas, 1, 1, 9.69)

In [50]:
res = []
for row in df_sample.itertuples():
    res.append(str(row.passenger_count) + '_' 
                               + str(row.payment_type))
df_sample['useless_string_4'] = res

In [51]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2,useless_string_3,useless_string_4
355692,2,2018-01-15 13:13:33,2018-01-15 13:33:28,N,1,95,198,1,5.57,19.0,0.0,0.5,0.0,0.0,,0.3,19.8,2,1.0,00:19:55,16.779916,1_2,1_2,1_2,1_2
430185,2,2018-01-18 15:04:51,2018-01-18 15:54:32,N,1,55,39,1,9.69,35.0,0.0,0.5,0.0,0.0,,0.3,35.8,1,1.0,00:49:41,11.702113,1_1,1_1,1_1,1_1
491288,1,2018-01-20 15:58:29,2018-01-20 16:24:46,N,1,25,188,1,4.1,19.5,0.0,0.5,3.05,0.0,,0.3,23.35,1,1.0,00:26:17,9.359543,1_1,1_1,1_1,1_1
434187,2,2018-01-18 17:05:23,2018-01-18 17:09:37,N,1,75,236,2,0.43,5.0,1.0,0.5,1.36,0.0,,0.3,8.16,1,1.0,00:04:14,6.094488,2_1,2_1,2_1,2_1
73867,1,2018-01-04 01:26:15,2018-01-04 01:36:28,N,1,146,162,1,3.5,13.0,0.5,0.5,0.0,0.0,,0.3,14.3,2,1.0,00:10:13,20.554649,1_2,1_2,1_2,1_2


In [52]:
%%timeit
res = []
for row in df_sample.itertuples():
    res.append(str(row.passenger_count) + '_' 
                               + str(row.payment_type))
df_sample['useless_string_4'] = res

62.1 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Itertuples is about 4 times faster than iterrows, even though it provides practically the same functionality.
In this example, *itertuples* is even faster than *apply*, however this may not always be the case.

In [53]:
for row in df_mixed_type.itertuples():
    print(row.int, type(row.int))
    print(row.float, type(row.float))

1 <class 'int'>
2.5 <class 'float'>
3 <class 'int'>
3.14 <class 'float'>


In contrast to *iterrows()*, *itertuples()* preserves data types.

#### Looping with Explicit Row Access

In [54]:
res = []
passenger_count_loc = df_sample.columns.get_loc('passenger_count')
payment_type_loc = df_sample.columns.get_loc('payment_type')
for i in range(len(df_sample)):
    res.append(str(df_sample.iloc[i,passenger_count_loc]) + '_' 
                               + str(df_sample.iloc[i,payment_type_loc]))
df_sample['useless_string_5'] = res

In [55]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2,useless_string_3,useless_string_4,useless_string_5
355692,2,2018-01-15 13:13:33,2018-01-15 13:33:28,N,1,95,198,1,5.57,19.0,0.0,0.5,0.0,0.0,,0.3,19.8,2,1.0,00:19:55,16.779916,1_2,1_2,1_2,1_2,1_2
430185,2,2018-01-18 15:04:51,2018-01-18 15:54:32,N,1,55,39,1,9.69,35.0,0.0,0.5,0.0,0.0,,0.3,35.8,1,1.0,00:49:41,11.702113,1_1,1_1,1_1,1_1,1_1
491288,1,2018-01-20 15:58:29,2018-01-20 16:24:46,N,1,25,188,1,4.1,19.5,0.0,0.5,3.05,0.0,,0.3,23.35,1,1.0,00:26:17,9.359543,1_1,1_1,1_1,1_1,1_1
434187,2,2018-01-18 17:05:23,2018-01-18 17:09:37,N,1,75,236,2,0.43,5.0,1.0,0.5,1.36,0.0,,0.3,8.16,1,1.0,00:04:14,6.094488,2_1,2_1,2_1,2_1,2_1
73867,1,2018-01-04 01:26:15,2018-01-04 01:36:28,N,1,146,162,1,3.5,13.0,0.5,0.5,0.0,0.0,,0.3,14.3,2,1.0,00:10:13,20.554649,1_2,1_2,1_2,1_2,1_2


In [56]:
%%timeit
res = []
passenger_count_loc = df_sample.columns.get_loc('passenger_count')
payment_type_loc = df_sample.columns.get_loc('payment_type')
for i in range(len(df_sample)):
    res.append(str(df_sample.iloc[i,passenger_count_loc]) + '_' 
                               + str(df_sample.iloc[i,payment_type_loc]))
df_sample['useless_string_5'] = res

74.3 ms ± 105 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Slightly different approach using *.loc* instead of *.iloc* gives much shorter code:

In [73]:
res = []
for i in df_sample.index:
    res.append(str(df_sample.loc[i, 'passenger_count']) + '_' 
                               + str(df_sample.loc[i, 'payment_type']))
df_sample['useless_string_6'] = res

In [74]:
df_sample.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed,useless_string,useless_string2,useless_string_3,useless_string_4,useless_string_5,useless_string_6
355692,2,2018-01-15 13:13:33,2018-01-15 13:33:28,N,1,95,198,1,5.57,19.0,0.0,0.5,0.0,0.0,,0.3,19.8,2,1.0,00:19:55,16.779916,1_2,1_2,1_2,1_2,1_2,1_2
430185,2,2018-01-18 15:04:51,2018-01-18 15:54:32,N,1,55,39,1,9.69,35.0,0.0,0.5,0.0,0.0,,0.3,35.8,1,1.0,00:49:41,11.702113,1_1,1_1,1_1,1_1,1_1,1_1
491288,1,2018-01-20 15:58:29,2018-01-20 16:24:46,N,1,25,188,1,4.1,19.5,0.0,0.5,3.05,0.0,,0.3,23.35,1,1.0,00:26:17,9.359543,1_1,1_1,1_1,1_1,1_1,1_1
434187,2,2018-01-18 17:05:23,2018-01-18 17:09:37,N,1,75,236,2,0.43,5.0,1.0,0.5,1.36,0.0,,0.3,8.16,1,1.0,00:04:14,6.094488,2_1,2_1,2_1,2_1,2_1,2_1
73867,1,2018-01-04 01:26:15,2018-01-04 01:36:28,N,1,146,162,1,3.5,13.0,0.5,0.5,0.0,0.0,,0.3,14.3,2,1.0,00:10:13,20.554649,1_2,1_2,1_2,1_2,1_2,1_2


In [75]:
%%timeit
res = []
for i in df_sample.index:
    res.append(str(df_sample.loc[i, 'passenger_count']) + '_' 
                               + str(df_sample.loc[i, 'payment_type']))
df_sample['useless_string_6'] = res

69.3 ms ± 49.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Looping with explicit access of DataFrame positions takes only slightly longer as *itertuples()*. The code however may be a bit more complicated.

### Summary DataFrame Calculations

1. Use vectorized calculations on Numpy arrays wherever possible. This is by far the fastest way. Note that in addition to simple calculations, practically all Numpy functions work on Numpy arrays and therefore also on Pandas Series / DataFrames.
2. If it is not possible to vectorize a calculation, use either the *apply* syntax or iterate over the DataFrame using *itertuples()* or *.loc*. If required, parallelization could be used for speed-up, see [Multithreading and Multiprocessing](parallel_computing.ipynb).
3. Never use *iterrows()*. It is much slower than *itertuples()* and does not preserve data types.

## Indices and Multi-Indices

In [64]:
df_taxi.index

RangeIndex(start=0, stop=793529, step=1)

If no index is specified explicitly, Pandas creates a *RangeIndex* containing integer numbers from 0 to n-1.

In [73]:
df_taxi_new_index = df_taxi.set_index('lpep_pickup_datetime')
df_taxi_new_index.index

DatetimeIndex(['2018-01-01 00:18:50', '2018-01-01 00:30:26',
               '2018-01-01 00:07:25', '2018-01-01 00:32:40',
               '2018-01-01 00:32:40', '2018-01-01 00:38:35',
               '2018-01-01 00:18:41', '2018-01-01 00:38:02',
               '2018-01-01 00:05:02', '2018-01-01 00:35:23',
               ...
               '2018-01-31 23:09:04', '2018-01-31 23:36:22',
               '2018-01-31 23:38:46', '2018-01-31 23:10:17',
               '2018-01-31 23:34:07', '2018-01-31 23:42:10',
               '2018-02-01 00:00:48', '2018-01-31 23:19:43',
               '2018-01-31 23:57:59', '2018-01-31 23:30:45'],
              dtype='datetime64[ns]', name='lpep_pickup_datetime', length=793529, freq=None)

In [74]:
df_taxi_new_index.head()

Unnamed: 0_level_0,VendorID,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
lpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-01 00:18:50,2,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
2018-01-01 00:30:26,2,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2018-01-01 00:07:25,2,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
2018-01-01 00:32:40,2,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
2018-01-01 00:32:40,2,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


A column of the DataFrame can be set as the new index.

Note that the index does not need to be unique.

In [75]:
df_taxi_new_index.reset_index().head()

Unnamed: 0,lpep_pickup_datetime,VendorID,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2018-01-01 00:18:50,2,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2018-01-01 00:30:26,2,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2018-01-01 00:07:25,2,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2018-01-01 00:32:40,2,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2018-01-01 00:32:40,2,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


Reset index: the index is converted to regular column(s) and a new *RangeIndex* is set.

In [79]:
df_taxi_new_index2 = df_taxi.set_index(['VendorID', 'lpep_pickup_datetime'])
df_taxi_new_index2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
VendorID,lpep_pickup_datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


## Grouping and Aggregation

In [7]:
taxi_grouped = df_taxi.groupby(['passenger_count', 'payment_type'])
taxi_grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7efc2ebf25c0>

In [80]:
taxi_grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,VendorID,RatecodeID,PULocationID,DOLocationID,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,trip_type,amt_per_mile
passenger_count,payment_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1,1.845953,1.031332,121.545692,123.91906,3.455065,13.724543,0.496084,0.496084,2.494569,0.060157,,0.29765,17.712037,1.007833,inf
1,2,1.854015,1.038929,118.875912,129.182482,2.449465,10.36618,0.490268,0.490268,0.0,0.014015,,0.294161,11.654891,1.009732,inf
1,3,1.666667,2.333333,147.0,136.333333,0.49,-2.166667,-0.333333,-0.333333,0.0,0.0,,-0.2,-3.033333,1.333333,-52.48366
1,4,2.0,1.0,149.0,149.0,0.045,-2.5,-0.5,-0.5,0.0,0.0,,-0.3,-3.8,1.0,-85.5
2,1,1.77551,1.163265,129.265306,151.306122,3.484694,14.244898,0.479592,0.479592,2.193673,0.0,,0.287755,17.804898,1.040816,6.678656
2,2,1.738095,1.119048,116.357143,123.238095,2.41119,11.142857,0.47619,0.488095,0.0,0.0,,0.292857,12.4,1.02381,132.115467
3,1,1.8,1.0,86.2,140.466667,3.354,13.333333,0.5,0.5,2.814667,0.0,,0.3,17.448,1.0,6.321971
3,2,1.636364,1.0,147.454545,78.181818,1.980909,9.090909,0.5,0.5,0.0,0.0,,0.3,10.390909,1.0,6.176261
4,1,1.5,1.0,211.5,131.0,3.015,12.5,0.5,0.5,1.025,0.0,,0.3,14.825,1.0,5.811971
4,2,1.625,1.5,133.75,132.125,2.3025,7.3125,0.4375,0.4375,0.0,0.0,,0.2625,8.45,1.125,5.262157


Simultaneous aggregation over all columns

In [88]:
taxi_grouped.agg({'trip_distance': min, 'fare_amount': max, 'total_amount': np.median, 
                  'VendorID': 'nunique', 'trip_type': 'unique'})

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,total_amount,VendorID,trip_type
passenger_count,payment_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0.0,53.0,14.16,2,"[1.0, 2.0]"
1,2,0.0,61.5,9.8,2,"[1.0, 2.0]"
1,3,0.03,0.0,-4.3,2,"[1.0, 2.0]"
1,4,0.04,-2.5,-3.8,1,[1.0]
2,1,0.5,55.0,16.0,2,"[1.0, 2.0]"
2,2,0.01,52.0,9.3,2,"[1.0, 2.0]"
3,1,0.84,33.5,14.76,2,[1.0]
3,2,0.7,19.5,9.8,2,[1.0]
4,1,1.6,16.0,14.825,2,[1.0]
4,2,0.43,12.5,9.05,2,"[2.0, 1.0]"


Column specific aggregation functions.

In [12]:
agg = taxi_grouped.agg({'trip_distance': [min, max], 'fare_amount': max})
agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,trip_distance,fare_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,max
passenger_count,payment_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1,0.0,12.61,50.0
0,2,0.0,15.77,52.0
0,3,0.0,2.0,9.35
0,4,0.0,0.4,3.5
1,1,0.0,113.72,376.0


More than one aggregation function may be defined per column, in this case a MultiIndex is created.

In [11]:
agg.columns = list(map('_'.join, agg.columns.values))
agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance_min,trip_distance_max,fare_amount_max
passenger_count,payment_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0.0,12.61,50.0
0,2,0.0,15.77,52.0
0,3,0.0,2.0,9.35
0,4,0.0,0.4,3.5
1,1,0.0,113.72,376.0


Convert column MultiIndex to single Index level

In [8]:
for i, (idx, df) in enumerate(taxi_grouped):
    print(idx)
    print(df.head())
    if i>1:
        break

(0, 1)
        VendorID lpep_pickup_datetime lpep_dropoff_datetime  \
3111           2  2018-01-01 01:58:48   2018-01-01 01:58:50   
45708          1  2018-01-02 22:31:12   2018-01-02 22:43:18   
83450          1  2018-01-05 09:10:21   2018-01-05 09:13:08   
110577         1  2018-01-06 06:18:54   2018-01-06 06:26:23   
110773         1  2018-01-06 07:02:36   2018-01-06 07:13:33   

       store_and_fwd_flag  RatecodeID  PULocationID  DOLocationID  \
3111                    N           5            42            42   
45708                   N           1            74           262   
83450                   N           1            75            74   
110577                  N           1            74            75   
110773                  N           1            74            24   

        passenger_count  trip_distance  fare_amount  extra  mta_tax  \
3111                  0            0.0          8.0    0.0      0.0   
45708                 0            2.7         11.5    0.

Iterating over the grouped object yields the index and content of each group.

## Pivot Tables

In [21]:
df_taxi.pivot_table(index=['VendorID', 'passenger_count'], # y-axis
                    columns='payment_type', # x-axis
                    values=['total_amount', 'trip_distance'], # values to be aggregated 
                    aggfunc=np.sum, # aggregation function
                    margins=True, # displays totals (All) 
                    fill_value=0 # defines what is shown if there is no value for a cell 
                    # (defalt NaN)
                   )

Unnamed: 0_level_0,Unnamed: 1_level_0,total_amount,total_amount,total_amount,total_amount,total_amount,total_amount,trip_distance,trip_distance,trip_distance,trip_distance,trip_distance,trip_distance
Unnamed: 0_level_1,payment_type,1,2,3,4,5,All,1,2,3,4,5,All
VendorID,passenger_count,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,0.0,759.81,468.5,64.76,4.8,0.0,1297.87,119.7,79.6,10.2,0.4,0.0,209.9
1,1.0,962858.2,620027.0,23417.12,11063.44,481.1,1617847.0,172248.6,123092.2,4605.6,2219.6,123.8,302289.8
1,2.0,97137.82,73293.03,1129.7,848.7,73.8,172483.0,17614.5,14829.6,228.5,244.7,31.8,32949.1
1,3.0,20842.52,20313.97,348.06,211.0,11.3,41726.85,3808.4,4066.7,79.5,78.9,2.3,8035.8
1,4.0,5851.02,6259.96,116.5,10.3,0.0,12237.78,1008.4,1368.8,26.7,1.4,0.0,2405.3
1,5.0,504.59,316.06,0.0,0.0,0.0,820.65,97.6,62.7,0.0,0.0,0.0,160.3
1,6.0,409.12,84.4,0.0,0.0,0.0,493.52,87.9,17.2,0.0,0.0,0.0,105.1
2,0.0,449.02,512.88,0.0,-27.0,0.0,934.9,17.5,85.62,0.0,0.0,0.0,103.12
2,1.0,4881119.0,2886567.0,-7545.07,-3149.81,0.0,7756991.0,917335.6,563914.53,379.56,142.64,0.0,1481772.33
2,2.0,413520.9,276238.8,-1153.59,-203.4,0.0,688402.8,77741.5,54602.89,49.47,8.12,0.0,132401.98


Pivot table, analogue to Excel.

In [44]:
try:
    df_taxi.pivot(index='VendorID', # y-axis
              columns='payment_type', # x-axis
              values='total_amount')
except ValueError as e:
    print(e)

Index contains duplicate entries, cannot reshape


The *pivot* method has (in contrast to *pivot_table*) no aggregation function defined, thus it raises a *ValueError* if an index-column combination is not unique.

In [48]:
df_taxi_no_dups = df_taxi[df_taxi.index.isin(
    df_taxi[['VendorID', 'payment_type']].drop_duplicates().index)]
df_taxi_no_dups

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
7,2,2018-01-01 00:38:02,2018-01-01 00:55:02,N,1,189,225,5,3.45,14.5,0.5,0.5,3.16,0.0,,0.3,18.96,1,1.0
15,1,2018-01-01 00:07:40,2018-01-01 00:15:20,N,1,225,37,1,1.9,8.0,0.5,0.5,3.0,0.0,,0.3,12.3,1,1.0
49,1,2018-01-01 00:06:43,2018-01-01 00:20:45,N,5,136,169,1,4.2,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,2,2.0
470,1,2018-01-01 00:48:34,2018-01-01 00:51:25,N,5,74,42,1,1.1,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,3,2.0
777,2,2018-01-01 00:56:21,2018-01-01 00:56:28,N,1,130,130,1,0.04,-2.5,-0.5,-0.5,0.0,0.0,,-0.3,-3.8,4,1.0
1177,1,2018-01-01 00:34:12,2018-01-01 01:07:06,N,1,66,35,1,6.5,25.5,0.5,0.5,0.0,0.0,,0.3,26.8,4,1.0
15341,1,2018-01-01 14:22:16,2018-01-01 15:03:44,N,3,33,1,2,14.7,63.0,0.0,0.0,0.0,10.5,,0.3,73.8,5,1.0


In [49]:
df_taxi_no_dups.pivot(index='VendorID', # y-axis
                      columns='payment_type', # x-axis
                      values='total_amount')

payment_type,1,2,3,4,5
VendorID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,12.3,0.0,0.0,26.8,73.8
2,18.96,7.3,-4.3,-3.8,


The usage of *pivot* is rather situational, usually *pivot_table* will be used.

## Joining Tables

In [30]:
payment_fees = {1: 0.05, 2: 0.01, 3: 0.1, 4: 0.008, 5: 0.015}
payment_type_df = pd.DataFrame([{'payment_type': key, 'description': value} for 
                                key, value in payments.items()])\
                                .set_index('payment_type')
payment_type_df['fees'] = payment_type_df.index.map(payment_fees)
payment_type_df

Unnamed: 0_level_0,description,fees
payment_type,Unnamed: 1_level_1,Unnamed: 2_level_1
1,cash,0.05
2,credit card,0.01
3,debit card,0.1
4,gold,0.008
5,bill,0.015


Construction of example table for joining the taxi DataFrame

In [36]:
df_taxi.merge(payment_type_df, # 2nd DataFrame
              left_on='payment_type', # Column name for 1st df
              right_index=True, # join 2nd df on index (alternatively on column using right_on 
              how='left', # type: inner, left, right, outer
             ).head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,description,fees
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,credit card,0.01
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,credit card,0.01
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,credit card,0.01
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,debit card,0.1
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,credit card,0.01


## Stacking and Unstacking

In [55]:
df_stacked = df_sample.stack()
df_stacked.head(20)

267782  VendorID                                   2
        lpep_pickup_datetime     2018-01-12 10:09:34
        lpep_dropoff_datetime    2018-01-12 10:18:54
        store_and_fwd_flag                         N
        RatecodeID                                 1
        PULocationID                             217
        DOLocationID                             217
        passenger_count                            1
        trip_distance                              0
        fare_amount                                7
        extra                                      0
        mta_tax                                  0.5
        tip_amount                                 0
        tolls_amount                               0
        improvement_surcharge                    0.3
        total_amount                             7.8
        payment_type                               1
        trip_type                                  1
292910  VendorID                              

*stack* creates a Pandas Series with a MultiIndex, whose 1st level is the index of the DataFrame (or MultiIndex, if the DataFrame had such) and its 2nd level is the column name.
The Series values are the column contents. 

In [57]:
df_stacked.unstack().head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type
267782,2,2018-01-12 10:09:34,2018-01-12 10:18:54,N,1,217,217,1,0.0,7.0,0.0,0.5,0.0,0,0.3,7.8,1,1
292910,2,2018-01-13 00:31:34,2018-01-13 00:40:06,N,1,74,42,1,1.67,8.0,0.5,0.5,0.0,0,0.3,9.3,2,1
256288,2,2018-01-11 20:15:59,2018-01-11 20:24:09,N,1,169,20,1,1.62,8.0,0.5,0.5,2.32,0,0.3,11.62,1,1
342370,1,2018-01-14 19:04:25,2018-01-14 19:23:59,N,1,226,142,1,3.8,16.5,0.0,0.5,3.45,0,0.3,20.75,1,1
20342,2,2018-01-01 19:17:11,2018-01-01 19:22:25,N,1,56,173,1,0.92,5.5,0.0,0.5,0.0,0,0.3,6.3,1,1


*unstack* is the reverse operation: it takes the 1st level of the MultiIndex as row index and the 2nd level as column index.

In [62]:
pd.DataFrame(df_stacked).reset_index().pivot(index='level_0', 
                                             columns='level_1', values=0).head()

level_1,DOLocationID,PULocationID,RatecodeID,VendorID,extra,fare_amount,improvement_surcharge,lpep_dropoff_datetime,lpep_pickup_datetime,mta_tax,passenger_count,payment_type,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
14,75,244,1,2,0.5,19.0,0.3,2018-01-01 00:51:08,2018-01-01 00:36:58,0.5,2,1,N,4.0,0,24.3,6.01,1
1222,129,82,1,2,0.5,7.0,0.3,2018-01-01 01:07:02,2018-01-01 01:00:13,0.5,1,2,N,0.0,0,8.3,1.44,1
2659,136,69,1,2,0.5,13.5,0.3,2018-01-01 01:41:34,2018-01-01 01:23:48,0.5,1,2,N,0.0,0,14.8,2.92,1
4510,244,116,1,2,0.5,10.0,0.3,2018-01-01 01:26:56,2018-01-01 01:16:29,0.5,1,1,N,2.26,0,13.56,2.28,1
7531,258,129,1,1,0.5,26.0,0.3,2018-01-01 03:19:48,2018-01-01 03:00:13,0.5,1,2,N,0.0,0,27.3,8.5,1


*unstack* is equivalent to *pivot*, but provides a simpler syntax for this particular use case.

## Reading from and Writing to Databases

see [Relational Databases](relational_databases.ipynb#pandas).

## Gotchas

## Assignments vs. In-Place Modifications

In [91]:
df = df_sample.copy()
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
267782,2,2018-01-12 10:09:34,2018-01-12 10:18:54,N,1,217,217,1,0.0,7.0,0.0,0.5,0.0,0.0,,0.3,7.8,1,1.0
292910,2,2018-01-13 00:31:34,2018-01-13 00:40:06,N,1,74,42,1,1.67,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2,1.0
256288,2,2018-01-11 20:15:59,2018-01-11 20:24:09,N,1,169,20,1,1.62,8.0,0.5,0.5,2.32,0.0,,0.3,11.62,1,1.0
342370,1,2018-01-14 19:04:25,2018-01-14 19:23:59,N,1,226,142,1,3.8,16.5,0.0,0.5,3.45,0.0,,0.3,20.75,1,1.0
20342,2,2018-01-01 19:17:11,2018-01-01 19:22:25,N,1,56,173,1,0.92,5.5,0.0,0.5,0.0,0.0,,0.3,6.3,1,1.0


In [92]:
df.set_index('lpep_pickup_datetime') # returns modified df, but does not change df itself.
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
267782,2,2018-01-12 10:09:34,2018-01-12 10:18:54,N,1,217,217,1,0.0,7.0,0.0,0.5,0.0,0.0,,0.3,7.8,1,1.0
292910,2,2018-01-13 00:31:34,2018-01-13 00:40:06,N,1,74,42,1,1.67,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2,1.0
256288,2,2018-01-11 20:15:59,2018-01-11 20:24:09,N,1,169,20,1,1.62,8.0,0.5,0.5,2.32,0.0,,0.3,11.62,1,1.0
342370,1,2018-01-14 19:04:25,2018-01-14 19:23:59,N,1,226,142,1,3.8,16.5,0.0,0.5,3.45,0.0,,0.3,20.75,1,1.0
20342,2,2018-01-01 19:17:11,2018-01-01 19:22:25,N,1,56,173,1,0.92,5.5,0.0,0.5,0.0,0.0,,0.3,6.3,1,1.0


Note that the index in df has not been modified.
The behavior of most operations in Pandas is analogue.

In [94]:
df = df_sample.copy()
df.set_index('lpep_pickup_datetime', inplace=True)
df.head()

Unnamed: 0_level_0,VendorID,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
lpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-12 10:09:34,2,2018-01-12 10:18:54,N,1,217,217,1,0.0,7.0,0.0,0.5,0.0,0.0,,0.3,7.8,1,1.0
2018-01-13 00:31:34,2,2018-01-13 00:40:06,N,1,74,42,1,1.67,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2,1.0
2018-01-11 20:15:59,2,2018-01-11 20:24:09,N,1,169,20,1,1.62,8.0,0.5,0.5,2.32,0.0,,0.3,11.62,1,1.0
2018-01-14 19:04:25,1,2018-01-14 19:23:59,N,1,226,142,1,3.8,16.5,0.0,0.5,3.45,0.0,,0.3,20.75,1,1.0
2018-01-01 19:17:11,2,2018-01-01 19:22:25,N,1,56,173,1,0.92,5.5,0.0,0.5,0.0,0.0,,0.3,6.3,1,1.0


In [111]:
%%timeit 
df = df_sample.copy()
df.set_index('lpep_pickup_datetime', inplace=True)

2.19 ms ± 42.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Option 1: Set the *inplace* keyword argument to *True* for modification of the original Series/ DataFrame.

In [98]:
df = df_sample.copy()
df = df.set_index('lpep_pickup_datetime')
df.head()

Unnamed: 0_level_0,VendorID,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
lpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-12 10:09:34,2,2018-01-12 10:18:54,N,1,217,217,1,0.0,7.0,0.0,0.5,0.0,0.0,,0.3,7.8,1,1.0
2018-01-13 00:31:34,2,2018-01-13 00:40:06,N,1,74,42,1,1.67,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2,1.0
2018-01-11 20:15:59,2,2018-01-11 20:24:09,N,1,169,20,1,1.62,8.0,0.5,0.5,2.32,0.0,,0.3,11.62,1,1.0
2018-01-14 19:04:25,1,2018-01-14 19:23:59,N,1,226,142,1,3.8,16.5,0.0,0.5,3.45,0.0,,0.3,20.75,1,1.0
2018-01-01 19:17:11,2,2018-01-01 19:22:25,N,1,56,173,1,0.92,5.5,0.0,0.5,0.0,0.0,,0.3,6.3,1,1.0


In [112]:
%%timeit
df = df_sample.copy()
df = df.set_index('lpep_pickup_datetime')

2.64 ms ± 7.45 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Option 2: Explicitly assign the return value to the original DataFrame variable.

Both options are equally valid, personally I prefer option 2 because of the more "functional-style" syntax.

### Setting With Copy Error¶

When first slicing a dataframe and afterwards doing some modification, it is not under control if the original dataframe is changed or a copy of it. Therefore, Pandas raises a SettingWithCopy warning.

https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-view-versus-copy

In [57]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
df2 = df[df.a == 0]
df2['c'] = df2.a + df2.b

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


If required (e.g. for easier identification of the line throwing the warning), this can be changed into an exception with the following setting

It is recommended to use this setting for production code (put this line directly after Pandas import in each module).

In [58]:
pd.options.mode.chained_assignment = 'raise'
#raises pandas.core.common.SettingWithCopyError

In [59]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
try:
    df2 = df[df.a ==0 ]
    df2['c'] = df2.a + df2.b
except pd.core.common.SettingWithCopyError as e:
    print(e)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


To get rid of this error, use one of the following methods:

In [60]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
df['c'] = df.a + df.b
df2 = df[df.a == 0]
df2.head()

Unnamed: 0,a,b,c
0,0,1,1


Modification before slicing - may be slow for large data sets. Use this if the calculated column is required outside of the selected rows, too.

In [61]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
df2 = df[df.a == 0].copy()
df2['c'] = df2.a + df2.b
df2.head()

Unnamed: 0,a,b,c
0,0,1,1


Copy DataFrame - potentially large memory consumption, but it is ensured that the original DataFrame is not changed.

In [62]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
df.loc[df.a == 0, 'c'] = df.a + df.b
df.head()

Unnamed: 0,a,b,c
0,0,1,1.0
1,1,2,


Modify explicitly only the selected rows of the original dataframe. The new column for rows which are not selected is filled with NaN.

### Pandas vs. List of Dictionaries

In [63]:
df_taxi = pd.read_csv(csvfile, parse_dates=[1, 2]).head(1000)

#### Pandas

In [64]:
def process_pd(df):
    df['amt_per_mile'] = df['total_amount'] / df['trip_distance'] # zero divisions give NaN
    return df

In [65]:
df_taxi2 = process_pd(df_taxi)
df_taxi2.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,amt_per_mile
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,10.428571
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,4.514286
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,5.280374
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,-143.333333
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,143.333333


In [66]:
%timeit process_pd(df_taxi)

883 µs ± 552 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### DictList

In [67]:
dict_list = df_taxi.to_dict('records')  # create list of dictionaries from Pandas DataFrame

In [68]:
dict_list[:2]

[{'VendorID': 2,
  'lpep_pickup_datetime': Timestamp('2018-01-01 00:18:50'),
  'lpep_dropoff_datetime': Timestamp('2018-01-01 00:24:39'),
  'store_and_fwd_flag': 'N',
  'RatecodeID': 1,
  'PULocationID': 236,
  'DOLocationID': 236,
  'passenger_count': 5,
  'trip_distance': 0.7,
  'fare_amount': 6.0,
  'extra': 0.5,
  'mta_tax': 0.5,
  'tip_amount': 0.0,
  'tolls_amount': 0.0,
  'ehail_fee': nan,
  'improvement_surcharge': 0.3,
  'total_amount': 7.3,
  'payment_type': 2,
  'trip_type': 1.0,
  'amt_per_mile': 10.428571428571429},
 {'VendorID': 2,
  'lpep_pickup_datetime': Timestamp('2018-01-01 00:30:26'),
  'lpep_dropoff_datetime': Timestamp('2018-01-01 00:46:42'),
  'store_and_fwd_flag': 'N',
  'RatecodeID': 1,
  'PULocationID': 43,
  'DOLocationID': 42,
  'passenger_count': 5,
  'trip_distance': 3.5,
  'fare_amount': 14.5,
  'extra': 0.5,
  'mta_tax': 0.5,
  'tip_amount': 0.0,
  'tolls_amount': 0.0,
  'ehail_fee': nan,
  'improvement_surcharge': 0.3,
  'total_amount': 15.8,
  'payment

In [69]:
def process_dl(dl):
    dl_out = []
    for line in dl:
        try:
            line['amt_per_mile'] = line['total_amount'] / line['trip_distance']
        except ZeroDivisionError:
            line['amt_per_mile'] = float('nan')
        dl_out.append(line)
    return dl_out

In [70]:
dict_list = process_dl(dict_list)

In [71]:
%timeit process_dl(dict_list)

558 µs ± 1.58 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Processing in Pandas is at least a factor 50 faster than for a list of dictionaries. Furthermore, code is simpler and memory requirements smaller.

In [72]:
del dict_list # free up memory