# Pandas

In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None # display all columns in notebook

## Download Sample Data

In [2]:
import os
import urllib.request

In [3]:
# NOTE(review): confirm this URL is still served before relying on the download.
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2018-01.csv' # 70 MB
csvfile = './test_data.csv'
# Check the very path we download to, so changing `csvfile` cannot get out of sync
# with the existence test.
if not os.path.exists(csvfile):
    print('downloading sample data')
    # Download under a temporary name first: an interrupted transfer must not leave
    # a truncated file at `csvfile`, which the existence check would accept next time.
    partial_file = csvfile + '.part'
    urllib.request.urlretrieve(url, partial_file)
    os.replace(partial_file, csvfile)

## Series

In [4]:
# Series of 1000 random numbers; no seed is set, so the values differ on every run
s = pd.Series(np.random.randn(1000))
s.head() # shows the first n (default: 5) elements of the Series

0   -1.447737
1    1.292847
2   -1.432174
3   -0.954645
4   -0.785512
dtype: float64

In [5]:
type(s.index), type(s.values)

(pandas.core.indexes.range.RangeIndex, numpy.ndarray)

A Pandas Series is actually a numpy array with an index.

## DataFrames

In [6]:
# 10000 rows x 5 named columns of random numbers; no seed is set, so values differ per run
df_rdn = pd.DataFrame(np.random.randn(10000, 5), columns=['a', 'b', 'c', 'd', 'e'])
df_rdn.head() # shows the first n (default: 5) rows of a dataframe

Unnamed: 0,a,b,c,d,e
0,-0.469624,-1.021124,-1.992691,-0.185474,-0.167933
1,-0.821151,0.107866,-1.400592,0.112157,-1.029502
2,0.489748,-1.280041,-0.469183,-1.536354,0.506659
3,0.8852,2.825593,0.788247,2.486991,-0.726929
4,-0.712666,-2.120333,-0.045888,-1.34862,0.946487


Construct a Pandas DataFrame from numpy random numbers

In [7]:
type(df_rdn.index), type(df_rdn.a), type(df_rdn.a.values), type(df_rdn['b'])

(pandas.core.indexes.range.RangeIndex,
 pandas.core.series.Series,
 numpy.ndarray,
 pandas.core.series.Series)

A DataFrame is an index combined with multiple columns. 

Each column can be extracted as a Pandas Series (the column values together with the index).

Access to columns (as Series) is either possible with the syntax df['col'] or df.col (the latter only if the column is a valid Python name, i.e. it does not contain invalid characters like spaces, +, -, etc.).

In [8]:
df_taxi = pd.read_csv(csvfile)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [31]:
len(df_taxi)

793529

Construct DataFrame from csv file

### Column Types

In [9]:
df_taxi.dtypes # show data types of columns

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                float64
dtype: object

Numerical data types (integers, floats) are usually detected automatically during csv import.
Date (-time) columns are not identified automatically but imported as *object* (analogous to strings).

In [10]:
# Convert the two timestamp columns from strings (object dtype) to datetime64.
# Bracket access is used on both sides of the assignment.
df_taxi['lpep_pickup_datetime'] = pd.to_datetime(df_taxi['lpep_pickup_datetime'])
df_taxi['lpep_dropoff_datetime'] = pd.to_datetime(df_taxi['lpep_dropoff_datetime'])

In [11]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Ex-post conversion to Pandas Datetime using *pd.to_datetime()*. An explicit datetime format can be passed if Pandas cannot identify it automatically.

In [12]:
# parse_dates takes 0-based column positions:
# 1 = lpep_pickup_datetime, 2 = lpep_dropoff_datetime (column 0 is VendorID)
df_taxi = pd.read_csv(csvfile, parse_dates=[1, 2])
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [13]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Datetime conversion during csv import by specifying datetime columns.

### Filtering

In [14]:
df_taxi_clean = df_taxi[df_taxi.total_amount > 0]
print(f'{len(df_taxi) - len(df_taxi_clean)} lines with non-positive payments removed')

3646 lines with non-positive payments removed


Meaning of this syntax: take all elements of df where the filter condition inside the brackets is True. The filter condition is actually a Pandas Series:

In [15]:
filter_ = df_taxi.total_amount > 0
filter_.head()

0     True
1     True
2     True
3    False
4     True
Name: total_amount, dtype: bool

In [38]:
df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)]
print(f'{len(df_taxi) - len(df_taxi_clean)} '
      f'lines with non-positive payments and passengers removed')

3817 lines with non-positive payments and passengers removed


Multiple conditions can be combined for filtering using the operators & (and), | (or), ~ (not). Each comparison must be enclosed in parentheses, because these operators bind more tightly than comparison operators such as > or ==.

The syntax above is the most common one.

In [41]:
%timeit df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)] 
# filtering of 800k lines

169 ms ± 7.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [39]:
df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0),:]

In [40]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean2)
# raises AssertionError if DataFrames are different

In [42]:
%timeit df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) 
                                     & (df_taxi.passenger_count > 0),:]

170 ms ± 8.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


The alternative syntax df.loc[row, col] gives identical results in the same time.

In [44]:
df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

In [46]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean3)

In [47]:
%timeit df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

165 ms ± 344 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Yet another (rather exotic) filtering syntax, where the condition is given as a string containing a Python-like expression.

In [48]:
del df_taxi_clean2, df_taxi_clean3

### Calculations on Columns

In [17]:
# Element-wise arithmetic on whole columns; np.exp is applied to every value of column d
df_rdn['f'] = df_rdn['a'] + df_rdn.b * df_rdn.c * np.exp(df_rdn.d) / df_rdn.e
# type conversion: float -> int64 drops the fractional part (e.g. -10.535057 -> -10)
df_rdn['g'] = df_rdn.f.astype(np.int64) # type conversion
df_rdn.head()

Unnamed: 0,a,b,c,d,e,f,g
0,-0.469624,-1.021124,-1.992691,-0.185474,-0.167933,-10.535057,-10
1,-0.821151,0.107866,-1.400592,0.112157,-1.029502,-0.656986,0
2,0.489748,-1.280041,-0.469183,-1.536354,0.506659,0.744795,0
3,0.8852,2.825593,0.788247,2.486991,-0.726929,-35.958807,-35
4,-0.712666,-2.120333,-0.045888,-1.34862,0.946487,-0.68598,0


Definition of a new column using vectorized calculations. Numpy functions can be used here (as described above, Pandas columns are actually numpy arrays).

The index makes sure that for operations with Series the rows are matched correctly. 

In [18]:
# Trip duration as a Timedelta column (dropoff minus pickup).
df_taxi['drive_time'] = df_taxi.lpep_dropoff_datetime - df_taxi.lpep_pickup_datetime
# Use total_seconds() for the full duration. `.dt.seconds` is only the seconds
# *component* of the Timedelta (0..86399): it drops whole days, so trips lasting a
# day or more and negative durations would produce wrong speeds.
# Result is distance per hour; a zero duration yields inf (or NaN for 0 / 0).
df_taxi['avg_speed'] = df_taxi.trip_distance / (df_taxi.drive_time.dt.total_seconds() / 3600)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,00:05:49,7.22063
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,00:16:16,12.909836
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,00:12:20,10.410811
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,00:01:01,1.770492
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,00:01:01,1.770492


Operations on Pandas Datetime and Timedelta objects.

## Indices

## Performance

### Pandas vs. List of Dictionaries

In [20]:
# Re-read the file so the benchmark starts from the raw columns again; this replaces
# the frame that had drive_time and avg_speed added. Positions 1 and 2 are the
# pickup/dropoff timestamp columns.
df_taxi = pd.read_csv(csvfile, parse_dates=[1, 2])

#### Pandas

In [21]:
def process_pd(df):
    """Add an ``amt_per_mile`` column (total_amount / trip_distance) to ``df``.

    The column is written into ``df`` itself (the input is modified in place)
    and the same object is returned.

    Rows with a trip distance of zero get NaN. Plain float division would give
    +/-inf for a non-zero amount, which contradicts the intended "zero
    divisions give NaN" behaviour and the list-of-dictionaries variant in this
    notebook, which stores NaN for zero distances.
    """
    distance = df['trip_distance']
    # where() keeps non-zero distances and replaces zeros with NaN, so the
    # division yields NaN for those rows instead of inf.
    df['amt_per_mile'] = df['total_amount'] / distance.where(distance != 0)
    return df

In [22]:
df_taxi2 = process_pd(df_taxi)
df_taxi2.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,amt_per_mile
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,10.428571
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,4.514286
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,5.280374
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,-143.333333
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,143.333333


In [23]:
%timeit process_pd(df_taxi)

8.51 ms ± 61.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### DictList

In [24]:
dict_list = df_taxi.to_dict('records')  # create list of dictionaries from Pandas DataFrame

In [25]:
dict_list[:2]

[{'VendorID': 2,
  'lpep_pickup_datetime': Timestamp('2018-01-01 00:18:50'),
  'lpep_dropoff_datetime': Timestamp('2018-01-01 00:24:39'),
  'store_and_fwd_flag': 'N',
  'RatecodeID': 1,
  'PULocationID': 236,
  'DOLocationID': 236,
  'passenger_count': 5,
  'trip_distance': 0.7,
  'fare_amount': 6.0,
  'extra': 0.5,
  'mta_tax': 0.5,
  'tip_amount': 0.0,
  'tolls_amount': 0.0,
  'ehail_fee': nan,
  'improvement_surcharge': 0.3,
  'total_amount': 7.3,
  'payment_type': 2,
  'trip_type': 1.0,
  'amt_per_mile': 10.428571428571429},
 {'VendorID': 2,
  'lpep_pickup_datetime': Timestamp('2018-01-01 00:30:26'),
  'lpep_dropoff_datetime': Timestamp('2018-01-01 00:46:42'),
  'store_and_fwd_flag': 'N',
  'RatecodeID': 1,
  'PULocationID': 43,
  'DOLocationID': 42,
  'passenger_count': 5,
  'trip_distance': 3.5,
  'fare_amount': 14.5,
  'extra': 0.5,
  'mta_tax': 0.5,
  'tip_amount': 0.0,
  'tolls_amount': 0.0,
  'ehail_fee': nan,
  'improvement_surcharge': 0.3,
  'total_amount': 15.8,
  'payment

In [26]:
def process_dl(dl):
    """Add an 'amt_per_mile' entry (total_amount / trip_distance) to each record.

    Every dictionary in ``dl`` is updated in place; a new list holding those
    same dictionaries is returned. If the division raises ZeroDivisionError,
    NaN is stored instead.
    """
    processed = []
    for record in dl:
        try:
            ratio = record['total_amount'] / record['trip_distance']
        except ZeroDivisionError:
            ratio = float('nan')
        record['amt_per_mile'] = ratio
        processed.append(record)
    return processed

In [27]:
dict_list = process_dl(dict_list)

In [28]:
%timeit process_dl(dict_list)

425 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Processing in Pandas is roughly a factor of 50 faster than processing a list of dictionaries (8.51 ms vs. 425 ms in the timings above). Furthermore, the code is simpler and the memory requirements are smaller.

In [29]:
del dict_list # free up memory