# Pandas

In [2]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None # display all columns in notebook

## Download Sample Data

In [2]:
import os
import urllib.request

In [3]:
# Remote location of the sample data (file name: green_tripdata_2018-01.csv)
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2018-01.csv' # 70 MB
csvfile = './example_files/test_data.csv'
# Download only once; later runs reuse the local copy
if not os.path.exists(csvfile):
    print('downloading sample data')
    # urlretrieve does not create missing folders, so make sure the target directory exists
    os.makedirs(os.path.dirname(csvfile), exist_ok=True)
    urllib.request.urlretrieve(url, csvfile)

## Series

In [4]:
# Build a Series from 1000 random numbers; no index is passed, so pandas creates one
s = pd.Series(np.random.randn(1000))
s.head() # shows the first n (default: 5) rows of the Series

0   -2.138211
1   -1.052011
2    0.012799
3   -0.170571
4   -0.852382
dtype: float64

In [5]:
type(s.index), type(s.values)

(pandas.core.indexes.range.RangeIndex, numpy.ndarray)

A Pandas Series is actually a numpy array with an index.

## DataFrames

In [6]:
# 10000 x 5 array of random numbers, one named column per array column
df_rdn = pd.DataFrame(np.random.randn(10000, 5), columns=['a', 'b', 'c', 'd', 'e'])
df_rdn.head() # shows the first n (default: 5) rows of a dataframe

Unnamed: 0,a,b,c,d,e
0,0.585391,-0.234348,-1.094889,0.795303,-1.20533
1,-0.050897,-0.021503,0.765588,-0.531363,1.355724
2,0.036443,0.125581,0.538434,0.02305,-1.069896
3,-0.334732,0.760048,-1.233545,-0.09607,0.266424
4,-1.176401,-1.178876,-0.411369,-1.674715,-0.623226


Construct a Pandas DataFrame from numpy random numbers

In [7]:
type(df_rdn.index), type(df_rdn.a), type(df_rdn.a.values), type(df_rdn['b'])

(pandas.core.indexes.range.RangeIndex,
 pandas.core.series.Series,
 numpy.ndarray,
 pandas.core.series.Series)

A DataFrame is an index combined with multiple columns. 

Each column can be extracted as a Pandas Series (the column values together with the index).

Access to columns (as Series) is either possible with the syntax df['col'] or df.col (the latter only if the column is a valid Python name, i.e. it does not contain invalid characters like spaces, +, -, etc.).

In [8]:
df_taxi = pd.read_csv(csvfile)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [9]:
len(df_taxi)

793529

Construct DataFrame from csv file

### Column Types

In [10]:
df_taxi.dtypes # show data types of columns

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                float64
dtype: object

Numerical data types (integers, floats) are usually detected automatically during csv import.
Date(-time) columns are not identified automatically but imported as *object* (analogous to strings).

In [11]:
# Replace the two string columns by their parsed datetime equivalents
df_taxi['lpep_pickup_datetime'] = pd.to_datetime(df_taxi['lpep_pickup_datetime'])
df_taxi['lpep_dropoff_datetime'] = pd.to_datetime(df_taxi['lpep_dropoff_datetime'])

In [12]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Ex-post conversion to Pandas Datetime using *pd.to_datetime()*. An explicit datetime format can be passed if Pandas cannot identify it automatically.

In [13]:
# Name the datetime columns explicitly instead of using positions [1, 2], so parsing
# does not silently hit the wrong columns if the file layout changes
df_taxi = pd.read_csv(csvfile, parse_dates=['lpep_pickup_datetime', 'lpep_dropoff_datetime'])
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0


In [14]:
df_taxi.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                        int64
PULocationID                      int64
DOLocationID                      int64
passenger_count                   int64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                      int64
trip_type                       float64
dtype: object

Datetime conversion during csv import by specifying datetime columns.

### Filtering

In [15]:
# Boolean-mask filtering: keep only the rows whose total_amount is positive
df_taxi_clean = df_taxi[df_taxi.total_amount > 0]
# Report how many rows the filter dropped
print(f'{len(df_taxi) - len(df_taxi_clean)} lines with non-positive payments removed')

3646 lines with non-positive payments removed


Meaning of this syntax: take all elements of df where the filter condition inside the brackets is True. The filter condition is actually a Pandas Series:

In [16]:
filter_ = df_taxi.total_amount > 0
filter_.head()

0     True
1     True
2     True
3    False
4     True
Name: total_amount, dtype: bool

In [17]:
# Two row conditions combined element-wise with &; each comparison sits in its own parentheses
df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)]
# Report how many rows were dropped by the combined filter
print(f'{len(df_taxi) - len(df_taxi_clean)} '
      f'lines with non-positive payments and passengers removed')

3817 lines with non-positive payments and passengers removed


Multiple conditions can be combined for filtering using the operators & (and), | (or), ~ (not). Each comparison must be enclosed in parentheses, because these operators bind more tightly than comparison operators such as >.

The syntax above is the most common one.

In [18]:
%timeit df_taxi_clean = df_taxi[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0)] 
# filtering of 800k lines

153 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0),:]

In [20]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean2)
# raises AssertionError if DataFrames are different

In [21]:
%timeit df_taxi_clean2 = df_taxi.loc[(df_taxi.total_amount > 0) & (df_taxi.passenger_count > 0),:]

154 ms ± 189 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


The alternative syntax using df.loc[row, col] gives identical results in the same time.

In [22]:
df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

In [23]:
pd.testing.assert_frame_equal(df_taxi_clean, df_taxi_clean3)

In [24]:
%timeit df_taxi_clean3 = df_taxi.query('total_amount > 0 and passenger_count > 0')

159 ms ± 272 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Yet another (rather exotic) filtering syntax, where the condition is given as a string containing a Python expression.

In [25]:
del df_taxi_clean2, df_taxi_clean3

### Calculations on Columns

In [26]:
# New column from an element-wise expression over existing columns;
# both column access styles (df['a'] and df.b) are used side by side
df_rdn['f'] = df_rdn['a'] + df_rdn.b * df_rdn.c * np.exp(df_rdn.d) / df_rdn.e
df_rdn['g'] = df_rdn.f.astype(np.int64) # type conversion (the fractional part is dropped, see output)
df_rdn.head()

Unnamed: 0,a,b,c,d,e,f,g
0,0.585391,-0.234348,-1.094889,0.795303,-1.20533,0.113848,0
1,-0.050897,-0.021503,0.765588,-0.531363,1.355724,-0.058035,0
2,0.036443,0.125581,0.538434,0.02305,-1.069896,-0.028231,0
3,-0.334732,0.760048,-1.233545,-0.09607,0.266424,-3.53141,-3
4,-1.176401,-1.178876,-0.411369,-1.674715,-0.623226,-1.322193,-1


Definition of a new column using vectorized calculations. Numpy functions can be used here (as described above, Pandas columns are actually numpy arrays).

The index makes sure that for operations with Series the rows are matched correctly. 

In [27]:
# Subtracting two datetime columns yields a Timedelta column
df_taxi['drive_time'] = df_taxi.lpep_dropoff_datetime - df_taxi.lpep_pickup_datetime
# total_seconds() covers the whole duration; .dt.seconds is only the sub-day seconds
# component, so it would drop full days and mis-handle negative durations
df_taxi['avg_speed'] = df_taxi.trip_distance / (df_taxi.drive_time.dt.total_seconds() / 3600)
df_taxi.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,drive_time,avg_speed
0,2,2018-01-01 00:18:50,2018-01-01 00:24:39,N,1,236,236,5,0.7,6.0,0.5,0.5,0.0,0.0,,0.3,7.3,2,1.0,00:05:49,7.22063
1,2,2018-01-01 00:30:26,2018-01-01 00:46:42,N,1,43,42,5,3.5,14.5,0.5,0.5,0.0,0.0,,0.3,15.8,2,1.0,00:16:16,12.909836
2,2,2018-01-01 00:07:25,2018-01-01 00:19:45,N,1,74,152,1,2.14,10.0,0.5,0.5,0.0,0.0,,0.3,11.3,2,1.0,00:12:20,10.410811
3,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,-3.0,-0.5,-0.5,0.0,0.0,,-0.3,-4.3,3,1.0,00:01:01,1.770492
4,2,2018-01-01 00:32:40,2018-01-01 00:33:41,N,1,255,255,1,0.03,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1.0,00:01:01,1.770492


Operations on Pandas Datetime and Timedelta objects.

### Apply Syntax

In [28]:
# Fixed random_state so that the same 1000 rows are drawn on every run
df_sample = df_taxi.sample(n=1000, random_state=0)

In [33]:
# Row-wise apply (axis=1): the lambda is called once per row and builds
# "<passenger_count>_<payment_type>" from that row's values
df_sample['useless_string'] = df_sample.apply(
    lambda row: str(row['passenger_count']) + '_' + str(row['payment_type']),
    axis=1,
)
df_sample['useless_string'].head()

623053    1_1
245438    1_2
120476    1_1
231643    1_4
354580    1_1
Name: useless_string, dtype: object

In [30]:
%timeit df_sample['useless_string']=df_sample.apply(lambda x: str(x['passenger_count'])+'_'+str(x['payment_type']),axis=1)

115 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Calculations which cannot be executed on whole columns at once can be applied element-wise (usually slower but more versatile than applying on whole column).

In [34]:
# Same result as the row-wise apply, but computed on whole columns at once
df_sample['useless_string2'] = (
    df_sample['passenger_count'].astype('str')
    + '_'
    + df_sample['payment_type'].astype('str')
)
df_sample['useless_string2'].head()

623053    1_1
245438    1_2
120476    1_1
231643    1_4
354580    1_1
Name: useless_string2, dtype: object

In [35]:
%timeit df_sample['useless_string2']=df_sample.passenger_count.astype('str')+'_'+df_sample.payment_type.astype('str')

7.03 ms ± 46.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
df_taxi.payment_type.unique()

array([2, 3, 1, 4, 5])

The same operation in vectorized syntax is more than 10 times faster.
However, the apply syntax is more flexible.

In [42]:
# Lookup table from payment_type code to a label.
# NOTE(review): the labels look illustrative; check them against the data
# provider's data dictionary before reusing them.
payments = {
    1: 'cash',
    2: 'credit card',
    3: 'debit card',
    4: 'gold',
    5: 'bill',
}
# Element-wise apply on a Series: codes missing from the dict map to None
df_sample['payment_type'].apply(lambda code: payments.get(code)).head()

623053           cash
245438    credit card
120476           cash
231643           gold
354580           cash
Name: payment_type, dtype: object

In [43]:
%timeit df_sample.payment_type.apply(lambda x: payments.get(x, None))

1.2 ms ± 6.36 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


The apply function can be used both on Pandas Series and DataFrames.

In [44]:
df_sample.payment_type.map(payments).head()

623053           cash
245438    credit card
120476           cash
231643           gold
354580           cash
Name: payment_type, dtype: object

In [45]:
%timeit df_sample.payment_type.map(payments)

1.65 ms ± 3.66 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Alternatively in this example, the map function can be used to apply a dictionary to a Pandas Series.

## Indices

## Gotchas

### Setting With Copy Error

When a DataFrame is first sliced and the result is modified afterwards, it is not clear whether the modification affects the original DataFrame or only a copy of it. Therefore, Pandas raises a SettingWithCopy warning.

https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-view-versus-copy

In [47]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
# Row selection first ...
df2 = df[df.a == 0]
# ... then a modification of the selection: this is the pattern that triggers the warning shown below
df2['c'] = df2.a + df2.b

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


If required (e.g. for easier identification of the line throwing the warning), the warning can be turned into an exception with the setting shown below.

It is recommended to use this setting for production code (put this line directly after Pandas import in each module).

In [48]:
pd.options.mode.chained_assignment = 'raise'
# From now on the pattern above raises SettingWithCopyError instead of only warning
# (pandas.core.common.SettingWithCopyError here; NOTE(review): newer pandas releases
# expose the class as pandas.errors.SettingWithCopyError - confirm for the installed version)

In [64]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
# The exception class is found under pd.errors in recent pandas and under pd.core.common
# in older releases; look it up in both places so the cell runs on either.
# The empty-tuple fallback makes the except clause match nothing if neither exists.
setting_with_copy_error = (
    getattr(pd.errors, 'SettingWithCopyError', None)
    or getattr(pd.core.common, 'SettingWithCopyError', None)
    or ()
)
try:
    df2 = df[df.a == 0]
    # with chained_assignment = 'raise' this assignment raises instead of warning
    df2['c'] = df2.a + df2.b
except setting_with_copy_error as e:
    print(e)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


To get rid of this error, use one of the following methods:

In [59]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
# Add the column to the full DataFrame first ...
df['c'] = df.a + df.b
# ... and select rows afterwards; the selection itself is not modified any more
df2 = df[df.a == 0]
df2.head()

Unnamed: 0,a,b,c
0,0,1,1


Modification before slicing - may be slow for large data sets. Use this if the calculated column is required outside of the selected rows, too.

In [62]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
# .copy() makes df2 an independent DataFrame, so modifying it is unambiguous
df2 = df[df.a == 0].copy()
df2['c'] = df2.a + df2.b
df2.head()

Unnamed: 0,a,b,c
0,0,1,1


Copy DataFrame - potentially large memory consumption, but it is ensured that the original DataFrame is not changed.

In [63]:
df = pd.DataFrame([{'a' :0, 'b':1},{'a':1, 'b':2}])
# .loc[row_mask, column] writes only into the rows where the mask is True;
# the other rows of the new column stay empty (NaN), see output
df.loc[df.a == 0, 'c'] = df.a + df.b
df.head()

Unnamed: 0,a,b,c
0,0,1,1.0
1,1,2,


Modify explicitly only the selected rows of the original dataframe. The new column for rows which are not selected is filled with NaN.

## Performance

### Pandas vs. List of Dictionaries

In [None]:
# Fresh load for the benchmark; datetime columns are selected by name rather than by
# position so that a changed column order cannot silently parse the wrong columns
df_taxi = pd.read_csv(csvfile, parse_dates=['lpep_pickup_datetime', 'lpep_dropoff_datetime'])

#### Pandas

In [None]:
def process_pd(df):
    """Add an ``amt_per_mile`` column (total_amount / trip_distance) to ``df``.

    The column is written into the passed DataFrame itself, and that same
    DataFrame is returned. Rows with a trip distance of zero get NaN, which
    is what ``process_dl`` produces for such rows as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``total_amount`` and ``trip_distance``.

    Returns
    -------
    pandas.DataFrame
        The input object, now carrying the additional column.
    """
    # Mask zero distances first: plain division would give +/-inf for x/0, not NaN
    distance = df['trip_distance'].where(df['trip_distance'] != 0)
    df['amt_per_mile'] = df['total_amount'] / distance
    return df

In [None]:
# process_pd returns the DataFrame it was given, so df_taxi2 and df_taxi are the same object
df_taxi2 = process_pd(df_taxi)
df_taxi2.head()

In [None]:
%timeit process_pd(df_taxi)

#### DictList

In [None]:
dict_list = df_taxi.to_dict('records')  # create list of dictionaries from Pandas DataFrame

In [None]:
dict_list[:2]

In [None]:
def process_dl(dl):
    """Add an ``amt_per_mile`` entry (total_amount / trip_distance) to every record.

    Each dictionary in ``dl`` is updated in place; a new list holding the same
    dictionary objects is returned. A zero trip distance results in NaN.
    """
    processed = []
    for record in dl:
        try:
            ratio = record['total_amount'] / record['trip_distance']
        except ZeroDivisionError:
            # plain Python division raises on zero, so substitute NaN explicitly
            ratio = float('nan')
        record['amt_per_mile'] = ratio
        processed.append(record)
    return processed

In [None]:
dict_list = process_dl(dict_list)

In [None]:
%timeit process_dl(dict_list)

Processing in Pandas is at least a factor 50 faster than for a list of dictionaries. Furthermore, code is simpler and memory requirements smaller.

In [None]:
del dict_list # free up memory