# Python Asserts

## Validating and Verifying Data

* [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)

Yellow and green taxi trip records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. The data used in the attached datasets were collected and provided to the NYC Taxi and Limousine Commission (TLC) by technology providers authorized under the Taxicab & Livery Passenger Enhancement Programs (TPEP/LPEP). The trip data was not created by the TLC, and TLC makes no representations as to the accuracy of these data.

## Datasets

In [None]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -P dataset

In [38]:
import csv
import math
import numpy as np
import numpy.testing as npt
import pandas as pd
import pandas.testing as pdt

In [29]:
yellow_tripdata_df = pd.read_parquet(
    'dataset/yellow_tripdata_2023-01.parquet'
)

yellow_tripdata_df.to_csv('dataset/yellow_tripdata_2023-01.csv')

In [39]:
yellow_tripdata_df = pd.read_csv(
    'dataset/yellow_tripdata_2023-01.csv',
    parse_dates=['tpep_pickup_datetime','tpep_dropoff_datetime'],
    nrows=1000
)
yellow_tripdata_df.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,0,0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,...,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,1,1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,...,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,...,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,3,3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,...,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,4,4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,...,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [43]:
# https://www.kaggle.com/datasets/neomatrix369/nyc-taxi-trip-duration-extended
trip_ext_df = pd.read_csv('dataset/nyc_trip_duration_extended.csv')
trip_ext_df.head(5)

Unnamed: 0,name,district,neighbourhood,latitude,longitude,geonumber
0,Beautiful Well Kept Private Home!,Staten Island,Tottenville,40.49979,-74.24084,23415.72136
1,Cozy Apartment,Staten Island,Tottenville,40.50641,-74.23059,23416.92321
2,Villa Digioia Visit Nyc Via Si,Staten Island,Tottenville,40.50708,-74.24285,23417.03155
3,"Beautiful 4Br/4Ba Home, Staten Island, Ny City.",Staten Island,Tottenville,40.50868,-74.23986,23417.32254
4,Cozy Getaway,Staten Island,Tottenville,40.50873,-74.23914,23417.33226


## Introduction to Asserts

### Asserts in Python

In [9]:
# simple assert
x = 'five'
assert x == 5
# AssertionError

AssertionError: 

In [14]:
list = [6,2,3,4,5]
assert all(list[i] <= list[i+1] for i in range(len(list)-1))
# AssertionError

AssertionError: 

In [17]:
def add(a,b):
    return a + b

assert add(2,3) < 5
AssertionError

AssertionError: 

In [20]:
trip_ext_df.columns

# Index(['name', 'district', 'neighbourhood', 'latitude', 'longitude',
#        'geonumber'],
#       dtype='object')

Index(['name', 'district', 'neighbourhood', 'latitude', 'longitude',
       'geonumber'],
      dtype='object')

In [22]:
with open('dataset/nyc_trip_duration_extended.csv') as f:
    reader = csv.DictReader(f)
    
    expected_columns = ['name', 'district', 'neighbourhood', 'latitude', 'longitude',
       'geonumber', 'missing_column']
    
    assert reader.fieldnames == expected_columns, f"Expected columns: {expected_columns}, but got {reader.fieldnames}"
    
# AssertionError:
# Expected columns: ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber', 'missing_column'],
# but got ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber']


AssertionError: Expected columns: ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber', 'missing_column'], but got ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber']

In [19]:
with open('dataset/yellow_tripdata_2023-01.csv') as f:
    reader = csv.DictReader(f)
    
    for row in reader:
        # check passenger count is positive int
        assert float(row['passenger_count']) > 0., f"ERROR :: Invalid Passenger Count: {row['passenger_count']}"
        
# AssertionError: ERROR :: Invalid Passenger Count: 0.0

AssertionError: ERROR :: Invalid Passenger Count: 0.0

In [22]:
# how many trips were without passengers
trips_without_passengers = 0

with open('dataset/yellow_tripdata_2023-01.csv') as f:
    next(f) # # skip header
    
    for line in f:
        values = line.strip().split(',')
        trip_id = values[0]
        passenger_count = values[4]
        if passenger_count == '0.0':
            trips_without_passengers += 1
            
print(f'Trips without passengers: {trips_without_passengers}')
# Trips without passengers: 51164

Trips without passengers: 51164


In [41]:
perct_zero_trips = len(zero_trips) * 100 / len(yellow_tripdata_df)
print("%.2f" % perct_zero_trips + ' %')
# 1.67 %

1.67 %


### Asserts in Pandas

#### Indices

In [6]:
index1 = pd.Index([1,2,3])
index2 = pd.Index([1,2,'three'])

pdt.assert_index_equal(index1, index2)

# Index classes are different
# [left]:  Int64Index([1, 2, 3], dtype='int64')
# [right]: Index([1, 2, 'three'], dtype='object')

AssertionError: Index are different

Index classes are different
[left]:  Int64Index([1, 2, 3], dtype='int64')
[right]: Index([1, 2, 'three'], dtype='object')

In [8]:
index1 = pd.Index([1,2,3])
index2 = pd.Index([3,2,1])

pdt.assert_index_equal(index1, index2, check_order=True)
# Index values are different (66.66667 %)
# [left]:  Int64Index([1, 2, 3], dtype='int64')
# [right]: Int64Index([3, 2, 1], dtype='int64')

AssertionError: Index are different

Index values are different (66.66667 %)
[left]:  Int64Index([1, 2, 3], dtype='int64')
[right]: Int64Index([3, 2, 1], dtype='int64')

In [21]:
index1 = pd.Index([1.0,2.0,3.0])
index2 = pd.Index([1.0,2.0,3.1])

pdt.assert_index_equal(index1, index2, check_exact=False, atol=0.1)

# Index values are different (33.33333 %)
# [left]:  Float64Index([1.0, 2.0, 3.0], dtype='float64')
# [right]: Float64Index([1.0, 2.0, 3.1], dtype='float64')

AssertionError: Index are different

Index values are different (33.33333 %)
[left]:  Float64Index([1.0, 2.0, 3.0], dtype='float64')
[right]: Float64Index([1.0, 2.0, 3.1], dtype='float64')

In [33]:
index1 = pd.Index(yellow_tripdata_df['tpep_pickup_datetime'].dt.date)
index1[:3]
# Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_pickup_datetime')

Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_pickup_datetime')

In [34]:
index2 = pd.Index(yellow_tripdata_df['tpep_dropoff_datetime'].dt.date)
index2[:3]
# Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_dropoff_datetime')

Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_dropoff_datetime')

In [35]:
pdt.assert_index_equal(index1, index2, check_exact=True, check_names=False)

# Index values are different (0.3 %)
# [left]:  Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        ...
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
#       dtype='object', name='tpep_pickup_datetime', length=1000)
# [right]: Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        ...
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
#       dtype='object', name='tpep_dropoff_datetime', length=1000)

AssertionError: Index are different

Index values are different (0.3 %)
[left]:  Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
       2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
       ...
       2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
       2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
      dtype='object', name='tpep_pickup_datetime', length=1000)
[right]: Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
       2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
       ...
       2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
       2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
      dtype='object', name='tpep_dropoff_datetime', length=1000)

In [39]:
# show difference
index_diff = yellow_tripdata_df[
    index1 != index2
][
    ['tpep_pickup_datetime','tpep_dropoff_datetime']
]

index_diff.head()

#     tpep_pickup_datetime tpep_dropoff_datetime
# 383  2023-01-01 00:36:07   2023-01-02 00:17:13
# 567  2022-12-31 23:59:37   2023-01-01 00:07:28
# 761  2022-12-31 23:58:27   2023-01-01 00:02:21

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime
383,2023-01-01 00:36:07,2023-01-02 00:17:13
567,2022-12-31 23:59:37,2023-01-01 00:07:28
761,2022-12-31 23:58:27,2023-01-01 00:02:21


#### Series

In [41]:
s1 = pd.Series([1,2,3], name='series1')
s2 = pd.Series([1,2,3], name='series2')

pdt.assert_series_equal(s1,s2,check_names=True)

# Attribute "name" are different
# [left]:  series1
# [right]: series2

AssertionError: Series are different

Attribute "name" are different
[left]:  series1
[right]: series2

In [3]:
s1 = pd.Series([1,2,3], name='series1')
s2 = pd.Series(['1','2','3'], name='series2')

pdt.assert_series_equal(s1,s2,check_names=False, check_dtype=False)

# Series values are different (100.0 %)
# [index]: [0, 1, 2]
# [left]:  [1, 2, 3]
# [right]: [1, 2, 3]

AssertionError: Series are different

Series values are different (100.0 %)
[index]: [0, 1, 2]
[left]:  [1, 2, 3]
[right]: [1, 2, 3]

In [5]:
pickup_series = yellow_tripdata_df['tpep_pickup_datetime'].dt.date
pickup_series.head(2)

# 0    2023-01-01
# 1    2023-01-01
# Name: tpep_pickup_datetime, dtype: object

0    2023-01-01
1    2023-01-01
Name: tpep_pickup_datetime, dtype: object

In [6]:
dropoff_series = yellow_tripdata_df['tpep_dropoff_datetime'].dt.date
pickup_series.head(2)

# 0    2023-01-01
# 1    2023-01-01
# Name: tpep_pickup_datetime, dtype: object

0    2023-01-01
1    2023-01-01
Name: tpep_pickup_datetime, dtype: object

In [7]:
pdt.assert_series_equal(pickup_series, dropoff_series, check_exact=True, check_names=False)

# AssertionError: Series are different
# Series values are different (0.3 %)

AssertionError: Series are different

Series values are different (0.3 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
[left]:  [2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2022-12-31, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, ...]
[right]: [2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2022-12-31, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, ...]

In [10]:
# drop values that don't fit
index_diff = pickup_series.index[pickup_series != dropoff_series]
yellow_tripdata_df_drop = yellow_tripdata_df.drop(index_diff).reset_index(drop=True)
# rebuild series
pickup_series = yellow_tripdata_df_drop['tpep_pickup_datetime'].dt.date
dropoff_series = yellow_tripdata_df_drop['tpep_dropoff_datetime'].dt.date
# re-check - this times it works
pdt.assert_series_equal(pickup_series, dropoff_series, check_exact=True, check_names=False)

#### DataFrames

In [15]:
df1 = pd.DataFrame({'A': [1,2,3], 'B': [3,2,1]})
df2 = pd.DataFrame({'B': [3,2,1], 'A': [1,2,3]})

pdt.assert_frame_equal(df1,df2,check_like=False)

# DataFrame.columns values are different (100.0 %)
# [left]:  Index(['A', 'B'], dtype='object')
# [right]: Index(['B', 'A'], dtype='object')

AssertionError: DataFrame.columns are different

DataFrame.columns values are different (100.0 %)
[left]:  Index(['A', 'B'], dtype='object')
[right]: Index(['B', 'A'], dtype='object')

In [18]:
pickup_df = yellow_tripdata_df.copy()
dropoff_df = yellow_tripdata_df.copy()

pickup_df['date'] = yellow_tripdata_df['tpep_pickup_datetime'].dt.date
dropoff_df['date'] = yellow_tripdata_df['tpep_dropoff_datetime'].dt.date

pdt.assert_frame_equal(
    pickup_df[['date']],
    dropoff_df[['date']],
    check_exact=True,
    check_names=False
)

# AssertionError: DataFrame.iloc[:, 0] (column name="date") are different
# DataFrame.iloc[:, 0] (column name="date") values are different (0.3 %)

AssertionError: DataFrame.iloc[:, 0] (column name="date") are different

DataFrame.iloc[:, 0] (column name="date") values are different (0.3 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
[left]:  [2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2022-12-31, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, ...]
[right]: [2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2022-12-31, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, ...]

In [21]:
# get index of mismatched rows
index_diff = pickup_df.index[
    pickup_df['date'].ne(dropoff_df['date'])
]

index_diff
# Int64Index([383, 567, 761], dtype='int64')

Int64Index([383, 567, 761], dtype='int64')

In [22]:
# drop rows at those indices
pickup_df_drop = pickup_df.drop(index_diff)
dropoff_df_drop = dropoff_df.drop(index_diff)

# verify that assert now works
pdt.assert_frame_equal(
    pickup_df_drop[['date']],
    dropoff_df_drop[['date']],
    check_exact=True,
    check_names=False
)

### Asserts in Numpy

In [28]:
a = np.array([1,2,3])
b = np.array([3,2,1])

npt.assert_array_equal(a,b)

# AssertionError: Arrays are not equal
# Mismatched elements: 2 / 3 (66.7%)
# Max absolute difference: 2
# Max relative difference: 2.
#  x: array([1, 2, 3])
#  y: array([3, 2, 1])

AssertionError: 
Arrays are not equal

Mismatched elements: 2 / 3 (66.7%)
Max absolute difference: 2
Max relative difference: 2.
 x: array([1, 2, 3])
 y: array([3, 2, 1])

In [26]:
string1 = 'string'
string2 = 'STRING'

npt.assert_string_equal(string1,string2)

AssertionError: Differences in strings:
- string+ STRING

In [32]:
a = np.array([1,2,3])
b = np.array([0.98,2.02,2.98])

npt.assert_allclose(a,b,atol=0.01)

# AssertionError: Not equal to tolerance rtol=1e-07, atol=0.01
# Mismatched elements: 3 / 3 (100%)
# Max absolute difference: 0.02
# Max relative difference: 0.02040816
# x: array([1, 2, 3])
# y: array([0.98, 2.02, 2.98])

AssertionError: 
Not equal to tolerance rtol=1e-07, atol=0.01

Mismatched elements: 3 / 3 (100%)
Max absolute difference: 0.02
Max relative difference: 0.02040816
 x: array([1, 2, 3])
 y: array([0.98, 2.02, 2.98])

In [36]:
a = np.array([1,2,3])
b = np.array([2,3,4])

npt.assert_array_less(a,b)

In [37]:
npt.assert_array_less(b,a)

# AssertionError: Arrays are not less-ordered
# Mismatched elements: 3 / 3 (100%)
# Max absolute difference: 1
# Max relative difference: 1.
#  x: array([2, 3, 4])
#  y: array([1, 2, 3])

AssertionError: 
Arrays are not less-ordered

Mismatched elements: 3 / 3 (100%)
Max absolute difference: 1
Max relative difference: 1.
 x: array([2, 3, 4])
 y: array([1, 2, 3])

## Assert-based Testing

### Quantitative Tests

In [40]:
def test_for_missing_data(df):
    # count all missing values and assert number to be zero
    assert df.isnull().sum().sum() == 0, 'ERROR :: DataFrame contains missing data!'
    return True

In [66]:
assert test_for_missing_data(trip_ext_df)
# True

In [47]:
def test_non_numerical_data_types(df, columns):
    for col in columns:
        assert df[col].dtype == 'int64' or df[col].dtype =='float64', f'ERROR :: {col} has a non-numerical dType'
    return True

In [65]:
test_columns = ['neighbourhood','latitude','longitude','geonumber']
assert test_non_numerical_data_types(trip_ext_df, trip_ext_df[test_columns])
# AssertionError: ERROR :: neighbourhood has a non-numerical dType

AssertionError: ERROR :: neighbourhood has a non-numerical dType

In [52]:
def test_for_out_of_range(df, columns):
    for col in columns:
        assert df[col].dtype == 'int64' or df[col].dtype == 'float64', f'ERROR :: {col} has a non-numerical dType'
        assert df[col].max() <= math.inf, f'ERROR :: {col} contains infinite values'
        assert df[col].min() >= -math.inf, f'ERROR :: {col} contains infinite values'
        assert not np.isnan(df[col]).any(), f'ERROR :: {col} contains NaN values'
        assert not np.isinf(df[col]).any(), f'ERROR :: {col} contains infinite values'
    return True

In [67]:
test_columns = ['latitude','longitude','geonumber']
assert test_non_numerical_data_types(trip_ext_df, trip_ext_df[test_columns])
# True

### Logical Tests

In [70]:
def test_for_logical_errors(df):
    # all dropoffs AFTER pickups
    assert all(df['tpep_dropoff_datetime'] > df['tpep_pickup_datetime']), 'ERROR :: Drop-off time before pickup'
    # no negative trip distances
    assert (df['trip_distance'] >= 0).all(), 'ERROR :: Negative trip distances'
    # no negative passenger count
    assert (df['passenger_count'] >= 0).all(), 'ERROR :: Negative passenger count'
    
    return True

In [71]:
assert test_for_logical_errors(yellow_tripdata_df)
# True