# Sample Data Exploration to check for inconsistencies

## Exploring the one-week sample with pandas to see what kind of data it contains, before repeating the checks in PySpark

In [179]:
# Notebook display limits: show every column, allow up to 3000 rows before
# truncating, and keep 1000 rows visible once a frame is truncated.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 3000)
pd.set_option("display.min_rows", 1000)

In [188]:
import numpy as np

We will use the csv file we downloaded from S3 and imported into jupyter (see notebook `1_ChiTaxi_DataCollection_CDP_Lambda_SFN.ipynb`), after the lambda function extracted data from the Chicago Data Portal and stored it as a csv file in S3

In [138]:
# Preview the first five rows of the sample.
temp_csv_check.head(n=5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_year,trip_month,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
0,7a21e28ce94ed9aed58559cf586363393c83031f,6fb5a8ee938d1b5889226c5c63543241224edf0ff04b42...,2019-07-28T23:45:00.000,2019-07-29T00:15:00.000,2019,7,1829.0,18.16,76,32,...,4.0,56.92,Credit Card,Flash Cab,41.980264,-87.913625,"{'type': 'Point', 'coordinates': [-87.91362459...",41.878866,-87.625192,"{'type': 'Point', 'coordinates': [-87.62519214..."
1,3bd419eaaa746ebda44d3c47bedc46c86450d17d,918e4b0b5ee31425b7c511d49dd73b371abdd9159c793d...,2019-07-28T23:45:00.000,2019-07-29T00:15:00.000,2019,7,1320.0,2.6,21,19,...,0.0,13.5,Cash,Taxi Affiliation Services,41.938666,-87.711211,"{'type': 'Point', 'coordinates': [-87.71121059...",41.927261,-87.765502,"{'type': 'Point', 'coordinates': [-87.76550160..."
2,67db20f5d5100771d8035bf483a68590cefb7e69,44367ae8c260038452e9d4e625548eb42e57456bb77ffd...,2019-07-28T23:45:00.000,2019-07-29T00:15:00.000,2019,7,1680.0,17.6,76,32,...,4.0,56.5,Credit Card,Star North Management LLC,41.979071,-87.90304,"{'type': 'Point', 'coordinates': [-87.90303966...",41.880994,-87.632746,"{'type': 'Point', 'coordinates': [-87.63274648..."
3,8743e94931d656be910ae0bc36644c9f906476bb,354fafd255c37a15b0c8f6f67d810b3e5374f54ebbadb9...,2019-07-28T23:45:00.000,2019-07-29T00:00:00.000,2019,7,1069.0,9.09,32,4,...,2.5,33.3,Credit Card,City Service,41.878866,-87.625192,"{'type': 'Point', 'coordinates': [-87.62519214...",41.975171,-87.687516,"{'type': 'Point', 'coordinates': [-87.68751551..."
4,9e820c3e904964ea5780d89c29c85b458f552990,343d099d6cca99a245757b878f39ef1777121a56028dea...,2019-07-28T23:45:00.000,2019-07-28T23:45:00.000,2019,7,239.0,0.57,8,8,...,1.0,8.5,Credit Card,City Service,41.893216,-87.637844,"{'type': 'Point', 'coordinates': [-87.63784420...",41.893216,-87.637844,"{'type': 'Point', 'coordinates': [-87.63784420..."


In [139]:
# Total number of rows in the sample.
temp_csv_check.shape[0]

272927

Things to check:
- [x] Table column types
- [x] If Trip_id is indeed unique
- [x] If any column has an "NA" in its values
- [x] Community areas with the most pickups and drop-offs, for a small section of the map
- [x] Check max and min fare, tips, tolls, extras and total
- [x] Check if there are cases where trip time or payment values are 0 or negative
- [X] Check unique types of payment, company

In [144]:
# List the dtype pandas holds for each column of the sample.
temp_csv_check.dtypes

trip_id                        object
taxi_id                        object
trip_start_timestamp           object
trip_end_timestamp             object
trip_year                       int64
trip_month                      int64
trip_seconds                  float64
trip_miles                    float64
pickup_community_area           int64
dropoff_community_area          int64
fare                          float64
tips                          float64
tolls                         float64
extras                        float64
trip_total                    float64
payment_type                   object
company                        object
pickup_centroid_latitude      float64
pickup_centroid_longitude     float64
pickup_centroid_location       object
dropoff_centroid_latitude     float64
dropoff_centroid_longitude    float64
dropoff_centroid_location      object
dtype: object

In [152]:
# Per-column summary: 'count' is the number of non-null values, 'size' is the
# total number of rows and 'nunique' is the number of distinct values.
# A column whose count is lower than its size contains nulls.
temp_csv_check.agg(['count','size','nunique'])

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_year,trip_month,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
count,272927,272799,272927,272927,272927,272927,272899,272919,272927,272927,272901,272901,272901,272901,272901,272927,272927,272927,272927,272927,272927,272927,272927
size,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927,272927
nunique,272927,4052,672,693,1,1,5391,2736,77,77,479,1067,17,214,2680,8,45,238,238,238,323,323,323


We see that the following columns have NULL values:
1. taxi_id
2. trip_seconds
3. trip_miles
4. trip_total (together with fare, tips, tolls and extras, which all share the same non-null count)
    
Let's explore them further

### taxi_id

In [184]:
# Rows where taxi_id is missing.
temp_csv_check.loc[temp_csv_check['taxi_id'].isna()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_year,trip_month,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
18704,fff5ca357a7e1a8147058338e552c84f624e882c,,2019-07-28T07:15:00.000,2019-07-28T07:30:00.000,2019,7,840.0,0.0,43,68,13.5,0.0,0.0,0.0,13.5,Unknown,Blue Ribbon Taxi Association Inc.,41.761578,-87.572782,"{'type': 'Point', 'coordinates': [-87.57278198...",41.777196,-87.642498,"{'type': 'Point', 'coordinates': [-87.64249752..."
23164,83b02d867f4146198c4309a31ad916d66f856f79,,2019-07-28T02:00:00.000,2019-07-28T02:15:00.000,2019,7,540.0,0.1,28,32,8.25,2.0,0.0,1.0,11.25,Credit Card,Taxi Affiliation Services,41.885281,-87.657233,"{'type': 'Point', 'coordinates': [-87.65723319...",41.880994,-87.632746,"{'type': 'Point', 'coordinates': [-87.63274648..."
23531,bfd8ecaf65f3cdee8e2feec3b07f20ee3b911ba7,,2019-07-28T01:45:00.000,2019-07-28T02:00:00.000,2019,7,840.0,0.3,20,28,15.5,0.0,0.0,0.0,15.5,Cash,Taxi Affiliation Services,41.924347,-87.73474,"{'type': 'Point', 'coordinates': [-87.73473975...",41.874005,-87.663518,"{'type': 'Point', 'coordinates': [-87.66351754..."
23838,8a28b2137bdd343d9ec0e9727cd1df5f5ebf9361,,2019-07-28T01:30:00.000,2019-07-28T01:30:00.000,2019,7,420.0,0.0,22,22,7.0,0.0,0.0,2.0,9.0,Cash,Taxi Affiliation Services,41.922761,-87.699155,"{'type': 'Point', 'coordinates': [-87.69915534...",41.922761,-87.699155,"{'type': 'Point', 'coordinates': [-87.69915534..."
25464,14673d86a2b6b81347c4d206f574204623c2981a,,2019-07-28T00:30:00.000,2019-07-28T01:15:00.000,2019,7,2820.0,0.4,8,21,25.75,0.0,0.0,1.0,26.75,Cash,Taxi Affiliation Services,41.899602,-87.633308,"{'type': 'Point', 'coordinates': [-87.63330803...",41.938666,-87.711211,"{'type': 'Point', 'coordinates': [-87.71121059..."
26164,615f630c23aa5adbf57f0463fd7c169a39a929d3,,2019-07-28T00:15:00.000,2019-07-28T00:15:00.000,2019,7,660.0,0.1,8,32,8.5,0.0,0.0,1.0,9.5,Cash,Taxi Affiliation Services,41.905858,-87.630865,"{'type': 'Point', 'coordinates': [-87.63086502...",41.884987,-87.620993,"{'type': 'Point', 'coordinates': [-87.62099291..."
27516,3be49f19bfab32bd808120f6121f89be2bb4a4d9,,2019-07-27T23:30:00.000,2019-07-28T00:00:00.000,2019,7,1200.0,0.2,8,7,14.75,0.0,0.0,1.0,15.75,Cash,Taxi Affiliation Services,41.890922,-87.618868,"{'type': 'Point', 'coordinates': [-87.61886835...",41.914747,-87.654007,"{'type': 'Point', 'coordinates': [-87.65400702..."
28138,ae633e43b0c1dcb46cb1fa1a2be50f73419fd07a,,2019-07-27T23:00:00.000,2019-07-27T23:15:00.000,2019,7,960.0,0.3,7,33,16.5,3.4,0.0,0.0,19.9,Credit Card,Taxi Affiliation Services,41.922686,-87.649489,"{'type': 'Point', 'coordinates': [-87.64948872...",41.857184,-87.620335,"{'type': 'Point', 'coordinates': [-87.62033462..."
29319,698de65c7dcb7b186d039772b196b595492fb5ca,,2019-07-27T22:30:00.000,2019-07-27T22:45:00.000,2019,7,1320.0,0.2,28,6,15.5,2.0,0.0,0.0,17.5,Credit Card,Taxi Affiliation Services,41.8853,-87.642808,"{'type': 'Point', 'coordinates': [-87.64280846...",41.949221,-87.65197,"{'type': 'Point', 'coordinates': [-87.65197039..."
32736,9cdc8ba347d8aa3666d49af8862c81d6b04cdeed,,2019-07-27T20:30:00.000,2019-07-27T20:45:00.000,2019,7,900.0,0.1,8,28,11.5,0.0,0.0,0.0,11.5,Cash,Taxi Affiliation Services,41.892508,-87.626215,"{'type': 'Point', 'coordinates': [-87.62621490...",41.879067,-87.657005,"{'type': 'Point', 'coordinates': [-87.65700502..."


In [185]:
# Number of rows where taxi_id is missing (sum of the boolean null mask).
int(temp_csv_check['taxi_id'].isna().sum())

128

In [195]:
# Summary statistics for the duration, distance and payment columns.
temp_csv_check.loc[
    :, ['trip_seconds', 'trip_miles', 'fare', 'tips', 'tolls', 'extras', 'trip_total']
].describe()

Unnamed: 0,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total
count,272899.0,272919.0,272901.0,272901.0,272901.0,272901.0,272901.0
mean,905.878098,3.403924,13.740803,1.758741,0.000523,1.039424,16.671594
std,1409.70266,5.363516,20.222235,2.886162,0.102603,27.82434,35.556084
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,369.0,0.6,6.25,0.0,0.0,0.0,7.5
50%,600.0,1.24,8.25,0.0,0.0,0.0,10.0
75%,1012.0,2.91,13.75,2.0,0.0,1.0,16.0
max,86340.0,333.3,7066.93,100.0,50.0,9555.0,9558.25


The trips with taxi_id = NaN do seem legitimate, but since they make up only 128 of the 272,927 trips (about 0.05%), we can remove them

### trip_seconds

In [196]:
# Rows where trip_seconds is missing.
temp_csv_check.loc[temp_csv_check['trip_seconds'].isna()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_year,trip_month,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
23777,134f9852ce0f4e033465af46222f67fe7b482bc3,b9365537031797f54f7e3c06ce06e3e22cf2bb223772cd...,2019-07-28T01:30:00.000,2019-07-31T17:00:00.000,2019,7,,3.8,8,28,13.25,3.0,0.0,0.0,16.25,Credit Card,Taxi Affiliation Services,41.899602,-87.633308,"{'type': 'Point', 'coordinates': [-87.63330803...",41.874005,-87.663518,"{'type': 'Point', 'coordinates': [-87.66351754..."
26178,db22e2856743a2e17ed59d5d96f3f0cbdd11cbfb,e8ebe1fb12cc70947d4031ad2982bd07dee6c4a3b7d381...,2019-07-28T00:15:00.000,2019-07-29T15:00:00.000,2019,7,,1.72,7,28,8.5,0.0,0.0,0.0,8.5,Cash,Nova Taxi Affiliation Llc,41.929047,-87.651311,"{'type': 'Point', 'coordinates': [-87.65131087...",41.885281,-87.657233,"{'type': 'Point', 'coordinates': [-87.65723319..."
37118,91dc3999f13ed714c341ad39895cc485df7ad39c,0d888febc2187a67daffb2a05d317e36a1739f84d1985e...,2019-07-27T18:15:00.000,2019-07-29T05:30:00.000,2019,7,,0.0,56,56,55.0,8.32,0.0,0.0,63.82,Credit Card,Flash Cab,41.785999,-87.750934,"{'type': 'Point', 'coordinates': [-87.75093428...",41.785999,-87.750934,"{'type': 'Point', 'coordinates': [-87.75093428..."
43416,9e899c687bde9170fdbca838234314635a5f2016,d6d458e42750ebec37cf83607ae045545538608a488d76...,2019-07-27T15:00:00.000,2019-07-29T09:30:00.000,2019,7,,17.08,76,76,44.0,0.0,0.0,5.0,49.0,Cash,Sun Taxi,41.979071,-87.90304,"{'type': 'Point', 'coordinates': [-87.90303966...",41.979071,-87.90304,"{'type': 'Point', 'coordinates': [-87.90303966..."
50815,0a031c32c1f3cd2858aabc2d7e2465611fd5ada2,347ea3e9f3630ab7c6c80c979ef536bceeecc8a49c7459...,2019-07-27T10:15:00.000,2019-07-29T11:00:00.000,2019,7,,0.0,15,15,3.25,0.0,0.0,0.0,3.25,Cash,Sun Taxi,41.954028,-87.763399,"{'type': 'Point', 'coordinates': [-87.76339903...",41.954028,-87.763399,"{'type': 'Point', 'coordinates': [-87.76339903..."
59469,0dbc78270da3180e5af26a717f4d4169b4da2659,df0730de65d455fc451fcb38dd627f877943cf490f5719...,2019-07-26T23:30:00.000,2019-07-29T06:45:00.000,2019,7,,1.36,28,33,14.25,0.0,0.0,0.0,14.25,Cash,Chicago Carriage Cab Corp,41.874005,-87.663518,"{'type': 'Point', 'coordinates': [-87.66351754...",41.857184,-87.620335,"{'type': 'Point', 'coordinates': [-87.62033462..."
63678,1929d94965b6debe15f142e8be6a568f76706372,c4b7d4a2434871020014e186ac8489b75620cc38c41f49...,2019-07-26T21:15:00.000,2019-07-29T07:45:00.000,2019,7,,0.0,7,7,0.0,0.0,0.0,0.0,0.0,Cash,Chicago Carriage Cab Corp,41.922686,-87.649489,"{'type': 'Point', 'coordinates': [-87.64948872...",41.922686,-87.649489,"{'type': 'Point', 'coordinates': [-87.64948872..."
63736,d95d091751813c4952edb8694c3de20751c440c2,c4b7d4a2434871020014e186ac8489b75620cc38c41f49...,2019-07-26T21:15:00.000,2019-07-29T08:45:00.000,2019,7,,20.06,7,16,51.0,0.0,0.0,2.5,53.5,Cash,Chicago Carriage Cab Corp,41.922686,-87.649489,"{'type': 'Point', 'coordinates': [-87.64948872...",41.953582,-87.723452,"{'type': 'Point', 'coordinates': [-87.72345239..."
64202,858a8fc49f6f5a006c433c7e92c1a53eabb13c62,c381062fedb173b99266e896dd47f45f966410f69a8cbc...,2019-07-26T21:00:00.000,2019-07-29T07:15:00.000,2019,7,,16.66,8,8,42.25,0.0,0.0,0.0,42.25,Cash,Patriot Taxi Dba Peace Taxi Associat,41.892508,-87.626215,"{'type': 'Point', 'coordinates': [-87.62621490...",41.890922,-87.618868,"{'type': 'Point', 'coordinates': [-87.61886835..."
70027,ab90bb3d4147cf47d388864e4e730bcecd070134,26211b46f6f6c6987bee5e2062934ba8692bced4811d1a...,2019-07-26T18:45:00.000,2019-07-28T09:15:00.000,2019,7,,1.98,32,8,8.25,0.0,0.0,0.0,8.25,Cash,Chicago Carriage Cab Corp,41.880994,-87.632746,"{'type': 'Point', 'coordinates': [-87.63274648...",41.900266,-87.632109,"{'type': 'Point', 'coordinates': [-87.63210921..."


In [197]:
# Number of rows where trip_seconds is missing (sum of the boolean null mask).
int(temp_csv_check['trip_seconds'].isna().sum())

28

The trips with trip_seconds = NaN do seem legitimate, but since they make up only 28 of the 272,927 trips (about 0.01%), we can remove them

### trip_miles

In [198]:
# Rows where trip_miles is missing.
temp_csv_check.loc[temp_csv_check['trip_miles'].isna()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_year,trip_month,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
25184,42994f1e06b2d715e95e003e6fc5f3800e17294b,f7630edc68a131c4a97382dd93206949da80ec1443d8da...,2019-07-28T00:45:00.000,2019-07-28T01:00:00.000,2019,7,1003.0,,6,24,,,,,,Cash,Medallion Leasin,41.946295,-87.654298,"{'type': 'Point', 'coordinates': [-87.65429808...",41.898306,-87.653614,"{'type': 'Point', 'coordinates': [-87.65361398..."
70154,3d60782f6758d6255f1c0d3a106fbb155956128e,485299324a5421f5327c141bd42091be7007124453d3f4...,2019-07-26T18:30:00.000,2019-07-26T19:15:00.000,2019,7,2564.0,,76,32,,,,,,Cash,Chicago Taxicab,41.979071,-87.90304,"{'type': 'Point', 'coordinates': [-87.90303966...",41.877406,-87.621972,"{'type': 'Point', 'coordinates': [-87.62197165..."
102629,e36100f747f24ac06bdc1ad52a808e66b4d6927b,b1244716a083ba8c4151173bf8d5e2d0fe69d61d92ee5d...,2019-07-26T00:15:00.000,2019-07-26T00:30:00.000,2019,7,1276.0,,32,1,,,,,,Credit Card,Medallion Leasin,41.878866,-87.625192,"{'type': 'Point', 'coordinates': [-87.62519214...",42.009623,-87.670167,"{'type': 'Point', 'coordinates': [-87.67016685..."
102905,8719c46c51d2371870bbbfc93ce76cd00d96b17e,4364f0c1355b9baf32e22c09883a7f4693a475d4a6bc9c...,2019-07-26T00:00:00.000,2019-07-26T00:15:00.000,2019,7,636.0,,32,6,,,,,,Cash,City Service,41.880994,-87.632746,"{'type': 'Point', 'coordinates': [-87.63274648...",41.942577,-87.647079,"{'type': 'Point', 'coordinates': [-87.64707850..."
128847,175402afc894320fef81798f4c8a415ea5c4c729,e56ea216ac8ba6fee5746eaa688882c0ae9c70e6fb702d...,2019-07-25T14:15:00.000,2019-07-25T14:30:00.000,2019,7,365.0,,32,8,,,,,,Cash,Medallion Leasin,41.884987,-87.620993,"{'type': 'Point', 'coordinates': [-87.62099291...",41.890922,-87.618868,"{'type': 'Point', 'coordinates': [-87.61886835..."
143966,511c3e04fb5f88881eeff1ff367859bcd03b3f29,ae0fb599586f950b7811c99713a56c03504a54c30c6cde...,2019-07-25T08:15:00.000,2019-07-25T08:15:00.000,2019,7,381.0,,7,8,0.0,0.0,0.0,0.0,0.0,Cash,Chicago Carriage Cab Corp,41.929263,-87.635891,"{'type': 'Point', 'coordinates': [-87.63589095...",41.90752,-87.626659,"{'type': 'Point', 'coordinates': [-87.62665890..."
197624,f9cba037c9868baac44c9a257956628ccc1eafac,42e7bc45af73f754adedc33abc0d5bbfc2456582e3300c...,2019-07-23T20:45:00.000,2019-07-23T21:15:00.000,2019,7,1616.0,,56,6,,,,,,Credit Card,City Service,41.785999,-87.750934,"{'type': 'Point', 'coordinates': [-87.75093428...",41.935989,-87.670966,"{'type': 'Point', 'coordinates': [-87.67096638..."
208694,6f56d991689c7caa046442bd3f2383bc3ef5e88c,97aa8cc2c0691d3c3b616f9aaad1722d1b46eb1da0c217...,2019-07-23T16:45:00.000,2019-07-23T17:30:00.000,2019,7,3122.0,,77,1,,,,,,Cash,Medallion Leasin,41.986712,-87.663416,"{'type': 'Point', 'coordinates': [-87.66341640...",42.009623,-87.670167,"{'type': 'Point', 'coordinates': [-87.67016685..."


In [199]:
# Number of rows where trip_miles is missing (sum of the boolean null mask).
int(temp_csv_check['trip_miles'].isna().sum())

8

In [212]:
# Rows where fare is missing.
temp_csv_check.loc[temp_csv_check['fare'].isna()]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_year,trip_month,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
25184,42994f1e06b2d715e95e003e6fc5f3800e17294b,f7630edc68a131c4a97382dd93206949da80ec1443d8da...,2019-07-28T00:45:00.000,2019-07-28T01:00:00.000,2019,7,1003.0,,6,24,,,,,,Cash,Medallion Leasin,41.946295,-87.654298,"{'type': 'Point', 'coordinates': [-87.65429808...",41.898306,-87.653614,"{'type': 'Point', 'coordinates': [-87.65361398..."
34380,a0374e35a64caa622f5e96a8cdd29a4b96d58e86,6b6ceb851bc1c33fcad40360a39cdd774c3269b1a13464...,2019-07-27T19:30:00.000,2019-07-27T19:30:00.000,2019,7,82.0,0.14,8,8,,,,,,Cash,Chicago Carriage Cab Corp,41.892508,-87.626215,"{'type': 'Point', 'coordinates': [-87.62621490...",41.890922,-87.618868,"{'type': 'Point', 'coordinates': [-87.61886835..."
44452,fd23dc3f3510065f8aae5efbd831f6daa4ee7289,99ff119b3d76460510fcacacfcc36d47f72e488829dda3...,2019-07-27T14:30:00.000,2019-07-27T14:45:00.000,2019,7,221.0,1.08,32,33,,,,,,Cash,Chicago Carriage Cab Corp,41.870607,-87.622173,"{'type': 'Point', 'coordinates': [-87.62217293...",41.85935,-87.617358,"{'type': 'Point', 'coordinates': [-87.61735800..."
60529,f42ea4b09d9a764b3dcd44381750ceda13c5e85c,99ff119b3d76460510fcacacfcc36d47f72e488829dda3...,2019-07-26T23:00:00.000,2019-07-26T23:30:00.000,2019,7,1456.0,15.54,56,41,,,,,,Cash,Chicago Carriage Cab Corp,41.792592,-87.769615,"{'type': 'Point', 'coordinates': [-87.76961545...",41.79409,-87.592311,"{'type': 'Point', 'coordinates': [-87.59231085..."
70154,3d60782f6758d6255f1c0d3a106fbb155956128e,485299324a5421f5327c141bd42091be7007124453d3f4...,2019-07-26T18:30:00.000,2019-07-26T19:15:00.000,2019,7,2564.0,,76,32,,,,,,Cash,Chicago Taxicab,41.979071,-87.90304,"{'type': 'Point', 'coordinates': [-87.90303966...",41.877406,-87.621972,"{'type': 'Point', 'coordinates': [-87.62197165..."
78587,e9e3d4df39a0b94d0548c07d2092fe38caf7e101,c7dd2a8e54f02a9762854c49715711972d03770089f1b1...,2019-07-26T16:00:00.000,2019-07-26T16:15:00.000,2019,7,339.0,0.98,8,32,,,,,,Cash,Sun Taxi,41.892042,-87.631864,"{'type': 'Point', 'coordinates': [-87.63186394...",41.880994,-87.632746,"{'type': 'Point', 'coordinates': [-87.63274648..."
81020,d7166daddf89f04810dd860945254677bb81fca3,99ff119b3d76460510fcacacfcc36d47f72e488829dda3...,2019-07-26T15:00:00.000,2019-07-26T15:00:00.000,2019,7,343.0,1.15,56,56,,,,,,Cash,Chicago Carriage Cab Corp,41.785999,-87.750934,"{'type': 'Point', 'coordinates': [-87.75093428...",41.785999,-87.750934,"{'type': 'Point', 'coordinates': [-87.75093428..."
86267,0913dbc341d38bf07694da7ee1b7d99df8b106d2,05af46c115bd8e274a8528cde1449db2a7cab3cd5739cc...,2019-07-26T13:15:00.000,2019-07-26T13:15:00.000,2019,7,26.0,0.0,77,77,,,,,,Cash,Sun Taxi,41.988704,-87.657235,"{'type': 'Point', 'coordinates': [-87.65723482...",41.988704,-87.657235,"{'type': 'Point', 'coordinates': [-87.65723482..."
102629,e36100f747f24ac06bdc1ad52a808e66b4d6927b,b1244716a083ba8c4151173bf8d5e2d0fe69d61d92ee5d...,2019-07-26T00:15:00.000,2019-07-26T00:30:00.000,2019,7,1276.0,,32,1,,,,,,Credit Card,Medallion Leasin,41.878866,-87.625192,"{'type': 'Point', 'coordinates': [-87.62519214...",42.009623,-87.670167,"{'type': 'Point', 'coordinates': [-87.67016685..."
102905,8719c46c51d2371870bbbfc93ce76cd00d96b17e,4364f0c1355b9baf32e22c09883a7f4693a475d4a6bc9c...,2019-07-26T00:00:00.000,2019-07-26T00:15:00.000,2019,7,636.0,,32,6,,,,,,Cash,City Service,41.880994,-87.632746,"{'type': 'Point', 'coordinates': [-87.63274648...",41.942577,-87.647079,"{'type': 'Point', 'coordinates': [-87.64707850..."


In [200]:
# Number of rows where fare is missing (sum of the boolean null mask).
int(temp_csv_check['fare'].isna().sum())

26

<font color=#FF0000>**In conclusion, we will remove all rows where any of the columns below is NULL:**</font>
1. taxi_id
2. trip_seconds
3. trip_miles
4. trip_total

## Checking the number of rows for which trip_miles, trip_seconds and trip_total are 0

### trip_miles

In [221]:
# Frequency of each distinct trip_miles value, ordered by the value itself so
# the shortest distances (including 0.0) come first.
# rename_axis / reset_index(name=...) pin the column names to
# ('trip_miles', 'count'). Before pandas 2.0 a bare reset_index() named them
# ('index', 'trip_miles'), so sort_values('trip_miles') silently sorted by
# frequency instead of by distance.
(
    temp_csv_check['trip_miles']
    .value_counts()
    .rename_axis('trip_miles')
    .reset_index(name='count')
    .sort_values('trip_miles')
)

Unnamed: 0,trip_miles,count
0,0.0,40257
205,0.01,240
237,0.02,190
292,0.03,138
356,0.04,103
398,0.05,89
451,0.06,72
450,0.07,72
409,0.08,84
501,0.09,61


15% of the trips have trip_miles = 0.00

But given that distance is recorded to a precision of 0.01 miles, the 0.00-mile trips must be considered outliers, since trips shorter than about 16 metres (0.01 miles ≈ 53 feet) don't make real-world sense

### trip_seconds

In [215]:
# Frequency of each distinct trip_seconds value, ordered by the value itself so
# the shortest durations (including 0.0) come first.
# rename_axis / reset_index(name=...) pin the column names to
# ('trip_seconds', 'count'). Before pandas 2.0 a bare reset_index() named them
# ('index', 'trip_seconds'), so sort_values('trip_seconds') silently sorted by
# frequency instead of by duration.
(
    temp_csv_check['trip_seconds']
    .value_counts()
    .rename_axis('trip_seconds')
    .reset_index(name='count')
    .sort_values('trip_seconds')
)

Unnamed: 0,trip_seconds,count
12,0.0,3846
2707,1.0,8
1588,2.0,19
2527,3.0,9
786,4.0,65
569,5.0,100
720,6.0,75
643,7.0,88
679,8.0,82
801,9.0,64


About 1.4% of the trips (3,846 of 272,927) have trip_seconds = 0.0

In [219]:
# Number of trips recorded with a duration of exactly 0 seconds.
int(temp_csv_check['trip_seconds'].eq(0).sum())

3846

In [222]:
# Number of trips shorter than one minute (missing durations are not counted).
int(temp_csv_check['trip_seconds'].lt(60).sum())

4356

About 1.6% of the trips (4,356 of 272,927) have trip_seconds < 60s

### trip_total

In [223]:
# Frequency of each distinct trip_total value, ordered by the value itself so
# the smallest totals (including 0.0) come first.
# rename_axis / reset_index(name=...) pin the column names to
# ('trip_total', 'count'). Before pandas 2.0 a bare reset_index() named them
# ('index', 'trip_total'), so sort_values('trip_total') silently sorted by
# frequency instead of by amount.
(
    temp_csv_check['trip_total']
    .value_counts()
    .rename_axis('trip_total')
    .reset_index(name='count')
    .sort_values('trip_total')
)

Unnamed: 0,trip_total,count
361,0.0,79
868,0.01,11
2611,0.03,1
2602,0.1,1
2332,0.14,1
2328,0.27,1
1505,1.0,3
2670,1.01,1
1084,2.0,6
2541,2.75,1


### Cases with all 3 values = 0

In [225]:
# Number of trips where distance, duration and total are all exactly 0.
int(
    (temp_csv_check[['trip_miles', 'trip_seconds', 'trip_total']] == 0)
    .all(axis=1)
    .sum()
)

13

## Handling Outliers

<font color=#FF0000>**We should restrict the data to trips having:**</font>
1. trip_miles > 0 and < 40
2. trip_seconds > 10 and < 15000
3. fare < 400

In [145]:
# Number of trips starting in each community area, most frequent first.
temp_csv_check['pickup_community_area'].value_counts()

pickup_community_area
8     90181
32    79114
28    31923
76    22992
6      7743
      ...  
52       16
74       15
54       14
55       13
9         9
Name: count, Length: 77, dtype: int64

In [146]:
# Number of trips ending in each community area, most frequent first.
temp_csv_check['dropoff_community_area'].value_counts()

dropoff_community_area
8     86350
32    65360
28    33794
7     12203
76    11268
      ...  
54       28
62       26
74       22
52       13
55        9
Name: count, Length: 77, dtype: int64

In [227]:
# Number of trips for every (pickup, dropoff) community-area pair.
# The built-in 'size' aggregation counts the rows in each group just as len
# did, without calling a Python function once per group.
(
    temp_csv_check
    .groupby(['pickup_community_area', 'dropoff_community_area'])
    .agg(tot_pair=('trip_id', 'size'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,tot_pair
pickup_community_area,dropoff_community_area,Unnamed: 2_level_1
1,1,302
1,2,146
1,3,86
1,4,32
1,5,3
1,6,49
1,7,16
1,8,71
1,11,1
1,12,1


In [228]:
# Number of trips per payment type, most frequent first.
temp_csv_check['payment_type'].value_counts()

payment_type
Cash           135731
Credit Card    127796
Mobile           3919
Prcard           2795
Unknown          2117
No Charge         474
Dispute            83
Prepaid            12
Name: count, dtype: int64

In [230]:
# Number of trips with a total of exactly 0 whose payment type is not "No Charge".
int(
    (
        temp_csv_check['payment_type'].ne("No Charge")
        & temp_csv_check['trip_total'].eq(0)
    ).sum()
)

77

In [231]:
# Number of trips per taxi company, most frequent first.
temp_csv_check['company'].value_counts()

company
Taxi Affiliation Services                       63590
Flash Cab                                       37992
Chicago Carriage Cab Corp                       31525
Medallion Leasin                                21382
Sun Taxi                                        20866
City Service                                    19108
Star North Management LLC                       14222
Blue Ribbon Taxi Association Inc.               13993
Choice Taxi Association                          8213
Taxicab Insurance Agency, LLC                    6597
Taxi Affiliation Service Yellow                  5726
Nova Taxi Affiliation Llc                        5215
Top Cab Affiliation                              4000
Chicago Independents                             3526
Patriot Taxi Dba Peace Taxi Associat             3308
24 Seven Taxi                                    2778
Chicago Medallion Management                     2151
KOAM Taxi Association                            2022
Checker Taxi Affilia