In [1]:
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
answers = {}

In [3]:
# Load dataframe
data_january = 'fhv_tripdata_2021-01.parquet'
df = pd.read_parquet('./data/' + data_january)

In [4]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [5]:
# Question 1: Read the data for January. How many records are there?
answers['q1'] = df.shape[0]
answers

{'q1': 1154112}

In [6]:
# calculate duration and add as columns
df['duration'] = df.dropOff_datetime - df.pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [7]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [8]:
# Question 2: What's the average trip duration in January?

answers['q2'] =  df.duration.mean()
answers

{'q1': 1154112, 'q2': 19.167224093791006}

In [9]:
df['duration'].describe(percentiles=[.90, .95, .97]).apply(lambda x: format(x, 'f'))

count    1154112.000000
mean          19.167224
std          398.692165
min            0.016667
50%           13.400000
90%           35.633333
95%           47.250000
97%           57.750000
max       423371.050000
Name: duration, dtype: object

In [10]:
rows_before_dropping = df.shape[0]
df = df[ (df['duration'] > 1) & (df['duration'] <= 60)]
rows_after_dropping = df.shape[0]

print(f'Dropped {rows_before_dropping - rows_after_dropping} rows')
df.head()

Dropped 44907 rows


Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,,71.0,,B00037,9.05


In [11]:
# Question 3: What's the factions of missing values for the pickup location ID? (Or the fraction of "-1"s after you filled the NAs)

nonnull_values_count = df['PUlocationID'].count()
null_values_count = df['PUlocationID'].isnull().sum()

null_values_count / (nonnull_values_count + null_values_count)


0.8354190613998315

In [12]:
df['PUlocationID'].fillna(-1, inplace=True)
df['DOlocationID'].fillna(-1, inplace=True)

In [15]:
missing_values = (df['PUlocationID'] == -1).sum() /df['PUlocationID'].count()
answers['q3'] = missing_values
answers

{'q1': 1154112, 'q2': 19.167224093791006, 'q3': 0.8354190613998315}

In [18]:
train_dicts = df[['PUlocationID', 'DOlocationID']].to_dict()
train_dicts

{'PUlocationID': {0: -1.0,
  1: -1.0,
  3: -1.0,
  4: -1.0,
  5: -1.0,
  6: -1.0,
  7: -1.0,
  8: -1.0,
  9: -1.0,
  10: -1.0,
  11: -1.0,
  12: -1.0,
  13: -1.0,
  14: -1.0,
  15: -1.0,
  17: -1.0,
  18: -1.0,
  20: -1.0,
  22: -1.0,
  23: -1.0,
  25: -1.0,
  26: -1.0,
  27: -1.0,
  28: -1.0,
  29: -1.0,
  30: -1.0,
  31: -1.0,
  33: -1.0,
  34: -1.0,
  35: -1.0,
  36: -1.0,
  37: -1.0,
  38: -1.0,
  39: 236.0,
  40: -1.0,
  41: -1.0,
  42: -1.0,
  43: -1.0,
  44: -1.0,
  45: -1.0,
  46: -1.0,
  47: -1.0,
  48: -1.0,
  49: -1.0,
  50: -1.0,
  51: 196.0,
  52: -1.0,
  53: -1.0,
  54: -1.0,
  55: -1.0,
  56: -1.0,
  57: -1.0,
  58: -1.0,
  59: -1.0,
  60: -1.0,
  61: -1.0,
  62: -1.0,
  63: -1.0,
  64: -1.0,
  65: -1.0,
  66: -1.0,
  67: -1.0,
  68: -1.0,
  69: -1.0,
  70: -1.0,
  71: -1.0,
  72: -1.0,
  73: -1.0,
  74: -1.0,
  75: -1.0,
  76: -1.0,
  77: -1.0,
  78: -1.0,
  79: -1.0,
  80: -1.0,
  81: -1.0,
  82: -1.0,
  83: -1.0,
  84: -1.0,
  85: -1.0,
  86: -1.0,
  91: 252.0,
  92: 