In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import pandas as pd
df = pd.read_csv('./NYCTaxiFares.csv')
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 7.3+ MB


In [4]:
#we need to use haversine formula to calculate distance between two points based on longitude and latitude
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [6]:
df['dist_km']= haversine_distance(df,'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [5]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [10]:
#we have 'pickup_datetime' column in object dtype we need to extract useful information from it
#also the time is in utc format, we need to convert it to EDT time 
df['EDTdate']=pd.to_datetime(df['pickup_datetime'].str[:19])-pd.Timedelta(hours=4)

In [11]:
df.head(3)

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26


In [12]:
df['Hour']= df['EDTdate'].dt.hour

In [13]:
df['AMorPM']= np.where(df['Hour']>12,'pm','am')
df['Weekday']= df['EDTdate'].dt.strftime('%a')
df.head(3)

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56,4,am,Mon
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53,11,am,Sat
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26,7,am,Sat


In [14]:
#now we separate the categorical and continuous-valued columns
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'Hour', 'AMorPM', 'Weekday'],
      dtype='object')

In [15]:
cat_cols = ['Hour', 'AMorPM', 'Weekday']
cont_cols= ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude','passenger_count', 'dist_km']
y_col = ['fare_amount']

In [16]:
#converting the dtype of cat_cols to categorical
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   pickup_datetime    120000 non-null  object        
 1   fare_amount        120000 non-null  float64       
 2   fare_class         120000 non-null  int64         
 3   pickup_longitude   120000 non-null  float64       
 4   pickup_latitude    120000 non-null  float64       
 5   dropoff_longitude  120000 non-null  float64       
 6   dropoff_latitude   120000 non-null  float64       
 7   passenger_count    120000 non-null  int64         
 8   dist_km            120000 non-null  float64       
 9   EDTdate            120000 non-null  datetime64[ns]
 10  Hour               120000 non-null  int64         
 11  AMorPM             120000 non-null  object        
 12  Weekday            120000 non-null  object        
dtypes: datetime64[ns](1), float64(6), int64(3), 

In [17]:
for cat in cat_cols:
    df[cat]= df[cat].astype('category')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   pickup_datetime    120000 non-null  object        
 1   fare_amount        120000 non-null  float64       
 2   fare_class         120000 non-null  int64         
 3   pickup_longitude   120000 non-null  float64       
 4   pickup_latitude    120000 non-null  float64       
 5   dropoff_longitude  120000 non-null  float64       
 6   dropoff_latitude   120000 non-null  float64       
 7   passenger_count    120000 non-null  int64         
 8   dist_km            120000 non-null  float64       
 9   EDTdate            120000 non-null  datetime64[ns]
 10  Hour               120000 non-null  category      
 11  AMorPM             120000 non-null  category      
 12  Weekday            120000 non-null  category      
dtypes: category(3), datetime64[ns](1), float64(6

In [19]:
df['Hour'].cat.categories

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23],
           dtype='int64')

In [20]:
df['Hour'].cat.codes

0          4
1         11
2          7
3         17
4         22
          ..
119995    10
119996     6
119997    14
119998     4
119999    12
Length: 120000, dtype: int8

In [21]:
df['Hour'].cat.categories.values

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23], dtype=int64)

In [22]:
hr = df['Hour'].cat.codes.values

In [23]:
hr

array([ 4, 11,  7, ..., 14,  4, 12], dtype=int8)

In [24]:
weekday= df['Weekday'].cat.codes.values
ampm = df['AMorPM'].cat.codes.values

cats = np.stack([hr,ampm,weekday], axis=1)
cats

array([[ 4,  0,  1],
       [11,  0,  2],
       [ 7,  0,  2],
       ...,
       [14,  1,  3],
       [ 4,  0,  5],
       [12,  0,  2]], dtype=int8)

In [25]:
conts= np.stack([df[con].values for con in cont_cols],1)


In [28]:
#converting np arrays to tensors
cats = torch.tensor(cats, dtype=torch.int64)
conts = torch.tensor(conts, dtype=torch.float32)
y= torch.tensor(df[y_col].values, dtype=torch.float32).reshape(-1,1)

conts

  cats = torch.tensor(cats, dtype=torch.int64)
  conts = torch.tensor(conts, dtype=torch.float32)


tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000,   2.1263],
        [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000,   1.3923],
        [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000,   3.3268],
        ...,
        [-73.9886,  40.7498, -74.0115,  40.7078,   3.0000,   5.0525],
        [-74.0044,  40.7245, -73.9927,  40.7308,   1.0000,   1.2089],
        [-73.9554,  40.7719, -73.9676,  40.7630,   3.0000,   1.4274]])

In [29]:
len(conts)

120000

In [30]:
#we would need to normalize the continous valued columns and do embedding for the categorical columns
#we need to find the embeddings size
cat_szs = [len(df[col].cat.categories) for col in cat_cols]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
emb_szs

[(24, 12), (2, 1), (7, 4)]

In [46]:
class TabularModel(nn.Module):
    def __init__(self,emb_szs,n_cont,out_sz,layers,p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layerslist = []
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_cont+n_emb
        for i in layers:
            layerslist.append(nn.Linear(n_in,i))
            layerslist.append(nn.ReLU(inplace=True))
            layerslist.append(nn.BatchNorm1d(i))
            layerslist.append(nn.Dropout(p))
            n_in=i
        
        layerslist.append(nn.Linear(layers[-1],out_sz))

        self.layers = nn.Sequential(*layerslist)

    def forward(self, x_cat, x_cont):
        #we must preprocess the categorical and continuous features by embedding and normalizing.
        embeddings=[]
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x= torch.cat(embeddings,1)
        x= self.emb_drop(x)

        x_cont= self.bn_cont(x_cont)
        x=torch.cat([x,x_cont],1)
        x = self.layers(x)
        return x


catz = cats[:4]
catz

In [42]:
embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
embeds[2](torch.tensor([4,1,3]))

tensor([[-0.8036, -0.7438,  0.9858,  2.0659],
        [-1.7208, -1.1857, -0.4929,  0.6283],
        [-0.8091, -0.1654, -1.0341, -1.2994]], grad_fn=<EmbeddingBackward0>)

In [47]:
model = TabularModel(emb_szs, conts.shape[-1], 1,[200,100], 0.4)

In [48]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [49]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [50]:
#train test split:
batch_size =60000
test_size = int(batch_size*.2)
test_size

12000

In [53]:
cat_train = cats[:batch_size-test_size]
cat_test = cats[batch_size-test_size:batch_size]

cont_train = conts[:batch_size-test_size]
conts_test=conts[batch_size-test_size:batch_size]

y_train = y[:batch_size-test_size]
y_test = y[batch_size-test_size:batch_size]

In [56]:
#training the model:
import time
start_time = time.time()
losses =[]
epochs=300
for i in range(epochs):
    y_pred= model(cat_train,cont_train)
    loss = torch.sqrt(criterion(y_pred,y_train))
    losses.append(loss)
    if i%25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


print(f'\nDuration: {time.time() - start_time:.0f} seconds')

epoch:   1  loss: 12.41162872
epoch:  26  loss: 10.74138355
epoch:  51  loss: 10.10084629
epoch:  76  loss: 9.66070938
epoch: 101  loss: 9.13043404
epoch: 126  loss: 8.36895657
epoch: 151  loss: 7.36783409
epoch: 176  loss: 6.18766928
epoch: 201  loss: 4.99095726
epoch: 226  loss: 4.15431118
epoch: 251  loss: 3.83959675
epoch: 276  loss: 3.71212673

Duration: 251 seconds


In [61]:
with torch.no_grad():
    y_val = model(cat_test,conts_test)
    loss = torch.sqrt(criterion(y_val,y_test))

print(loss)

tensor(3.6005)
