In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [2]:
# Load the NYC taxi fares CSV into the DataFrame `df` that the later cells work on.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs on a
# machine where that exact file exists — consider a path relative to a configurable
# data directory instead.
df = pd.read_csv("/home/faustin/Desktop/Pytorch/PYTORCH_NOTEBOOKS/Data/NYCTaxiFares.csv")

FileNotFoundError: ignored

In [None]:
df.head()

In [None]:
df['fare_amount'].describe()

In [None]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine (great-circle) distance between 2 sets of GPS coordinates in df.

    Parameters
    ----------
    df : DataFrame-like
        Object indexable by column label (e.g. a pandas DataFrame) whose
        coordinate columns hold numeric values in degrees.
    lat1, long1 : label
        Columns holding the latitude / longitude of the first point.
    lat2, long2 : label
        Columns holding the latitude / longitude of the second point.

    Returns
    -------
    Element-wise distance in kilometers, computed with a mean Earth radius
    of 6371 km. Missing coordinates propagate as NaN.
    """
    r = 6371  # average radius of Earth in kilometers

    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])

    delta_phi = np.radians(df[lat2] - df[lat1])
    delta_lambda = np.radians(df[long2] - df[long1])

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a hair
    # past 1 for (near-)antipodal points, which would make sqrt(1 - a) NaN.
    # Clamp so those rows yield half the circumference instead of NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return r * c  # in kilometers

In [None]:
# Add the pickup -> dropoff great-circle distance (km) as a new column, then preview.
df['dist_km'] = haversine_distance(
    df,
    lat1='pickup_latitude',
    long1='pickup_longitude',
    lat2='dropoff_latitude',
    long2='dropoff_longitude',
)
df.head()

In [None]:
# Derive time-of-day / day-of-week features from the pickup timestamp.
# Only the first 19 characters of the timestamp string are parsed
# (presumably this drops a trailing timezone suffix — TODO confirm against the CSV).
# NOTE(review): a fixed 4-hour shift ignores daylight-saving changes; confirm that
# every row falls inside the EDT period.
df['EDTdate'] = pd.to_datetime(df['pickup_datetime'].str.slice(stop=19)) - pd.Timedelta(hours=4)
df['Hour'] = df['EDTdate'].dt.hour
# Hours 0-11 are labelled 'am', 12-23 'pm'.
df['AMorPM'] = np.where(df['Hour'].lt(12), 'am', 'pm')
# Abbreviated weekday name as produced by strftime's %a.
df['Weekday'] = df['EDTdate'].dt.strftime("%a")
df.head()

In [3]:
# Column groups used to build the model inputs and the target.
# Categorical features: converted to category codes and fed through embeddings.
cat_cols = ['Hour', 'AMorPM', 'Weekday']
# Continuous features: stacked into a float tensor.
cont_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'passenger_count', 'dist_km']
y_col = ['fare_amount']  # this column contains the label

In [4]:
# Switch every categorical feature column to pandas' 'category' dtype so that
# integer codes can be read from it later via `.cat.codes`.
for cat in cat_cols:
    as_category = df[cat].astype('category')
    df[cat] = as_category

NameError: ignored

In [None]:
df['Hour'].head()

In [None]:
df['AMorPM'].head()

In [None]:
df["Weekday"].head()

In [None]:
# Integer category codes for each categorical feature, as NumPy arrays.
hr, ampm, wkdy = (
    df[name].cat.codes.values
    for name in ('Hour', 'AMorPM', 'Weekday')
)

In [None]:
# One row per sample, one column per categorical feature (hour, am/pm, weekday).
cats = np.column_stack((hr, ampm, wkdy))

In [None]:
cats

In [None]:
# One row per sample, one column per continuous feature, in `cont_cols` order.
conts = np.stack(tuple(df[name].values for name in cont_cols), axis=1)

In [None]:
# Rebind `conts` from the NumPy array to a 32-bit float tensor.
conts = torch.tensor(conts, dtype=torch.float32)

In [None]:
# Rebind `cats` to a 64-bit integer tensor so it can index the embedding tables.
cats = torch.tensor(cats, dtype=torch.long)

In [None]:
# Regression target as a float tensor; selecting with the list `y_col` keeps it 2-D.
y = torch.tensor(df[y_col].to_numpy(), dtype=torch.float32)

In [None]:
cats.shape

In [None]:
conts.shape

In [None]:
y.shape

In [None]:
# Number of distinct categories in each categorical column, in `cat_cols` order.
cat_szs = [len(df[name].dtype.categories) for name in cat_cols]

In [None]:
cat_szs

In [None]:
# (number of categories, embedding width) per categorical feature.
# Width is half the category count rounded up, capped at 50.
emb_szs = [
    (n_categories, min(50, (n_categories + 1) // 2))
    for n_categories in cat_szs
]
emb_szs

In [None]:
# First two rows of the categorical tensor, used below to walk through the embedding lookup.
catz = cats[0:2]

In [None]:
catz

In [None]:
# One embedding table per categorical feature, sized from `emb_szs`.
selfEmbeds = nn.ModuleList([nn.Embedding(*shape) for shape in emb_szs])

In [None]:
selfEmbeds

In [None]:
# Step-by-step illustration of the embedding lookup: column k of `catz` is fed
# through embedding table k, and the looked-up vectors are collected.
embendingz = []

for k, table in enumerate(selfEmbeds):
    codes = catz[:, k]
    print(k, table)
    embendingz.append(table(codes))
    print(codes)

# Show each looked-up embedding tensor.
for looked_up in embendingz:
    print(looked_up)

In [None]:
# Join the per-feature embeddings side by side (along the feature dimension).
z = torch.cat(embendingz, dim=1)

In [None]:
z

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network over embedded categorical and continuous features.

    Each categorical feature gets its own embedding table. The embedded
    vectors are concatenated, passed through dropout, joined with the
    batch-normalised continuous features and fed through a stack of
    Linear -> ReLU -> BatchNorm1d -> Dropout groups followed by a final
    Linear output layer.

    Parameters
    ----------
    emb_szs : iterable of (num_categories, embedding_dim) pairs
        One pair per categorical feature, in column order.
    n_cont : int
        Number of continuous features.
    out_sz : int
        Width of the output layer.
    layers : sequence of int
        Width of each hidden layer. May be empty, in which case the model
        is a single Linear layer on the concatenated inputs.
    p : float, default 0.5
        Dropout probability used for the embeddings and every hidden layer.
    """

    def __init__(self,emb_szs,n_cont,out_sz,layers,p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.embed_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layerList = []
        n_embed = sum(nf for _, nf in emb_szs)
        # n_in tracks the input width of the next Linear layer.
        n_in = n_embed + n_cont
        for width in layers:
            layerList.append(nn.Linear(n_in, width))
            layerList.append(nn.ReLU(inplace=True))
            layerList.append(nn.BatchNorm1d(width))
            layerList.append(nn.Dropout(p))
            n_in = width
        # Use the running n_in rather than layers[-1]: identical when hidden
        # layers exist, and avoids an IndexError when `layers` is empty.
        layerList.append(nn.Linear(n_in, out_sz))
        self.layers = nn.Sequential(*layerList)

    def forward(self,x_cat,x_cont):
        """Run the model.

        x_cat is indexed as x_cat[:, i] to feed the i-th embedding table, and
        x_cont is passed to a BatchNorm1d over `n_cont` features. Returns the
        output of the final Linear layer.
        """
        embeddings = []
        for i, e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:, i]))
        x = torch.cat(embeddings, 1)
        x = self.embed_drop(x)
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [None]:
# Seed torch's RNG before constructing the model so its initial weights are reproducible.
torch.manual_seed(33)
model = TabularModel(
    emb_szs,
    n_cont=conts.shape[1],
    out_sz=1,
    layers=[200, 100],
    p=0.4,
)

In [None]:
model

In [None]:
# Mean-squared-error loss (the training loop takes its square root) and an Adam optimizer.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Use the first `batch_size` rows: the leading 80% for training, the remaining 20% for testing.
# NOTE(review): rows are taken in file order with no shuffling — assumes the data is
# already randomly ordered; TODO confirm.
batch_size = 60000
test_size = int(batch_size * .2)

train_rows = slice(None, batch_size - test_size)
test_rows = slice(batch_size - test_size, batch_size)

cat_train, cat_test = cats[train_rows], cats[test_rows]
con_train, con_test = conts[train_rows], conts[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [None]:
# Train on the whole training subset at once: each epoch is one forward/backward
# pass over `cat_train` / `con_train` (no mini-batching).
epochs = 300
losses = []  # per-epoch RMSE, stored detached from the autograd graph

startTime = time.time()

for i in range(epochs):
    y_pred = model(cat_train, con_train)
    # Square root of the MSE -> RMSE, in the same units as the target.
    loss = torch.sqrt(criterion(y_pred, y_train))
    # Keep a detached copy: appending `loss` itself would hold a reference to
    # each epoch's autograd graph and leave tensors that still require grad
    # in the history list.
    losses.append(loss.detach())
    if (i + 1) % 10 == 0:
        # .item() prints the plain number rather than the tensor repr.
        print("Epoch of {}/{}, loss:{}".format(i + 1, epochs, loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
duration = time.time() - startTime
print("Training took {} minutes".format(duration/60))