# Taxi Travel Data Analysis

In this demo, we will be doing some demos on temporal feature engineering with the Kaggle Dataset

### Loading libraries, datasets

In [79]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
import torch

In [80]:
# These are all of the files you are given
df_tr = pd.read_csv("archive/train.csv")


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
df_tr["ORIGIN_CALL"].value_counts()

ORIGIN_CALL
2002.0     57571
63882.0     6406
2001.0      2499
13168.0     1314
6728.0      1115
           ...  
23600.0        1
37142.0        1
7028.0         1
49288.0        1
34164.0        1
Name: count, Length: 57105, dtype: int64

In [None]:
#df_tr.head()
# avg = df_tr['LEN'].mean()
# print(avg)

### Get Computed Time from POLYLINE

Our goal is to predict the travel-time of the taxi, which can be derived from the POLYLINE length.

Recall:

```
The travel time of the trip (the prediction target of this project) is defined as the (number of points-1) x 15 seconds. 
For example, a trip with 101 data points in POLYLINE has a length of (101-1) * 15 = 1500 seconds. Some trips have missing 
data points in POLYLINE, indicated by MISSING_DATA column, and it is part of the challenge how you utilize this knowledge.
```

We are not doing anything with the MISSING_DATA. It is up to you to find a way to use (or ignore) that information.

In [None]:
# Over every single 
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15, where polyline_length = count("[") - 1
# df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

In [None]:
from datetime import datetime
def parse_time(x):
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.date.fromtimestamp

  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# Because we are assigning multiple values at a time, we need to "expand" our computed (year, month, day, hour, weekday) tuples on 
# the column axis, or axis 1
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
# df_tr[["YR", "MON", "DAY", "HR", "WK"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
def parse_midnight_minutes(x):
    dt = datetime.fromtimestamp(x["TIMESTAMP"])
    return dt.hour * 60 + dt.minute

In [None]:
def onehot(x):
    x=x.values[0]
    if x=="A":
        return 1, 0, 0
    elif x=="B":
        return 0, 1, 0
    elif x=="C":
        return 0, 0, 1
    else:
        return 0, 0, 0
    
# df_tr[["CALL_A", "CALL_B", "CALL_C"]] = df_tr[["CALL_TYPE"]].apply(onehot, axis=1, result_type="expand")   
# df_tr[["DAY_A", "DAY_B", "DAY_C"]] = df_tr[["DAY_TYPE"]].apply(onehot, axis=1, result_type="expand")     


### Create a Prediction File

In [None]:
# vals = df_tr["HR"].value_counts()
# print(vals)
# def reduce(x):
#     return x - 2013
# hr_oh = torch.nn.functional.one_hot(torch.tensor(df_tr["HR"].values))
# print(hr_oh)
# print(hr_oh.shape)

In [None]:
# vals = df_tr["TAXI_ID"].values
# print(vals)
# print(min(vals))
# print(max(vals))

# # df_tr["TAXI_ID"] = df_tr["TAXI_ID"].apply(reduce)
# vals = df_tr["TAXI_ID"].values
# print(vals)
# print(min(vals))
# print(max(vals))
# oh = torch.nn.functional.one_hot(torch.tensor(df_tr["TAXI_ID"].values))
# print(oh)
# print(oh.shape)


In [None]:
# df_tr[["CALL_A", "LEN"]].values

In [None]:
# Feature Trimming
# mean, std = df_tr["LEN"].mean(), df_tr["LEN"].std()
# median = df_tr["LEN"].median()
# outlier_threshold = 3
# df_trimmed = df_tr[df_tr["LEN"] < mean + outlier_threshold * std]
# df_trimmed = df_trimmed[df_trimmed['MISSING_DATA'] == False]
# print("Before Trimming: " + str(len(df_tr)))
# print("After Trimming: " + str(len(df_trimmed)))

# df_tr = df_trimmed

In [81]:

class MyDataset(Dataset):
  def __init__(self, csvpath):
    df_tr = pd.read_csv(csvpath)

    if "POLYLINE" not in df_tr: #test dataset
      df_tr["POLYLINE"]="trololololo"
      df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)
    else: 
      df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)
      # first trim the dataset

      mean, std = df_tr["LEN"].mean(), df_tr["LEN"].std()
      median = df_tr["LEN"].median()
      outlier_threshold = 3
      df_trimmed = df_tr[df_tr["LEN"] < mean + outlier_threshold * std]
      df_trimmed = df_trimmed[df_trimmed['MISSING_DATA'] == False]
      print("Before Trimming: " + str(len(df_tr)))
      print("After Trimming: " + str(len(df_trimmed)))

      df_tr = df_trimmed


    # df_tr[["YR", "MON", "DAY", "HR", "WK"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
    df_tr["MIDMINS"] = df_tr[["TIMESTAMP"]].apply(parse_midnight_minutes, axis=1, result_type="expand")
    #df_tr["MIDMINS"]=(df_tr["MIDMINS"]-df_tr["MIDMINS"].min())/(df_tr["MIDMINS"].max()-df_tr["MIDMINS"].min())
    df_tr = pd.get_dummies(data=df_tr, columns=['CALL_TYPE', "DAY_TYPE", "ORIGIN_STAND", "TAXI_ID"])


    cols = list(df_tr.columns)
    badcols = ["TRIP_ID", 'ORIGIN_CALL',  'TIMESTAMP', 'MISSING_DATA', 'POLYLINE', 'LEN',] 
    for b in badcols:
      cols.remove(b)
    print(cols)
    
    
         
    x=df_tr[cols].astype(int).values
    print(x)
    y=df_tr["LEN"].values
 
    self.x_train=torch.tensor(x,dtype=torch.float32)
    print(self.x_train)
    self.y_train=torch.tensor(y,dtype=torch.float32)
    self.df = df_tr
 
  def __len__(self):
    return len(self.y_train)
   
  def __getitem__(self,idx):
    return self.x_train[idx],self.y_train[idx]

In [82]:
myDs=MyDataset("archive/train.csv")
train_loader=DataLoader(myDs,batch_size=64, shuffle=True)

Before Trimming: 1710670
After Trimming: 1692763
['MIDMINS', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C', 'DAY_TYPE_A', 'ORIGIN_STAND_1.0', 'ORIGIN_STAND_2.0', 'ORIGIN_STAND_3.0', 'ORIGIN_STAND_4.0', 'ORIGIN_STAND_5.0', 'ORIGIN_STAND_6.0', 'ORIGIN_STAND_7.0', 'ORIGIN_STAND_8.0', 'ORIGIN_STAND_9.0', 'ORIGIN_STAND_10.0', 'ORIGIN_STAND_11.0', 'ORIGIN_STAND_12.0', 'ORIGIN_STAND_13.0', 'ORIGIN_STAND_14.0', 'ORIGIN_STAND_15.0', 'ORIGIN_STAND_16.0', 'ORIGIN_STAND_17.0', 'ORIGIN_STAND_18.0', 'ORIGIN_STAND_19.0', 'ORIGIN_STAND_20.0', 'ORIGIN_STAND_21.0', 'ORIGIN_STAND_22.0', 'ORIGIN_STAND_23.0', 'ORIGIN_STAND_24.0', 'ORIGIN_STAND_25.0', 'ORIGIN_STAND_26.0', 'ORIGIN_STAND_27.0', 'ORIGIN_STAND_28.0', 'ORIGIN_STAND_29.0', 'ORIGIN_STAND_30.0', 'ORIGIN_STAND_31.0', 'ORIGIN_STAND_32.0', 'ORIGIN_STAND_33.0', 'ORIGIN_STAND_34.0', 'ORIGIN_STAND_35.0', 'ORIGIN_STAND_36.0', 'ORIGIN_STAND_37.0', 'ORIGIN_STAND_38.0', 'ORIGIN_STAND_39.0', 'ORIGIN_STAND_40.0', 'ORIGIN_STAND_41.0', 'ORIGIN_STAND_42.0', 'ORIGIN

In [86]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten(start_dim=1)
        # self.conv1 = nn.Conv2d(3, 6, 5)
        # self.pool = nn.MaxPool2d(2, 2)
        # self.conv2 = nn.Conv2d(6, 16, 5)
        self.dro = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(516, 1000)
        self.fc2 = nn.Linear(1000, 1000)
        self.fc3 = nn.Linear(1000, 1000)
        self.fc4 = nn.Linear(1000, 1000)
        self.fc5 = nn.Linear(1000, 1000)
        self.fc6 = nn.Linear(1000, 1000)
        self.fc7 = nn.Linear(1000, 1)

    def forward(self, x):
        # x = self.pool(F.relu(self.conv1(x)))
        # x = self.pool(F.relu(self.conv2(x)))
        # x = torch.flatten(x, -1) # flatten all dimensions except batch
        
        x = F.relu(self.fc1(x))
        x = self.dro(x)
        x = F.relu(self.fc2(x))
        x = self.dro(x)
        x = self.fc7(x)
        # x = torch.transpose(x, 0, 1)
        return x


net = Net()

In [87]:
import torch.optim as optim
def RMSELoss(yhat,y):
    return torch.sqrt(torch.mean((yhat-y)**2))
criterion = RMSELoss
optimizer = optim.Adam(net.parameters(), lr=0.0001)

In [88]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):

        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        labels=labels.unsqueeze(1)
        outputs = net(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 2000 mini-batches
            print(outputs[0], labels[0])
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

print('Finished Training')

tensor([7.8501], grad_fn=<SelectBackward0>) tensor([405.])
[1,   100] loss: 620.018
tensor([649.1924], grad_fn=<SelectBackward0>) tensor([225.])
[1,   200] loss: 556.580
tensor([283.8115], grad_fn=<SelectBackward0>) tensor([315.])
[1,   300] loss: 563.184
tensor([452.8177], grad_fn=<SelectBackward0>) tensor([210.])
[1,   400] loss: 559.261
tensor([749.9622], grad_fn=<SelectBackward0>) tensor([45.])
[1,   500] loss: 563.220
tensor([1021.0473], grad_fn=<SelectBackward0>) tensor([735.])
[1,   600] loss: 548.171
tensor([109.9346], grad_fn=<SelectBackward0>) tensor([450.])
[1,   700] loss: 559.863
tensor([164.9598], grad_fn=<SelectBackward0>) tensor([1455.])
[1,   800] loss: 551.600
tensor([409.7453], grad_fn=<SelectBackward0>) tensor([2280.])
[1,   900] loss: 537.782
tensor([235.1538], grad_fn=<SelectBackward0>) tensor([240.])
[1,  1000] loss: 522.379
tensor([1071.6766], grad_fn=<SelectBackward0>) tensor([540.])
[1,  1100] loss: 504.436
tensor([670.1725], grad_fn=<SelectBackward0>) tensor(

In [None]:
# zero_call = df_tr.sort_values(by='ORIGIN_CALL', ascending=True)
# print(zero_call)

In [None]:
testset = MyDataset("archive/test_public.csv")
testloader = DataLoader(testset)
preds = []
with torch.no_grad():
    for data in testloader:
        features, labels = data
        # calculate outputs by running images through the network
        output = net(features)
        preds.append(round(output.item()))
ids = testset.df["TRIP_ID"]        
print(preds)
d = {"TRIP_ID" : ids, "TRAVEL_TIME" : preds}
newdf = pd.DataFrame(d)
print(newdf)
newdf.to_csv("my_pred.csv", index=None)

tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])
[752, 730, 707, 699, 743, 799, 780, 803, 843, 760, 553, 655, 651, 693, 743, 625, 669, 718, 803, 667, 859, 783, 828, 778, 664, 730, 803, 805, 688, 695, 810, 712, 753, 637, 667, 663, 829, 602, 822, 703, 624, 696, 740, 880, 665, 659, 743, 719, 573, 607, 604, 653, 662, 783, 769, 621, 883, 696, 631, 843, 813, 656, 752, 928, 902, 803, 734, 770, 713, 566, 733, 642, 703, 746, 605, 865, 637, 744, 793, 770, 727, 881, 691, 745, 683, 678, 736, 771, 618, 654, 684, 766, 846, 671, 643, 733, 715, 734, 731, 733, 614, 781, 756, 532, 545, 720, 693, 554, 785, 697, 679, 727, 612, 643, 689, 808, 637, 735, 759, 649, 744, 783, 694, 680, 589, 587, 792, 718, 811, 523, 574, 674, 757, 712, 687, 755, 765, 561, 723, 695, 865, 572, 733, 943, 670, 693, 637, 864, 790, 

In [None]:
# # Sample submission file that is given on kaggle
# df_sample = pd.read_csv("archive/sampleSubmission.csv")

# df_sample["TRAVEL_TIME"] = 716.43

# # mean(716.43) -> 792.73593
# # median(600) -> 784.74219
# df_sample.to_csv("my_pred.csv", index=None)

### Do some Feature Analysis

For our feature analysis, we are looking at which of our engineered features may be useful in making a taxicab time regression model

In [None]:
# First n samples to analyze. Set to -1 to use all data
end = -1

outlier_threshold = 3

# "Choose all data, where the trip length is less than 3 standard deviations away from the mean"
# This is to remove outliers. Otherwise, our plots would look very squished (since there are some
# VERRRRRY long taxi trips in the dataset)
df_trimmed = df_tr[df_tr["LEN"] < mean + outlier_threshold * std]

# Because our y-values only take on multiples of 15, we want just enough buckets in a histogram
# such that each buckets counts one value's frequency. (e.x. one bucket counts how many 15s trips, 
# how many 30s trips, etc. )
buckets = (int(mean + outlier_threshold * std) // 15)

print(f"Using: {len(df_trimmed)}/{len(df_tr)}")

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(18,14))

# Now, we visualize some features that we think might be useful
for idx, v in enumerate(["YR", "MON", "DAY", "HR", "WK", "ORIGIN_STAND"]):
  # idx // 3 = row, idx % 3 = column
  ax = axs[idx // 3, idx % 3]
  
  # Remove any rows with invalid values
  df_subset = df_trimmed.dropna(subset=v)
  
  # Create a histogram. Look up the documentation for more details
  ax.hist2d(df_subset[v][:end], df_subset["LEN"][:end], cmap="CMRmap", bins=(120,buckets))
  
  # Some stylistic things to make the graphs look nice
  ax.set_xlim(ax.get_xlim()[0] - 1, ax.get_xlim()[1] + 1)
  ax.set_facecolor("black")
  ax.set_ylabel("seconds", fontsize=18)
  ax.set_title(f"Feature: {v}", fontsize=20)


NameError: name 'mean' is not defined

In [None]:
plt.figure(figsize=(10,10))
for v in [0, 5, 11, 17, 23]:
  # Filter data where the HR matches v
  hourly_data = df_trimmed[df_trimmed["HR"] == v]["LEN"]
  histogram, bin_boundary = np.histogram(hourly_data, bins=buckets)
  histogram = histogram / len(hourly_data)
  # The center is the left_bound and right_bound of a bucket
  bin_centers = [(bin_boundary[i] + bin_boundary[i + 1]) / 2 for i in range(buckets)]
  plt.plot(bin_centers, histogram, label=f"HR={v}")
plt.legend();