In [14]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Read the preprocessed training data and preview it transposed, so that each
# column is shown as a row with its first five values.
df = pd.read_csv("data/preprocessed_train.csv")
df.head(n=5).T

Unnamed: 0,0,1,2,3,4
MasterDataTime,2009-06-15 17:26:21.0000001,2010-01-05 16:52:16.0000002,2011-08-18 00:35:00.00000049,2012-04-21 04:30:42.0000001,2010-03-09 07:51:00.000000135
fare_amount,4.5,16.9,5.7,7.7,5.3
pickup_datetime,2009-06-15 17:26:21 UTC,2010-01-05 16:52:16 UTC,2011-08-18 00:35:00 UTC,2012-04-21 04:30:42 UTC,2010-03-09 07:51:00 UTC
pickup_longitude,-73.8443,-74.016,-73.9827,-73.9871,-73.9681
pickup_latitude,40.7213,40.7113,40.7613,40.7331,40.768
dropoff_longitude,-73.8416,-73.9793,-73.9912,-73.9916,-73.9567
dropoff_latitude,40.7123,40.782,40.7506,40.7581,40.7838
passenger_count,1,1,2,1,1
EDT,2009-06-15 21:26:21,2010-01-05 20:52:16,2011-08-18 04:35:00,2012-04-21 08:30:42,2010-03-09 11:51:00
Hour,21,20,4,8,11


In [3]:
# Column dtypes; the `object` columns are the ones that are not numeric yet.
df.dtypes

MasterDataTime        object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
EDT                   object
Hour                   int64
Weekday               object
AMorPM                object
dtype: object

In [4]:
# Target column and the first, minimal feature set.
target = 'fare_amount'
features = ['passenger_count', 'Hour', 'Weekday', 'AMorPM']

# Replace every string-typed (object) feature column in `df` with its integer
# category codes so the estimator receives numeric input.
object_features = [name for name in features if df[name].dtype == 'object']
for col in object_features:
    df[col] = df[col].astype("category").cat.codes

## Data split for modelling

In [8]:
# Feature matrix and target vector as NumPy arrays, copied out of `df`.
X, y = df[features].copy().values, df[target].copy().values

In [10]:
# Hold out 25% of the rows for testing (the train_test_split default, spelled
# out here); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## Building a super-simple Linear Regression

In [11]:
# Fit a plain linear regression on the training split.
model_basic = LinearRegression().fit(X_train, y_train)

In [12]:
# R² score, computed on the training split (not on held-out data)
model_basic.score(X_train, y_train)

0.0006355124093436748

In [17]:
# Predict the target for the held-out test split.
preds_basic = model_basic.predict(X_test)

In [18]:
# Root mean squared error of the test-split predictions.
np.sqrt(mean_squared_error(y_test, preds_basic))

9.775299648984795

In [19]:
# Average fare over the whole frame, for scale when reading the error
df[target].mean()

11.348078720000004

## The super-basic LR gave a bad result, so now try again with the remaining features

In [22]:
# List the columns available in the frame.
df.columns

Index(['MasterDataTime', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'EDT', 'Hour', 'Weekday', 'AMorPM'],
      dtype='object')

In [23]:
# Re-check dtypes: Weekday and AMorPM now hold integer category codes.
df.dtypes

MasterDataTime        object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
EDT                   object
Hour                   int64
Weekday                 int8
AMorPM                  int8
dtype: object

In [25]:
# Add the pickup/dropoff coordinates to the feature list.
# Only names that are not in `features` yet are appended, so re-running this
# cell does not produce duplicate columns.
coordinate_features = ['pickup_longitude',
                       'pickup_latitude',
                       'dropoff_longitude',
                       'dropoff_latitude']
features.extend([name for name in coordinate_features if name not in features])
features

['passenger_count',
 'Hour',
 'Weekday',
 'AMorPM',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude']

### Data splits

In [30]:
# Total number of rows.
len(df)

1000000

In [31]:
# Number of rows left once every row containing a missing value is dropped.
len(df.dropna())

999990

In [35]:
# Drop rows with a missing value in any column, then rebuild the arrays and
# the train/test split from the cleaned frame.
df = df.dropna(axis=0, how="any")
X, y = df[features].copy().values, df[target].copy().values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## Model and performance

In [36]:
# Fit a linear regression on the current training split.
model_baseline = LinearRegression()
model_baseline.fit(X_train, y_train)

# R² on the training split
model_baseline.score(X_train, y_train)

0.0007812195707043168

In [40]:
# Predict the target for the held-out test split.
preds_baseline = model_baseline.predict(X_test)

In [41]:
# Root mean squared error of the test-split predictions.
np.sqrt(mean_squared_error(y_test, preds_baseline))

9.827672576205861

## Linear Regression using one-hot encoding


In [42]:
from sklearn.preprocessing import OneHotEncoder

In [43]:
# One-hot encoder that returns a dense array and ignores categories unseen at
# fit time. scikit-learn 1.2 renamed `sparse` to `sparse_output` and later
# removed the old name, so try the new keyword first and fall back to the old
# one on older installations.
# NOTE(review): `ohe` is not used in the cells shown here (the encoding is done
# with pd.get_dummies) — confirm whether this cell is still needed.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

In [44]:
# Read the CSV again so `df` holds the file's original columns (this discards
# the in-memory changes made to `df` so far), then preview it transposed.
df = pd.read_csv("data/preprocessed_train.csv")
df.head(n=5).T

Unnamed: 0,0,1,2,3,4
MasterDataTime,2009-06-15 17:26:21.0000001,2010-01-05 16:52:16.0000002,2011-08-18 00:35:00.00000049,2012-04-21 04:30:42.0000001,2010-03-09 07:51:00.000000135
fare_amount,4.5,16.9,5.7,7.7,5.3
pickup_datetime,2009-06-15 17:26:21 UTC,2010-01-05 16:52:16 UTC,2011-08-18 00:35:00 UTC,2012-04-21 04:30:42 UTC,2010-03-09 07:51:00 UTC
pickup_longitude,-73.8443,-74.016,-73.9827,-73.9871,-73.9681
pickup_latitude,40.7213,40.7113,40.7613,40.7331,40.768
dropoff_longitude,-73.8416,-73.9793,-73.9912,-73.9916,-73.9567
dropoff_latitude,40.7123,40.782,40.7506,40.7581,40.7838
passenger_count,1,1,2,1,1
EDT,2009-06-15 21:26:21,2010-01-05 20:52:16,2011-08-18 04:35:00,2012-04-21 08:30:42,2010-03-09 11:51:00
Hour,21,20,4,8,11


In [72]:
# One-hot encode the two categorical columns, dropping the first level of each
# so the dummy columns are not redundant with one another.
# dtype is pinned to uint8: newer pandas defaults the dummies to bool, and bool
# columns mixed with float/int columns make `.values` fall back to an object
# array. The 0/1 values themselves are unchanged.
df = pd.get_dummies(
    df,
    columns=["Weekday", "AMorPM"],
    drop_first=True,
    dtype=np.uint8,
)

In [82]:
# Columns used by the one-hot model: the four coordinates, the two numeric
# trip attributes, then the weekday and AM/PM dummy columns.
coordinate_cols = ['pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude']
weekday_dummies = ['Weekday_' + day for day in
                   ('Monday', 'Saturday', 'Sunday',
                    'Thursday', 'Tuesday', 'Wednesday')]
new_feats = [*coordinate_cols, 'passenger_count', 'Hour',
             *weekday_dummies, 'AMorPM_PM']

In [83]:
# Drop rows with a missing value in any column, then build the arrays and the
# train/test split from the one-hot feature columns.
df = df.dropna(axis=0, how="any")
X, y = df[new_feats].copy().values, df[target].copy().values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [84]:
# Fit a linear regression on the current training split; this rebinds the
# `model_baseline` name to the new estimator.
model_baseline = LinearRegression()
model_baseline.fit(X_train, y_train)

# R² on the training split
model_baseline.score(X_train, y_train)

0.0009487657819273744

In [85]:
# Predict the target for the held-out test split.
preds_baseline = model_baseline.predict(X_test)

In [86]:
# Root mean squared error of the test-split predictions.
np.sqrt(mean_squared_error(y_test, preds_baseline))

9.8265765200406

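All the scores so far are terrible: the training R² stays below 0.001 and the test RMSE is about 9.8, against a mean fare of about 11.35. Linear Regression on these features is not a good fit for this data.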