In [3]:
%%time
import pandas as pd

df = pd.read_feather('./all/train.feather')

Wall time: 46.3 s


In [4]:
# CHECK FOR INVALID COORDINATES (outside of NYC range)
# determined with https://www.mapdevelopers.com/geocode_bounding_box.php

def valid_coordinates(lat_list, lon_list):
    for i in lat_list:
        if i < 40.477399 or i > 40.917577:
            return False
    for i in lon_list:
        if i < -74.259090 or i > -73.700272:
            return False
    return True

def clean_coordinates(df):
    df.query('~(pickup_latitude < 40.477399 or pickup_latitude > 40.917577) &\
              ~(dropoff_latitude < 40.477399 or dropoff_latitude > 40.917577) &\
              ~(pickup_longitude < -74.259090 or pickup_longitude > -73.700272) &\
              ~(dropoff_longitude < -74.259090 or dropoff_longitude > -73.700272)', inplace=True)

In [5]:
# REMOVE INVALID PASSENGER COUNTS AND FARES, DROPNA

def clean_pfd(df):
    df.dropna(inplace=True)
    df.query('passenger_count > 0 & fare_amount > 0', inplace=True)

In [6]:
# CALCULATE EUCLIDEAN DISTANCE

from scipy.spatial import distance

def euclidean_distance(x1, y1, x2, y2):
    return ((x1-x2)**2 + (y2-y1)**2)**0.5

def add_euclidean_distance(df):
    df['euclidean_distance'] = euclidean_distance(df['pickup_latitude'], df['pickup_longitude'], 
                                                  df['dropoff_latitude'], df['dropoff_longitude'])

In [7]:
def preprocess(df, training=True):
    if training:
        clean_pfd(df)
        clean_coordinates(df)
    add_euclidean_distance(df)
    #add_dmh(df)

In [8]:
%%time
# PREPROCESS TRAINING DATA

preprocess(df, training=True)
#

Wall time: 9min 55s


In [10]:
df.reset_index(inplace=True)
df.to_feather('./all/preprocessed_train.feather')

In [17]:
df

Unnamed: 0,index,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,euclidean_distance
0,0,2009-06-15 17:26:21.0000001,4.50,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1,0.009436
1,1,2010-01-05 16:52:16.0000002,16.90,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,0.079696
2,2,2011-08-18 00:35:00.00000049,5.70,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2,0.013674
3,3,2012-04-21 04:30:42.0000001,7.70,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1,0.025340
4,4,2010-03-09 07:51:00.000000135,5.30,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,0.019470
5,5,2011-01-06 09:50:45.0000002,12.10,2011-01-06 09:50:45 UTC,-74.000964,40.731630,-73.972892,40.758233,1,0.038675
6,6,2012-11-20 20:35:00.0000001,7.50,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1,0.014565
7,7,2012-01-04 17:22:00.00000081,16.50,2012-01-04 17:22:00 UTC,-73.951300,40.774138,-73.990095,40.751048,1,0.045146
8,8,2012-12-03 13:10:00.000000125,9.00,2012-12-03 13:10:00 UTC,-74.006462,40.726713,-73.993078,40.731628,1,0.014258
9,9,2009-09-02 01:11:00.00000083,8.90,2009-09-02 01:11:00 UTC,-73.980658,40.733873,-73.991540,40.758138,2,0.026593


In [13]:
# FIT THE MODEL AND SCORE (TRAIN/TEST DATA)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

model = LinearRegression()
X = df[['euclidean_distance', 'passenger_count']]
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

model.fit(X_train, y_train)
print('Score:', model.score(X_test, y_test))

Score: 0.15482831020065668


In [15]:
# PRINT COEFFICIENTS

for i in list(zip(X.columns, model.coef_)):
    print('{0}: {1}'.format(i[0], i[1]))

euclidean_distance: 217.02623969016372
passenger_count: 0.030791065479204738


In [18]:
# PREPROCESS ACTUAL DATA

real_df = pd.read_csv('./all/test.csv')
preprocess(real_df, training=False)

In [19]:
# FIT MODEL TO ACTUAL DATA

real_X = real_df[['euclidean_distance', 'passenger_count']]

predictions = model.predict(real_X)
data = list(zip(real_df['key'], predictions))

submission = pd.DataFrame(data, columns=['key', 'fare_amount'])
submission.set_index('key', inplace=True)

submission.to_csv('submission.csv')