# Import the datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
% matplotlib inline
plt.style.use('seaborn-whitegrid')

TRAIN_PATH = '../input/train.csv'
TEST_PATH = '../input/test.csv'

print(os.listdir("../input"))

In [None]:
# References:
# https://www.kaggle.com/szelee/how-to-import-a-csv-file-of-55-million-rows
# https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration

In [None]:
# Set columns to most suitable type to optimize for memory usage
traintypes = {'fare_amount': 'float32',
              'pickup_datetime': 'str',
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'}

# Only the columns with an explicit dtype above are read from the CSV.
cols = list(traintypes.keys())

# Read at most the first 10 million rows of the training file.
df_train = pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, nrows=10_000_000)

# Keep only the first 16 characters of each timestamp string (the
# 'YYYY-MM-DD HH:MM' part matched by the explicit format below) and parse
# the result as timezone-aware UTC datetimes.
df_train['pickup_datetime'] = pd.to_datetime(
    df_train['pickup_datetime'].str.slice(0, 16),
    utc=True,
    format='%Y-%m-%d %H:%M',
)

# Last expression of the cell so that the dtypes are actually displayed, and
# so that they reflect the datetime conversion above.
df_train.dtypes

In [None]:
# Preview the first rows (head() shows 5 rows by default).
df_train.head()

## Add Features and Data Cleaning

In [None]:
# Feature-engineering and cleaning helpers.
# For the analysis below we need a function that calculates the distance in miles between
# two locations given as (lat, lon) coordinates; see distance() below, which is based on
# https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula
# It returns the distance in miles.

def add_time(df):
    """Add calendar features derived from `pickup_datetime` to `df` in place.

    Adds three columns:
      * ``year``    - calendar year of the pickup
      * ``weekday`` - day of week, Monday=0 ... Sunday=6
      * ``hour``    - hour of day, 0-23

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``pickup_datetime`` column.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # Use the vectorised .dt accessor instead of .apply(lambda ...): it yields
    # the same values without calling a Python function once per row.
    pickup = df['pickup_datetime'].dt
    df['year'] = pickup.year
    df['weekday'] = pickup.weekday
    df['hour'] = pickup.hour
    
def distance(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance in miles between two points.

    Each argument is a latitude or longitude in degrees and may be a plain
    number, a NumPy array or a pandas Series; array-like arguments are
    combined element-wise and the container type of the inputs is preserved
    (Series in, Series out).

    All arithmetic is carried out in float64, whatever the input dtype. With
    float32 inputs the ``0.5 - cos(x)/2`` formulation loses every significant
    digit for nearby points, so short trips evaluate to 0 and longer ones are
    quantised in steps of more than a mile.

    Parameters
    ----------
    lat1, lon1 : coordinates of the first point, in degrees.
    lat2, lon2 : coordinates of the second point, in degrees.

    Returns
    -------
    Distance in miles as float64 (scalar, array or Series to match the inputs).
    """
    def _to_radians(deg):
        # Promote to float64 *before* any trigonometry; plain Python numbers
        # have no .astype and are already double precision.
        deg64 = deg.astype('float64') if hasattr(deg, 'astype') else float(deg)
        return deg64 * 0.017453292519943295  # Pi/180

    lat1, lon1, lat2, lon2 = (_to_radians(v) for v in (lat1, lon1, lat2, lon2))

    # sin^2 form of the haversine term: algebraically equal to
    # 0.5 - cos(dlat)/2 + cos(lat1)*cos(lat2)*(1 - cos(dlon))/2
    # but free of the subtraction of two nearly equal numbers.
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    # Guard arcsin against rounding pushing `a` marginally above 1.
    a = np.minimum(a, 1.0)
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))  # km->miles * 2*R * asin...

def add_travel_vector_features(df):
    """Add a `distance_miles` column to `df` in place.

    The value is whatever `distance` returns for each row's pickup and
    dropoff coordinates. Nothing is returned.
    """
    pickup = (df['pickup_latitude'], df['pickup_longitude'])
    dropoff = (df['dropoff_latitude'], df['dropoff_longitude'])
    df['distance_miles'] = distance(*pickup, *dropoff)
    # Disabled alternatives (not computed): abs_diff_longitude,
    # abs_diff_latitude and their sum abs_diff_lon_lat.

def add_airport_dist(df):
    """
    Add one column per airport holding the minimum of the pickup-to-airport
    and airport-to-dropoff distances (as computed by `distance`).
    The frame is modified in place; nothing is returned.

    Columns added:
      jfk_dist - JFK: John F. Kennedy International Airport
      ewr_dist - EWR: Newark Liberty International Airport
      lga_dist - LGA: LaGuardia Airport
    """
    # (latitude, longitude) of each airport, keyed by the column prefix.
    airports = {
        'jfk': (40.639722, -73.778889),
        'ewr': (40.6925, -74.168611),
        'lga': (40.77725, -73.872611),
    }

    for code, (airport_lat, airport_lon) in airports.items():
        from_pickup = distance(df['pickup_latitude'], df['pickup_longitude'],
                               airport_lat, airport_lon)
        to_dropoff = distance(airport_lat, airport_lon,
                              df['dropoff_latitude'], df['dropoff_longitude'])
        # Row-wise minimum of the two candidate distances.
        both = pd.concat([from_pickup, to_dropoff], axis=1)
        df[code + '_dist'] = both.min(axis=1)

def add_features(df):
    """Apply every feature-engineering step to `df` in place and return it.

    Steps run in order: time features, trip distance, airport distances.
    The returned object is the same frame that was passed in.
    """
    for step in (add_time, add_travel_vector_features, add_airport_dist):
        step(df)
    return df

def clean_df(df):
    """Return a cleaned copy of `df`; the input frame is left untouched.

    A row is kept only if it has no missing value in any column and all of
    the following hold (every range is inclusive):

      * pickup/dropoff longitude in [-75, -73]
      * pickup/dropoff latitude  in [40, 42]
      * passenger_count          in [0, 6]
      * fare_amount              in [0, 250]
      * distance_miles           in [0.05, 100]

    The row counts before and after cleaning are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed above (so `distance_miles` has to be
        added before calling this function).

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the rows that passed every check;
        the original index labels are preserved.
    """
    print('Old size: %d' % len(df))

    # Observations with missing values are rejected. Building this into the
    # mask (instead of dropna(inplace=True)) avoids mutating the caller's frame.
    mask = df.notna().all(axis=1)

    # Observations with erroneous values are rejected.
    mask &= df['pickup_longitude'].between(-75, -73)
    mask &= df['dropoff_longitude'].between(-75, -73)
    mask &= df['pickup_latitude'].between(40, 42)
    mask &= df['dropoff_latitude'].between(40, 42)
    mask &= df['passenger_count'].between(0, 6)
    mask &= df['fare_amount'].between(0, 250)
    mask &= df['distance_miles'].between(0.05, 100)

    # Explicit copy so later column assignments on the result never raise
    # SettingWithCopyWarning or write through to the input.
    cleaned = df.loc[mask].copy()

    print('New size: %d' % len(cleaned))

    return cleaned

In [None]:
# Features must be added before cleaning: clean_df filters on the
# distance_miles column that add_features creates.
df_train = clean_df(add_features(df_train))

## Dataset preparation for Linear Regression

In [None]:
# Feature columns fed to the model, in this order.
cols = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'year',
    'weekday',
    'hour',
    'distance_miles',
    'jfk_dist',
    'ewr_dist',
    'lga_dist',
]

# Target and feature matrix.
y = df_train['fare_amount']
X = df_train[cols]

X.head()

In [None]:
# Disabled experiment: one-hot encoding of passenger_count/year/weekday/hour.
# The code is wrapped in a string literal, so none of it executes; as the last
# expression of the cell, the string itself is what the cell displays.
'''def encode_categories(df, cats):
    return pd.get_dummies(df, columns=cats)

cats = ['passenger_count', 'year','weekday','hour']
X = encode_categories(X, cats)

X.head()'''

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE: X_test is rebound to the competition test features in a later cell,
# so the evaluation cells must be run before that point.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [None]:
%%time

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred[y_pred < 0] = 0

np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Label each fitted coefficient with the feature it belongs to; the bare
# coef_ array gives no way to tell which number goes with which column.
pd.Series(model.coef_, index=X_train.columns, name='coefficient')

In [None]:
import math
from sklearn.metrics import mean_squared_error

# Root mean squared error of the clipped predictions on the held-out split.
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
print('Score:', rmse)

In [None]:
# plot prediction and actual data

# (title, axis limits) for each panel; None keeps matplotlib's automatic limits.
panels = [
    ('Actual fare vs Predicted fare', None),
    ('Actual fare vs Predicted fare (max $80)', (0, 80)),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 7))
for ax, (title, limits) in zip(axes, panels):
    ax.plot(y_test, y_pred, '.', markersize=1.5)
    ax.set_title(title)
    ax.set_xlabel('Actual fare')
    ax.set_ylabel('Predicted fare')
    if limits is not None:
        ax.set_xlim(*limits)
        ax.set_ylim(*limits)

plt.show()

## Make predictions on the test set

In [None]:
# Read the competition test set from the path configured in the setup cell
# rather than repeating the literal.
df_test = pd.read_csv(TEST_PATH)

# Same timestamp handling as for the training data: keep the first 16
# characters and parse them as UTC datetimes with an explicit format.
df_test['pickup_datetime'] = pd.to_datetime(
    df_test['pickup_datetime'].str.slice(0, 16),
    utc=True,
    format='%Y-%m-%d %H:%M',
)

In [None]:
# Add the same engineered features as for training. clean_df is not applied
# to the test frame in this notebook.
df_test = add_features(df=df_test)

In [None]:
# Take the feature columns (and their order) from the matrix the model was
# fitted on instead of a second hand-typed list that could drift out of sync.
cols = list(X_train.columns)

# NOTE: this rebinds X_test, which until now held the held-out validation
# features from train_test_split.
X_test = df_test[cols]

In [None]:
# Disabled experiment: one-hot encoding of the test features. The code is
# wrapped in a string literal, so it does not execute.
'''cats = ['year','weekday','hour']
X_test = encode_categories(X_test, cats)'''

In [None]:
# Predict fares for the test set and replace negative predictions with 0.
pred_test = model.predict(X_test)
negative = pred_test < 0
pred_test[negative] = 0

In [None]:
# Write the predictions to a CSV file which we can submit to the competition.
submission_columns = ['key', 'fare_amount']
submission_data = {
    'key': df_test['key'],
    'fare_amount': pred_test,
}
submission = pd.DataFrame(submission_data, columns=submission_columns)
submission.to_csv('submission.csv', index=False)

# List the working directory to confirm the file was written.
print(os.listdir('.'))