In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
def write_submission_file(prediction, filename,
                          path_to_sample='SampleSubmission.csv'):
    """Write predictions to a CSV that follows the sample-submission layout.

    The sample file is read with its 'Order_No' column as the index, its
    'Time from Pickup to Arrival' column is overwritten with ``prediction``
    and the result is saved to ``filename``.

    Parameters
    ----------
    prediction : array-like or scalar
        Predicted values in the same row order as the sample file. A scalar
        is broadcast to every row.
    filename : str
        Path of the CSV file to write.
    path_to_sample : str
        Path of the sample submission CSV used as the template.

    Raises
    ------
    ValueError
        If ``prediction`` is not a scalar and its length differs from the
        number of rows in the sample file.
    """
    submission = pd.read_csv(path_to_sample, index_col='Order_No')

    # Drop any pandas index so values are assigned by position. A Series would
    # otherwise be aligned on its index labels and could silently turn into NaN.
    values = np.asarray(prediction)
    if values.ndim == 0:
        # Scalar input: let pandas broadcast the original value.
        values = prediction
    elif values.shape[0] != len(submission):
        raise ValueError(
            'prediction has {} rows but the sample submission has {}'.format(
                values.shape[0], len(submission)))

    submission['Time from Pickup to Arrival'] = values
    submission.to_csv(filename)

In [6]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

### Preparing the dataset


In [7]:
# Load the three comma-separated tables from the working directory.
train, test, riders = (
    pd.read_csv(csv_name, sep=',')
    for csv_name in ('Train.csv', 'Test.csv', 'Riders.csv')
)

In [8]:
# Attach the rider columns to every order. A left join keeps all orders, even
# those whose 'Rider Id' has no match in the riders table.
# `left_index=True` was dropped: it conflicts with `left_on` and raises
# MergeError on current pandas. The join key has the same name on both sides,
# so `on=` is enough.
train = pd.merge(train, riders, how='left', on='Rider Id')
test = pd.merge(test, riders, how='left', on='Rider Id')

In [9]:
# Print summary statistics of the training frame.
train_summary = train.describe()
print(train_summary)

       Platform Type  Placement - Day of Month  Placement - Weekday (Mo = 1)  \
count   21201.000000              21201.000000                  21201.000000   
mean        2.752182                 15.653696                      3.240083   
std         0.625178                  8.798916                      1.567295   
min         1.000000                  1.000000                      1.000000   
25%         3.000000                  8.000000                      2.000000   
50%         3.000000                 15.000000                      3.000000   
75%         3.000000                 23.000000                      5.000000   
max         4.000000                 31.000000                      7.000000   

       Confirmation - Day of Month  Confirmation - Weekday (Mo = 1)  \
count                 21201.000000                     21201.000000   
mean                     15.653837                         3.240225   
std                       8.798886                         1.56722

In [10]:
# Only these four predictors are used in this model; 'Order No' becomes the row index.
FEATURE_COLUMNS = ['Distance (KM)', 'Average_Rating', 'Pickup - Weekday (Mo = 1)', 'Temperature']

X_train = train[['Order No'] + FEATURE_COLUMNS].set_index('Order No')
X_test = test[['Order No'] + FEATURE_COLUMNS].set_index('Order No')

# Replace missing values with medians computed on the training data only, so the
# test set goes through exactly the same preprocessing the model is fitted on.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [11]:
# Regression target: the 'Time from Pickup to Arrival' column of the training frame.
y_train = train.loc[:, 'Time from Pickup to Arrival']

### Model quality check

In [12]:
# The `normalize` argument was removed from Ridge in scikit-learn 1.2, so the
# feature scaling is done explicitly as a pipeline step instead.
# The old option divided each centred feature by its L2 norm, while
# StandardScaler divides by the standard deviation (L2 norm / sqrt(n_samples)),
# so alpha is multiplied by the number of samples to keep the same penalty scale.
# NOTE(review): the row count of the full training set is used here, so the fit
# on the 70% validation split is regularised slightly more than before — confirm
# this is acceptable.
RIDGE_ALPHA = 0.01
ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=RIDGE_ALPHA * len(X_train), random_state=42),
)

In [13]:
# Hold out 30% of the training rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
X_train_part, X_valid, y_train_part, y_valid = split_parts

In [14]:
# Fit on the training part only, then predict the held-out rows.
ridge.fit(X_train_part, y_train_part)
raw_valid_pred = ridge.predict(X_valid)
# Floor negative forecasts at zero, in case a delivery time is predicted negative.
ridge_valid_pred = np.where(raw_valid_pred < 0, 0, raw_valid_pred)

In [15]:
# Hold-out RMSE; the score recorded on the leaderboard was 785.648.
validation_rmse = rmse(y_valid, ridge_valid_pred)
validation_rmse

808.3919523552674

### Model training and submission creation

In [16]:
# Refit on the full training set before predicting the test orders.
ridge.fit(X_train, y_train)
ridge_prediction = ridge.predict(X_test)
# Floor negative forecasts at zero, matching the treatment of the validation
# predictions, so the submitted values are handled the same way as the ones scored.
ridge_prediction[ridge_prediction < 0] = 0

In [19]:
# Number of test predictions produced.
ridge_prediction.shape[0]

7068

In [17]:
# Save the test predictions in the sample-submission layout.
write_submission_file(filename='submission.csv', prediction=ridge_prediction)