In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import math 
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn import ensemble
from sklearn.externals import joblib
import json
import warnings
warnings.filterwarnings('ignore')


import ScoringUtilities 


In [3]:
# Input locations. Every input lives in the same directory, so DATA_DIR is the
# only value that has to change to run this notebook on another machine.
# Forward slashes are used throughout (the original mixed both separators).
DATA_DIR = 'D:/Learn/DoorDash'

unlabeled_json = DATA_DIR + '/data_to_predict.json'
store_category_count_table_file = DATA_DIR + '/store_category_count_table.csv'
make_store_id_cont_table_file = DATA_DIR + '/make_store_id_cont_table.csv'
train_file = DATA_DIR + '/historical_data.csv'

# The two count tables are left-joined onto the unlabeled data further down;
# Train is the raw input handed to process_train().
store_category_count_table_df = pd.read_csv(store_category_count_table_file)
make_store_id_cont_table_df = pd.read_csv(make_store_id_cont_table_file)
Train = pd.read_csv(train_file)


In [4]:
# Load the records to be scored from the JSON file through the project helper.
# The result is used as a DataFrame below (it is merged with the count tables).
unlabeled_df = ScoringUtilities.load_unlabeled_data(unlabeled_json)

In [5]:
# Left-join the per-category and per-store count tables onto the unlabeled
# records; rows without a match keep NaN in the joined columns.
unlabeled_df_merge_1 = unlabeled_df.merge(store_category_count_table_df, left_on='store_primary_category', right_on='store_primary_category', how='left')
unlabeled_df_merge_2 = unlabeled_df_merge_1.merge(make_store_id_cont_table_df, left_on='store_id', right_on='store_id', how='left')

# Columns that must be numeric. The list is defined once, and each column
# appears once: the original repeated the whole list on both sides of the
# assignment and listed 'total_onshift_dashers' twice, which is redundant and
# makes a multi-column assignment ambiguous.
NUMERIC_COLUMNS = [
    'market_id', 'estimated_order_place_duration',
    'estimated_store_to_consumer_driving_duration',
    'max_item_price', 'min_item_price', 'num_distinct_items', 'order_protocol',
    'subtotal', 'total_onshift_dashers', 'total_busy_dashers', 'total_items',
    'total_outstanding_orders', 'store_primary_category_count', 'store_id_count',
]

# errors='coerce' turns any value that cannot be parsed as a number into NaN.
unlabeled_df_merge_2[NUMERIC_COLUMNS] = unlabeled_df_merge_2[NUMERIC_COLUMNS].apply(pd.to_numeric, errors='coerce')


In [6]:
def process_unlabel(df):
    """Run the scoring-time feature pipeline on the merged unlabeled frame.

    The ScoringUtilities steps are applied in a fixed order, each one taking
    the previous step's result: create_time_feature,
    process_continuous_features, impute_market_id, impute_order_protocol,
    make_store_category_cont_score, make_store_id_cont_score. The result is
    then passed to select_features(..., 'Test').

    Parameters
    ----------
    df : the merged unlabeled data (passed to the first step as-is).

    Returns
    -------
    tuple
        (element 0 of the select_features result,
         the fully processed frame that was handed to select_features)
    """
    timed = ScoringUtilities.create_time_feature(df)
    continuous_done = ScoringUtilities.process_continuous_features(timed)
    market_imputed = ScoringUtilities.impute_market_id(continuous_done)
    protocol_imputed = ScoringUtilities.impute_order_protocol(market_imputed)
    category_scored = ScoringUtilities.make_store_category_cont_score(protocol_imputed)
    store_scored = ScoringUtilities.make_store_id_cont_score(category_scored)

    # select_features returns an indexable result; element 0 is kept here.
    selection = ScoringUtilities.select_features(store_scored, 'Test')
    return selection[0], store_scored

In [7]:
# process_unlabel returns (selected features, fully processed frame); only the
# selected features are kept here.
unlabeled_ready = process_unlabel(unlabeled_df_merge_2)[0]

In [8]:
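# NOTE: step-by-step scratch version of process_unlabel() (defined above),
# kept for reference only. The helper names below are unqualified; the live
# code calls them as ScoringUtilities.<name>, so these lines would not run as
# written if uncommented.
#a1 = create_time_feature(unlabeled_df_merge_2)
#a = process_continuous_features(a1)
#b = impute_market_id(a)
#c = impute_order_protocol(b)
#d = make_store_category_cont_score(c)
#e = make_store_id_cont_score(d)
#unlabeled_ready = select_features(e, 'Test')[0]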

In [9]:
def process_train(df):
    """Run the training-time feature pipeline on the historical data.

    The ScoringUtilities steps are applied in a fixed order, each one taking
    the previous step's result: create_target, create_time_feature,
    process_continuous_features, impute_market_id, impute_order_protocol,
    make_store_category_cont (element 0 of its result), make_store_id_cont
    (element 0 of its result). The result is then passed to
    select_features(..., 'Train').

    Parameters
    ----------
    df : the raw training data (passed to the first step as-is).

    Returns
    -------
    Element 0 of the select_features result with its 'duration' entry
    removed.
    """
    with_target = ScoringUtilities.create_target(df)
    timed = ScoringUtilities.create_time_feature(with_target)
    continuous_done = ScoringUtilities.process_continuous_features(timed)
    market_imputed = ScoringUtilities.impute_market_id(continuous_done)
    protocol_imputed = ScoringUtilities.impute_order_protocol(market_imputed)

    # Both store-level helpers return an indexable result; only element 0 is
    # carried forward.
    category_result = ScoringUtilities.make_store_category_cont(protocol_imputed)
    store_result = ScoringUtilities.make_store_id_cont(category_result[0])

    selection = ScoringUtilities.select_features(store_result[0], 'Train')
    Train_ready = selection[0]
    # Remove 'duration' in place so it is not part of the returned features.
    del Train_ready['duration']
    return Train_ready

In [10]:
# Build the training feature frame; process_train already removes the
# 'duration' column from what it returns.
Train_ready = process_train(Train)

In [11]:

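# NOTE: step-by-step scratch version of process_train() (defined above),
# kept for reference only. The helper names below are unqualified; the live
# code calls them as ScoringUtilities.<name>, so these lines would not run as
# written if uncommented.
#a0 = create_target(Train)
#a1 = create_time_feature(a0)
#a = process_continuous_features(a1)
#b = impute_market_id(a)
#c = impute_order_protocol(b)
#d = make_store_category_cont(c)[0]
#e = make_store_id_cont(d)[0]
#Train_ready = select_features(e, 'Train')[0]
#del Train_ready['duration']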

In [12]:
# Stack training rows first and unlabeled rows last, so both sets go through
# scale_oneHot_X together.
train_plus_unlabel = pd.concat([Train_ready, unlabeled_ready], axis=0)

# Run the unlabeled pipeline and select_features once and reuse the result.
# The original ran each of them twice here, only to index two different
# elements of the same return value.
unlabeled_processed = process_unlabel(unlabeled_df_merge_2)[1]
test_selection = ScoringUtilities.select_features(unlabeled_processed, 'Test')
NumFeatures = test_selection[1]
CatFeatures = test_selection[2]

train_plus_unlabel_encoded = ScoringUtilities.scale_oneHot_X(train_plus_unlabel, NumFeatures, CatFeatures)

# The unlabeled rows were concatenated last, so they are the trailing
# unlabeled_ready.shape[0] rows.
# NOTE(review): this assumes scale_oneHot_X preserves row order - confirm.
unlabeled_ready2 = train_plus_unlabel_encoded.tail(unlabeled_ready.shape[0])

In [13]:
# Score the encoded unlabeled rows with the saved model file; the path is
# relative to the notebook's working directory.
# NOTE(review): the .pkl is presumably unpickled inside make_prediction -
# only load model files from a trusted source.
gbm_pred = ScoringUtilities.make_prediction('gbm_model_saved.pkl', unlabeled_ready2)

In [14]:
# Sanity check: number of predictions produced.
print(len(gbm_pred))

54778


In [15]:
# Peek at the first 20 predictions.
gbm_pred[:20]

array([ 3066.25951689,  2836.03493721,  3136.0031403 ,  2946.97669541,
        2602.90886164,  3313.77117553,  3055.67257941,  2423.78604448,
        2235.46169491,  2616.2855599 ,  1753.19301285,  2180.29426445,
        2669.74491498,  2806.07551361,  2411.71261746,  2419.71494104,
        2286.95359022,  2297.98029943,  2623.07913919,  2625.6808472 ])

In [19]:
# Same first 20 predictions as the previous cell, printed as labelled plain text.
print("First 20 records are: \n {} ".format(gbm_pred[:20]) )

First 20 records are: 
 [ 3066.25951689  2836.03493721  3136.0031403   2946.97669541  2602.90886164
  3313.77117553  3055.67257941  2423.78604448  2235.46169491  2616.2855599
  1753.19301285  2180.29426445  2669.74491498  2806.07551361  2411.71261746
  2419.71494104  2286.95359022  2297.98029943  2623.07913919  2625.6808472 ] 
