## XGBoost

In [1]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import scipy as sp
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from collections import Counter
import ziptotimezone as zip_helper
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import graphviz 
import matplotlib.pyplot as plt
import mpu
from uszipcode import SearchEngine
import pandas as pd

In [2]:
dataset = pd.read_csv('eBay_ML_Challenge_Dataset_2021_train.csv', nrows=500000)
#dataset = pd.read_csv("partially_processed_data.csv", nrows=500000)
#other_dataset = pd.read_csv('eBay_ML_Challenge_Dataset_2021_train.csv', nrows=500000)

In [3]:
b2c_c2c = np.array(dataset["b2c_c2c"])
if b2c_c2c[0] in [0,1]:
        print("Array has already been converted to numeric binary!")
else:
    for i in range(len(b2c_c2c)):
        if b2c_c2c[i][0] == "B":
            b2c_c2c[i] = 0
        else:
            b2c_c2c[i] = 1
dataset["b2c_c2c"] = b2c_c2c

In [4]:
def round_datetime_to_date(datetime):
    days = datetime.days
    hours = datetime.seconds // 3600
    if hours > 12:
        return days + 1
    else:
        return days

def calculate_handling_and_delivery_days(acceptance_timestamps, payment_timestamps, delivery_date):
    handling_labels = []
    shipping_labels = []
    delivery_labels = []
    for i in range(acceptance_timestamps.shape[0]):
        raw_payment = payment_timestamps[i]
        raw_acceptance = acceptance_timestamps[i]
        #parse raw_payment time string to separate year, month, date, and time
        p_year, p_month, p_date = int(raw_payment[0:4]), int(raw_payment[5:7]), int(raw_payment[8:10])
        p_hour, p_min, p_sec = int(raw_payment[11:13]), int(raw_payment[14:16]), int(raw_payment[17:19])
        p_datetime = dt.datetime(year=p_year, month=p_month, day=p_date, hour=p_hour, minute=p_min, second=p_sec)
            
        #parse raw_acceptance time string to separate year, month, date, and time
        raw_acceptance = acceptance_timestamps[i]
        a_year, a_month, a_date = int(raw_acceptance[0:4]), int(raw_acceptance[5:7]), int(raw_acceptance[8:10])
        a_hour, a_min, a_sec = int(raw_acceptance[11:13]), int(raw_acceptance[14:16]), int(raw_acceptance[17:19])
        a_datetime = dt.datetime(year=a_year, month=a_month, day=a_date, hour=a_hour, minute=a_min, second=a_sec)
        
        raw_delivery = delivery_date[i]
        d_year, d_month, d_date = int(raw_delivery[0:4]), int(raw_delivery[5:7]), int(raw_delivery[8:10])
        d_date = dt.datetime(year=d_year, month=d_month, day=d_date, hour=17)
        
        #handling days = acceptance time - payment time; shipping days = delivery date - acceptance time
        handling_days = a_datetime - p_datetime
        shipping_days = d_date - a_datetime
        delivery_days = d_date - p_datetime
        
        #round to nearest day
        rounded_handling_days = round_datetime_to_date(handling_days)
        rounded_shipping_days = round_datetime_to_date(shipping_days)
        rounded_delivery_days = round_datetime_to_date(delivery_days)
        
        handling_labels.append(rounded_handling_days)
        shipping_labels.append(rounded_shipping_days)
        delivery_labels.append(rounded_delivery_days)
        
    return np.array(handling_labels), np.array(shipping_labels), np.array(delivery_labels)

In [None]:
handling_days, shipping_days, delivery_days = calculate_handling_and_delivery_days(dataset["acceptance_scan_timestamp"], dataset["payment_datetime"], dataset["delivery_date"])

In [None]:
weight = dataset["weight"]
weight_units = dataset["weight_units"]
for i, unit in enumerate(weight_units):
    if unit == 2:
        #convert weight to lbs; 1 kg = 2.20462 lbs.
        weight[i] *= 2.20462
dataset["weight"] = weight

In [None]:
weight = dataset["weight"]
category_id = dataset["category_id"]

#determine average weight by category ID 
category_id_weights = {}
for i, w in enumerate(weight):
    category = category_id[i]
    if category not in category_id_weights:
        category_id_weights[category] = [w]
    else:
        category_id_weights[category].append(w)

category_id_weight_means = {}
for category in category_id_weights:
    weights = category_id_weights[category]
    average_weight = np.mean(weights)
    category_id_weight_means[category] = average_weight

#fill missing weights
weight_means = category_id_weight_means
overall_mean = np.mean(weight)
for i, w in enumerate(weight):
    if w == 0:
        #weight is missing, replace with average weight across same category id
        category = category_id[i]
        if category in weight_means:
            weight[i] = weight_means[category]
        else:
            #don't have records for this category id, so replace with overall average
            weight[i] = overall_mean

dataset["weight"] = weight

In [None]:
#package_size = dataset["package_size"]

#if type(package_size[0]) == int:
        #print("Already converted to discrete numeric values")
#else:
#encodings = {"LETTER": 0, "PACKAGE_THICK_ENVELOPE": 1, "LARGE_ENVELOPE": 2,"VERY_LARGE_PACKAGE": 3, 
             #"LARGE_PACKAGE": 4, "EXTRA_LARGE_PACKAGE": 5, "NONE": -1}
#for i, size in enumerate(package_size):
    #if type(package_size[i]) != int:
        #package_size[i] = encodings[size]

#dataset["package_size"] = package_size

In [None]:
dataset.drop(["package_size"], axis=1, inplace=True)

In [None]:

# package_size = dataset["package_size"]
# weight = dataset["weight"]

#determine average weight by package size
# package_size_weights = {}
# for i, w in enumerate(weight):
#     p_size = package_size[i]
#     if p_size not in package_size_weights:
#         package_size_weights[p_size] = [w]
#     else:
#         package_size_weights[p_size].append(w)

# package_id_weight_means = {}
# for p_size in package_size_weights:
#     weights = package_size_weights[p_size]
#     average_weight = np.mean(weights)
#     package_id_weight_means[p_size] = average_weight

# #fill in missing package sizes
# weight_means = package_id_weight_means
# weight_means.pop(-1, None)
# weight_means_list = [weight_means[key] for key in weight_means]
# for i, s in enumerate(package_size):
#     if s == -1:
#         #package size is missing, replace with package size it's weight is closest to the average of
#         w = weight[i]
#         abs_function = lambda value: abs(value-w)
#         closest_value = min(weight_means_list, key=abs_function)
#         closest_p_size = weight_means_list.index(closest_value)
#         package_size[i] = closest_p_size

# dataset["package_size"] = package_size

In [None]:
shipment_method_id = dataset["shipment_method_id"]
carrier_min_estimate = dataset["carrier_min_estimate"]
carrier_max_estimate = dataset["carrier_max_estimate"]

#determine average shipping estimates by shipping method   
carrier_min_by_shipment_method = {}
carrier_max_by_shipment_method = {}
for i, method_id in enumerate(shipment_method_id):
    carrier_min = carrier_min_estimate[i]
    carrier_max = carrier_max_estimate[i]
    if method_id not in carrier_min_by_shipment_method:
        carrier_min_by_shipment_method[method_id] = [carrier_min]
    else:
        carrier_min_by_shipment_method[method_id].append(carrier_min)

    if method_id not in carrier_max_by_shipment_method:
        carrier_max_by_shipment_method[method_id] = [carrier_max]
    else:
        carrier_max_by_shipment_method[method_id].append(carrier_max)

carrier_min_means = {}
for method_id in carrier_min_by_shipment_method:
    min_estimates = carrier_min_by_shipment_method[method_id]
    mean_min_estimate = np.mean(min_estimates)
    carrier_min_means[method_id] = mean_min_estimate

carrier_max_means = {}
for method_id in carrier_max_by_shipment_method:
    max_estimates = carrier_max_by_shipment_method[method_id]
    mean_max_estimate = np.mean(max_estimates)
    carrier_max_means[method_id] = mean_max_estimate 
    
#fill in missing estimates
overall_min_mean, overall_max_mean = np.mean(carrier_min_estimate), np.mean(carrier_max_estimate)
for i, estimate in enumerate(carrier_min_estimate):
    if estimate < 0:
        #need to fill value 
        method_id = shipment_method_id[i]
        if method_id in carrier_min_means:
            carrier_min_estimate[i] = carrier_min_means[method_id]
        else:
            carrier_min_estimate[i] = overall_min_mean
for i, estimate in enumerate(carrier_max_estimate):
    if estimate < 0:
        #need to fill value
        method_id = shipment_method_id[i]
        if method_id in carrier_max_means:
            carrier_max_estimate[i] = carrier_max_means[method_id]
        else:
            carrier_max_estimate[i] = overall_max_mean

dataset["carrier_min_estimate"] = carrier_min_estimate
dataset["carrier_max_estimate"] = carrier_max_estimate

In [None]:
#SINCE DECLARED HANDLING DAYS IS THE MOST IMPORTANT FEATURE, I DON'T THINK 
#THIS IS A GOOD DATA FEATURE TO FILL WITH A NAIVE AVERAGE. 
#declared_handling_days = dataset["declared_handling_days"]
#seller_id = dataset["seller_id"]
#def fill_missing_declared_handling_days():
#overall_mean = np.mean(declared_handling_days)
#seller_counts = Counter(seller_id)
#for i, days in enumerate(declared_handling_days):
    #if np.isnan(days):
        #need to fill
        #declared_handling_days[i] = overall_mean

#dataset["declared_handling_days"] = declared_handling_days

In [None]:
dataset["handling_days"] = handling_days
dataset.drop(["acceptance_scan_timestamp"], axis=1, inplace=True)
dataset.drop(["payment_datetime"], axis=1, inplace=True)
dataset.drop(["delivery_date"], axis=1, inplace=True)
dataset.drop(["weight_units"], axis=1, inplace=True)

In [None]:
def get_zip_features(item_zip, buyer_zip):
    """
    Haversine formula using 'mpu' library which determines the
    great-circle distance between two points on a sphere.
    """
    if item_zip is not None and buyer_zip is not None:
        search = SearchEngine(simple_zipcode=True)

        zip1 = search.by_zipcode(item_zip[0:5])
        lat1 =zip1.lat
        long1 =zip1.lng
        pop_density1 = zip1.population_density
        median_income1 = zip1.median_household_income

        zip2 =search.by_zipcode(buyer_zip[0:5])
        lat2 =zip2.lat
        long2 =zip2.lng
        pop_density2 = zip2.population_density
        median_income2 = zip2.median_household_income

        if lat1 is None or lat2 is None or long1 is None or long2 is None:
            lat1, long1 = zip_helper.zip_to_central_lat_lon(int(item_zip))
            lat2, long2 = zip_helper.zip_to_central_lat_lon(int(buyer_zip))
                
        return mpu.haversine_distance((lat1,long1),(lat2,long2)), pop_density1, median_income1, pop_density2, median_income2
    else:
        print("item zip or buyer zip was None")
        return None


def add_zip_feature_columns(item_zip, buyer_zip, dataset):
    #item_zip_str = item_zip.apply(lambda x: str(x))
    #buyer_zip_str = buyer_zip.apply(lambda x: str(x))

    #zips = pd.concat([item_zip_str, buyer_zip_str], axis=1)
    zips = pd.concat([item_zip, buyer_zip], axis=1)
    item_z = zips["item_zip"]
    zips["distance"] = [0] * zips.shape[0]
    zips["item_zip_pop_density"] = [0] * zips.shape[0]
    zips["item_zip_median_income"] = [0] * zips.shape[0]
    zips["buyer_zip_pop_density"] = [0] * zips.shape[0]
    zips["buyer_zip_median_income"] = [0] * zips.shape[0]
    remove_indeces = []
    for i, z in enumerate(item_z):
        if i%1000 == 0:
            print("on data instance number " + str(i))
        try:
            distance, pop_density1, median_income1, pop_density2, median_income2 = get_zip_features(z, zips["buyer_zip"][i])
            zips["distance"][i] = distance
            zips["item_zip_pop_density"][i] = pop_density1
            zips["item_zip_median_income"][i] = median_income1
            zips["buyer_zip_pop_density"][i] = pop_density2
            zips["buyer_zip_median_income"][i] = median_income2
        except Exception as e:
            remove_indeces.append(i)
    dataset.drop(remove_indeces, inplace=True, axis=0)
    labels = pd.DataFrame(delivery_days, columns=["delivery_days"])
    labels.drop(remove_indeces, inplace=True, axis=0)
    zips.drop(remove_indeces, inplace=True, axis=0)
            
    #zips['distance'] = zips.apply(lambda x: get_distance(x.item_zip, x.buyer_zip), axis=1)
    return zips['distance'], zips["item_zip_pop_density"], zips["item_zip_median_income"], zips["buyer_zip_pop_density"], zips["buyer_zip_median_income"], labels

distance, item_density, item_income, buyer_density, buyer_income, labels = add_zip_feature_columns(dataset["item_zip"], dataset["buyer_zip"], dataset)
dataset["zip_distance"] = distance
dataset["item_zip_pop_density"] = item_density
dataset["item_zip_median_income"] = item_income
dataset["buyer_zip_pop_density"] = buyer_density
dataset["buyer_zip_median_income"] = buyer_income
dataset["labels"] = labels

ON DATA INSTANCE NUMBER 1174000

In [6]:
dataset.to_csv("partially_processed_data.csv", sep=",", header=dataset.columns)

In [None]:
dataset.shape

In [None]:
for i in range(1000000):
    try: 
        if type(dataset["item_zip"][i]) != int:
            dataset["item_zip"][i] = int(dataset["item_zip"][i][0:5])
            dataset["buyer_zip"][i] = int(dataset["buyer_zip"][i][0:5])
    except Exception as e:
        continue

In [None]:
features = dataset.to_numpy()
labels = np.array(labels)

In [None]:
labels.shape

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(features, labels)

In [None]:
X_train = scale(X_train, with_mean=True, with_std=True)
X_test = scale(X_test, with_mean=True, with_std=True)

In [None]:
def evaluate_loss(preds, actual):
    early_loss, late_loss = 0,0 
    for i in range(len(preds)):
        if preds[i] < actual[i]:
            #early shipment
            early_loss += actual[i] - preds[i]
        elif preds[i] > actual[i]:
            #late shipment
            late_loss += preds[i] - actual[i]
    loss = (1/len(preds)) * (0.4 * (early_loss) + 0.6 * (late_loss))
    return loss

In [None]:
print("Before fine tuning")
xgbr = xgb.XGBRegressor(verbosity=0)
print(xgbr)
xgbr.fit(X_train, Y_train)
train_score = xgbr.score(X_train, Y_train)
print("train score: " + str(train_score))
pred = xgbr.predict(X_test)
loss = evaluate_loss(pred, Y_test)
print("loss: " + str(loss))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
print("Tuning")

params = {
    # Parameters that we are going to tune.
    'max_depth':[int(x) for x in np.linspace(start=1, stop=20, num=1)],
    'min_child_weight':[0,1,2,3,4,5,6,7,8,9,10],
    'eta':[0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005],
    'subsample': [x/100 for x in np.linspace(start=1, stop=100, num=2)],
    'colsample_bytree': [x/100 for x in np.linspace(start=1, stop=100, num=2)],
}

xgb_random_search = RandomizedSearchCV(estimator=xgbr,
                                      param_distributions = params,
                                      n_iter = 100,
                                      cv=3,
                                      verbose=2,
                                      random_state=47,
                                      n_jobs=2)

xgb_random_search.fit(X_train, Y_train)
print(xgb_random_search.best_params_)

In [None]:
best_params = xgb_random_search.best_params_
subsample = best_params["subsample"]
min_child_weight = best_params["min_child_weight"]
max_depth = best_params["max_depth"]
eta = best_params["eta"]
colsample_by_tree = best_params["colsample_bytree"]

In [None]:
print("After fine tuning")
xgbr = xgb.XGBRegressor(verbosity=0, subsample=subsample, min_child_weight=min_child_weight,
                       max_depth = max_depth, eta=eta, colsample_by_tree=colsample_by_tree)
print(xgbr)
xgbr.fit(X_train, Y_train)
train_score = xgbr.score(X_train, Y_train)
print("train score: " + str(train_score))
pred = xgbr.predict(X_test)
loss = evaluate_loss(pred, Y_test)
print("loss: " + str(loss))

In [None]:
for i in range(dataset.columns.shape[0]):
    print(dataset.columns[i] + ": " + str(xgbr.feature_importances_[i]))

Let's see if we can improve performance by dropping features who have an importance of <0.3

b2c_c2c: 0.015656047
seller_id: 0.047507692
declared_handling_days: 0.042284053
shipment_method_id: 0.035130586
shipping_fee: 0.028294528
carrier_min_estimate: 0.044301696
carrier_max_estimate: 0.055198833
item_zip: 0.0597363
buyer_zip: 0.028275775
category_id: 0.022790782
item_price: 0.043802027
quantity: 0.04765062
weight: 0.05730528
package_size: 0.020258492
record_number: 0.022046657
handling_days: 0.2548077
zip_distance: 0.04024559
item_zip_pop_density: 0.026283627
item_zip_median_income: 0.04863616
buyer_zip_pop_density: 0.03418408
buyer_zip_median_income: 0.025603488

In [None]:
dataset.drop(["b2c_c2c", "quantity", "shipping_fee", "category_id", "package_size", "record_number", "item_zip_pop_density", "buyer_zip_median_income"], axis=1, inplace=True)

In [None]:
dataset

In [None]:
features = dataset.to_numpy()
labels = np.array(labels)
X_train = scale(X_train, with_mean=True, with_std=True)
X_test = scale(X_test, with_mean=True, with_std=True)

print("Before fine tuning")
xgbr = xgb.XGBRegressor(verbosity=0)
print(xgbr)
xgbr.fit(X_train, Y_train)
train_score = xgbr.score(X_train, Y_train)
print("train score: " + str(train_score))
pred = xgbr.predict(X_test)
loss = evaluate_loss(pred, Y_test)
print("loss: " + str(loss))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
print("Tuning")

params = {
    # Parameters that we are going to tune.
    'max_depth':[int(x) for x in np.linspace(start=5, stop=20, num=1)],
    'min_child_weight':[int(x) for x in np.linspace(start=1, stop=10, num=1)],
    'eta':[0.3, 0.2, 0.1, 0.05, 0.01, 0.005],
    'subsample': [x/10 for x in np.linspace(start=1, stop=10, num=1)],
    'colsample_bytree': [x/10 for x in np.linspace(start=1, stop=10, num=1)],
    'n_estimators': [int(x) for x in np.linspace(start=50, stop=500, num=50)]
}

xgb_random_search = RandomizedSearchCV(estimator=xgbr,
                                      param_distributions = params,
                                      n_iter = 100,
                                      cv=3,
                                      verbose=2,
                                      random_state=47,
                                      n_jobs=2)

xgb_random_search.fit(X_train, Y_train)
print(xgb_random_search.best_params_)

In [None]:
best_params = xgb_random_search.best_params_
subsample = best_params["subsample"]
min_child_weight = best_params["min_child_weight"]
max_depth = best_params["max_depth"]
eta = best_params["eta"]
colsample_by_tree = best_params["colsample_bytree"]
n_estimators = best_params["n_estimators"]

In [None]:
print("After fine tuning")
xgbr = xgb.XGBRegressor(verbosity=0, subsample=subsample, min_child_weight=min_child_weight,
                       max_depth = max_depth, eta=eta, colsample_by_tree=colsample_by_tree)
print(xgbr)
xgbr.fit(X_train, Y_train)
train_score = xgbr.score(X_train, Y_train)
print("train score: " + str(train_score))
pred = xgbr.predict(X_test)
loss = evaluate_loss(pred, Y_test)
print("loss: " + str(loss))

In [None]:
for i in range(dataset.columns.shape[0]):
    print(dataset.columns[i] + ": " + str(xgbr.feature_importances_[i]))

In [None]:
dataset.columns

In [None]:
dataset.drop(["declared_handling_days", "shipment_method_id", "carrier_min_estimate",
             "item_zip", "buyer_zip", "weight"], axis=1, inplace=True)

In [None]:
dataset.columns

In [None]:
features = dataset.to_numpy()
labels = np.array(labels)
X_train = scale(X_train, with_mean=True, with_std=True)
X_test = scale(X_test, with_mean=True, with_std=True)

print("Before fine tuning")
xgbr = xgb.XGBRegressor(verbosity=0)
print(xgbr)
xgbr.fit(X_train, Y_train)
train_score = xgbr.score(X_train, Y_train)
print("train score: " + str(train_score))
pred = xgbr.predict(X_test)
loss = evaluate_loss(pred, Y_test)
print("loss: " + str(loss))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
print("Tuning")

params = {
    # Parameters that we are going to tune.
    'max_depth':[int(x) for x in np.linspace(start=5, stop=20, num=1)],
    'min_child_weight':[int(x) for x in np.linspace(start=1, stop=10, num=1)],
    'eta':[0.3, 0.2, 0.1, 0.05, 0.01, 0.005],
    'subsample': [x/10 for x in np.linspace(start=1, stop=10, num=1)],
    'colsample_bytree': [x/10 for x in np.linspace(start=1, stop=10, num=1)],
    'n_estimators': [int(x) for x in np.linspace(start=50, stop=500, num=50)]
}

xgb_random_search = RandomizedSearchCV(estimator=xgbr,
                                      param_distributions = params,
                                      n_iter = 100,
                                      cv=3,
                                      verbose=2,
                                      random_state=47,
                                      n_jobs=2)

xgb_random_search.fit(X_train, Y_train)
print(xgb_random_search.best_params_)

In [None]:
best_params = xgb_random_search.best_params_
subsample = best_params["subsample"]
min_child_weight = best_params["min_child_weight"]
max_depth = best_params["max_depth"]
eta = best_params["eta"]
colsample_by_tree = best_params["colsample_bytree"]
n_estimators = best_params["n_estimators"]

In [None]:
print("After fine tuning")
tuned_xgbr = xgb.XGBRegressor(verbosity=0, subsample=subsample, min_child_weight=min_child_weight,
                       max_depth = max_depth, eta=eta, colsample_by_tree=colsample_by_tree,
                       n_estimators = n_estimators)
print(xgbr)
tuned_xgbr.fit(X_train, Y_train)
train_score = tuned_xgbr.score(X_train, Y_train)
print("train score: " + str(train_score))
pred = tuned_xgbr.predict(X_test)
loss = evaluate_loss(pred, Y_test)
print("loss: " + str(loss))