## Feature Selection and Engineering 
In this file, we attempt to understand feature importance.

First, we can analyze the coefficients learned by our naive linear regression network to see which features were weighted most highly

In [1]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import scipy as sp
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from collections import Counter
import ziptotimezone as z
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import mpu
from uszipcode import SearchEngine
import pandas as pd

In [13]:
# DATA CLEANING 
dataset = pd.read_csv('eBay_ML_Challenge_Dataset_2021_train.csv')

b2c_c2c = np.array(dataset["b2c_c2c"])
seller_id = np.array(dataset["seller_id"])
declared_handling_days = np.array(dataset["declared_handling_days"])
acceptance_scan_timestamp = np.array(dataset["acceptance_scan_timestamp"])
shipment_method_id = np.array(dataset["shipment_method_id"])
shipping_fee = np.array(dataset["shipping_fee"])
carrier_min_estimate = np.array(dataset["carrier_min_estimate"])
carrier_max_estimate = np.array(dataset["carrier_max_estimate"])
item_zip = dataset["item_zip"]
buyer_zip = dataset["buyer_zip"]
category_id = np.array(dataset["category_id"])
item_price = np.array(dataset["item_price"])
quantity = np.array(dataset["quantity"])
payment_datetime = np.array(dataset["payment_datetime"])
delivery_date = np.array(dataset["delivery_date"])
weight = np.array(dataset["weight"])
weight_units = np.array(dataset["weight_units"])
package_size = np.array(dataset["package_size"])

def b2c_c2c_to_binary(arr):
    if arr[0] in [0,1]:
        print("Array has already been converted to numeric binary!")
    else:
        for i in range(arr.shape[0]):
            if arr[i][0] == "B":
                arr[i] = 0
            else:
                arr[i] = 1
            
b2c_c2c_to_binary(b2c_c2c)
b2c_c2c = np.array(b2c_c2c, dtype=int)

def round_datetime_to_date(datetime):
    days = datetime.days
    hours = datetime.seconds // 3600
    if hours > 12:
        return days + 1
    else:
        return days

def calculate_handling_and_delivery_days(acceptance_timestamps, payment_timestamps, delivery_date):
    handling_labels = []
    shipping_labels = []
    delivery_labels = []
    for i in range(acceptance_timestamps.shape[0]):
        raw_payment = payment_timestamps[i]
        raw_acceptance = acceptance_timestamps[i]
        #parse raw_payment time string to separate year, month, date, and time
        p_year, p_month, p_date = int(raw_payment[0:4]), int(raw_payment[5:7]), int(raw_payment[8:10])
        p_hour, p_min, p_sec = int(raw_payment[11:13]), int(raw_payment[14:16]), int(raw_payment[17:19])
        p_datetime = dt.datetime(year=p_year, month=p_month, day=p_date, hour=p_hour, minute=p_min, second=p_sec)
            
        #parse raw_acceptance time string to separate year, month, date, and time
        raw_acceptance = acceptance_timestamps[i]
        a_year, a_month, a_date = int(raw_acceptance[0:4]), int(raw_acceptance[5:7]), int(raw_acceptance[8:10])
        a_hour, a_min, a_sec = int(raw_acceptance[11:13]), int(raw_acceptance[14:16]), int(raw_acceptance[17:19])
        a_datetime = dt.datetime(year=a_year, month=a_month, day=a_date, hour=a_hour, minute=a_min, second=a_sec)
        
        raw_delivery = delivery_date[i]
        d_year, d_month, d_date = int(raw_delivery[0:4]), int(raw_delivery[5:7]), int(raw_delivery[8:10])
        d_date = dt.datetime(year=d_year, month=d_month, day=d_date, hour=17)
        
        #handling days = acceptance time - payment time; shipping days = delivery date - acceptance time
        handling_days = a_datetime - p_datetime
        shipping_days = d_date - a_datetime
        delivery_days = d_date - p_datetime
        
        #round to nearest day
        rounded_handling_days = round_datetime_to_date(handling_days)
        rounded_shipping_days = round_datetime_to_date(shipping_days)
        rounded_delivery_days = round_datetime_to_date(delivery_days)
        
        handling_labels.append(rounded_handling_days)
        shipping_labels.append(rounded_shipping_days)
        delivery_labels.append(rounded_delivery_days)
        
    return np.array(handling_labels), np.array(shipping_labels), np.array(delivery_labels)

handling_days, shipping_days, delivery_days = calculate_handling_and_delivery_days(acceptance_scan_timestamp, payment_datetime, delivery_date) 

def convert_weights():
    for i, unit in enumerate(weight_units):
        if unit == 2:
            #convert weight to lbs; 1 kg = 2.20462 lbs.
            weight[i] *= 2.20462

convert_weights()

def determine_weight_averages_by_category_id():
    category_id_weights = {}
    for i, w in enumerate(weight):
        category = category_id[i]
        if category not in category_id_weights:
            category_id_weights[category] = [w]
        else:
            category_id_weights[category].append(w)
    
    category_id_weight_means = {}
    for category in category_id_weights:
        weights = category_id_weights[category]
        average_weight = np.mean(weights)
        category_id_weight_means[category] = average_weight
    
    return category_id_weight_means

def fill_missing_weights():
    weight_means = determine_weight_averages_by_category_id()
    overall_mean = np.mean(weight)
    for i, w in enumerate(weight):
        if w == 0:
            #weight is missing, replace with average weight across same category id
            category = category_id[i]
            if category in weight_means:
                weight[i] = weight_means[category]
            else:
                #don't have records for this category id, so replace with overall average
                weight[i] = overall_mean

fill_missing_weights()

def string_to_numeric_package_size():
    if type(package_size[0]) == int:
        print("Already converted to discrete numeric values")
    else:
        encodings = {"LETTER": 0, "PACKAGE_THICK_ENVELOPE": 1, "LARGE_ENVELOPE": 2,"VERY_LARGE_PACKAGE": 3, 
                     "LARGE_PACKAGE": 4, "EXTRA_LARGE_PACKAGE": 5, "NONE": -1}
        for i, size in enumerate(package_size):
            package_size[i] = encodings[size]
string_to_numeric_package_size()

def determine_average_weight_by_package_size():
    package_size_weights = {}
    for i, w in enumerate(weight):
        p_size = package_size[i]
        if p_size not in package_size_weights:
            package_size_weights[p_size] = [w]
        else:
            package_size_weights[p_size].append(w)
    
    package_id_weight_means = {}
    for p_size in package_size_weights:
        weights = package_size_weights[p_size]
        average_weight = np.mean(weights)
        package_id_weight_means[p_size] = average_weight
    
    return package_id_weight_means

def fill_missing_package_sizes():
    weight_means = determine_average_weight_by_package_size()
    weight_means.pop(-1, None)
    weight_means_list = [weight_means[key] for key in weight_means]
    for i, s in enumerate(package_size):
        if s == -1:
            #package size is missing, replace with package size it's weight is closest to the average of
            w = weight[i]
            abs_function = lambda value: abs(value-w)
            closest_value = min(weight_means_list, key=abs_function)
            closest_p_size = weight_means_list.index(closest_value)
            package_size[i] = closest_p_size

fill_missing_package_sizes()

def determine_average_shipping_estimates_by_shipment_method():    
    carrier_min_by_shipment_method = {}
    carrier_max_by_shipment_method = {}
    for i, method_id in enumerate(shipment_method_id):
        carrier_min = carrier_min_estimate[i]
        carrier_max = carrier_max_estimate[i]
        if method_id not in carrier_min_by_shipment_method:
            carrier_min_by_shipment_method[method_id] = [carrier_min]
        else:
            carrier_min_by_shipment_method[method_id].append(carrier_min)

        if method_id not in carrier_max_by_shipment_method:
            carrier_max_by_shipment_method[method_id] = [carrier_max]
        else:
            carrier_max_by_shipment_method[method_id].append(carrier_max)
    
    carrier_min_means = {}
    for method_id in carrier_min_by_shipment_method:
        min_estimates = carrier_min_by_shipment_method[method_id]
        mean_min_estimate = np.mean(min_estimates)
        carrier_min_means[method_id] = mean_min_estimate
    
    carrier_max_means = {}
    for method_id in carrier_max_by_shipment_method:
        max_estimates = carrier_max_by_shipment_method[method_id]
        mean_max_estimate = np.mean(max_estimates)
        carrier_max_means[method_id] = mean_max_estimate 
    
    return carrier_min_means, carrier_max_means

def fill_missing_carrier_estimates():
    #consider replacing missing values with estimates with similar distance
    carrier_min_means, carrier_max_means = determine_average_shipping_estimates_by_shipment_method()
    overall_min_mean, overall_max_mean = np.mean(carrier_min_estimate), np.mean(carrier_max_estimate)
    for i, estimate in enumerate(carrier_min_estimate):
        if estimate < 0:
            #need to fill value 
            method_id = shipment_method_id[i]
            if method_id in carrier_min_means:
                carrier_min_estimate[i] = carrier_min_means[method_id]
            else:
                carrier_min_estimate[i] = overall_min_mean
    for i, estimate in enumerate(carrier_max_estimate):
        if estimate < 0:
            #need to fill value
            method_id = shipment_method_id[i]
            if method_id in carrier_max_means:
                carrier_max_estimate[i] = carrier_max_means[method_id]
            else:
                carrier_max_estimate[i] = overall_max_mean

fill_missing_carrier_estimates()

def fill_missing_declared_handling_days():
    overall_mean = np.mean(declared_handling_days)
    seller_counts = Counter(seller_id)
    for i, days in enumerate(declared_handling_days):
        if np.isnan(days):
            #need to fill
            declared_handling_days[i] = overall_mean

features = np.column_stack((b2c_c2c, seller_id, declared_handling_days, shipment_method_id, shipping_fee,
                             carrier_min_estimate, carrier_max_estimate, category_id,
                             item_price, weight, quantity, package_size, handling_days))
labels = np.array(delivery_days)

In [14]:
print(features.shape)
features = scale(features, with_mean = True, with_std=True)
print(features.shape)
indeces = ~np.isnan(features).any(axis=1)
print(indeces)
features = features[~np.isnan(features).any(axis=1)]
print(features.shape)
labels = labels[indeces]
handling_days = handling_days[indeces]
shipping_days = shipping_days[indeces]
print(labels.shape)

(15000000, 13)




(15000000, 13)
[ True  True  True ...  True  True  True]
(14297114, 13)
(14297114,)


In [15]:
features = features[:, :-1]

In [16]:
model = LinearRegression()
model.fit(features, labels)

LinearRegression()

In [17]:
print(model.coef_)

[-8.09795735e-02  6.55274933e-02  8.86068784e-01  2.11399948e-02
 -6.52588741e-02  1.51823145e-01  4.41767331e-01  1.02764189e-01
 -2.32173062e-02 -4.01198415e-04  9.72705709e-03  6.45949975e-02]


What if we use lasso regression? An $l_1$ norm will push irrelevant features weights to 0. This will tell us which features are unimportant to the shipping prediction

In [18]:
lasso_model = linear_model.Lasso(alpha=1.0)
lasso_model.fit(features, labels)

Lasso()

In [19]:
print(lasso_model.coef_)

[ 0.  0.  0.  0.  0. -0.  0.  0. -0.  0. -0.  0.]


Finally, let's use the visualization package __ to visualize the trees learned by XGBoost

Now that we've learned more about feature importance for the entire delivery prediction (handling time + shipping time), it may be useful to look into whether some features are only important for the handling portion and don't affect the shipment portion, and vice versa. So we will apply the tactics above both (1) when the labels are only the handling time, and (2) when the labels are only the shipment time.

In [20]:
handling_linreg = LinearRegression()
handling_linreg.fit(features, handling_days)
print(handling_linreg.coef_)

[-0.07389143  0.05425154  0.86970894 -0.01883299 -0.03151593  0.00414523
  0.02560026  0.02273071  0.00155812  0.0014169   0.0193101  -0.01256442]


In [21]:
shipping_linreg = LinearRegression()
shipping_linreg.fit(features, shipping_days)
print(shipping_linreg.coef_)

[-0.01864717  0.01194991  0.01481685  0.03913644 -0.03807706  0.14917728
  0.4133649   0.07600417 -0.02316963 -0.00150535 -0.00724451  0.07959303]


In [22]:
lasso_handling = linear_model.Lasso(alpha=0.05)
lasso_handling.fit(features, handling_days)
print(lasso_handling.coef_)

[-0.          0.          0.81013996 -0.         -0.          0.
  0.          0.          0.          0.          0.         -0.        ]


In [23]:
lasso_shipping = linear_model.Lasso(alpha=0.15)
lasso_shipping.fit(features, shipping_days)
print(lasso_shipping.coef_)

[-0.         -0.          0.          0.         -0.          0.
  0.22663033  0.         -0.         -0.         -0.          0.        ]


In [27]:
from collections import Counter
c = Counter(buyer_zip)
print(c["00000"])

11
