In [1]:
# import libraries
import numpy as np
from PIL import Image
from xgboost import XGBRegressor
from sklearn import preprocessing
import pandas as pd

In [2]:
# Load the cleaned data set, restricted to the most important features.
packages = pd.read_csv(
    "../data/packages_final.csv",
    usecols=[
        "mean_age",
        "total_pop",
        "month",
        "weekday",
        "packages",
        "distance_to_nearest_city",
    ],
)
packages.head()

Unnamed: 0,month,weekday,packages,mean_age,total_pop,distance_to_nearest_city
0,January,DI,403,39.0,1389.0,8.133982
1,January,DO,359,39.0,1389.0,8.133982
2,January,FR,357,39.0,1389.0,8.133982
3,January,MI,346,39.0,1389.0,8.133982
4,January,MO,230,39.0,1389.0,8.133982


In [3]:
# Split the frame into the dependent variable (target) and the independent variables (features).
# "month" and "weekday" are left out of X_data: they hold strings and cannot be standardised.
target = packages["packages"]
X_data = packages.drop(["packages", "month", "weekday"], axis=1)

# Standardise the independent variables.
standard = preprocessing.scale(X_data)
standard

array([[-2.0418053 , -0.54510433, -0.46911716],
       [-2.0418053 , -0.54510433, -0.46911716],
       [-2.0418053 , -0.54510433, -0.46911716],
       ...,
       [-1.24350105, -0.34205996, -0.54837119],
       [-1.24350105, -0.34205996, -0.54837119],
       [-1.24350105, -0.34205996, -0.54837119]])

In [4]:
# Some columns hold strings, but the model needs a data set that contains
# numerical values only, so the string columns are one-hot encoded.

# collect all columns with object (string) dtype
str_cols = [col for col in packages.columns if packages[col].dtype == "O"]

# create the one-hot encoded dummy variables
# NOTE: this rebinds `packages`; the string columns are replaced by their dummies, so the cell above
# (which drops "month" and "weekday") cannot be re-run without reloading the CSV first
packages = pd.get_dummies(packages, columns=str_cols)

# turn the standardised array (`standard`) back into a data frame, reusing the column names of X_data
packages_st = pd.DataFrame(standard, columns=X_data.columns)
packages_st.head()

Unnamed: 0,mean_age,total_pop,distance_to_nearest_city
0,-2.041805,-0.545104,-0.469117
1,-2.041805,-0.545104,-0.469117
2,-2.041805,-0.545104,-0.469117
3,-2.041805,-0.545104,-0.469117
4,-2.041805,-0.545104,-0.469117


In [6]:
# frame without the three numeric features: what remains is the target plus the dummy variables
dummy = packages.drop(columns=["mean_age", "total_pop", "distance_to_nearest_city"])

# append the target and the month/weekday dummy columns to the standardised features
# NOTE: `packages_st` is reassigned from itself, so running this cell twice appends the columns twice
packages_st = pd.concat([packages_st, dummy], axis=1)
packages_st.head()

Unnamed: 0,mean_age,total_pop,distance_to_nearest_city,packages,month_April,month_August,month_December,month_February,month_January,month_July,...,month_November,month_October,month_September,weekday_DI,weekday_DO,weekday_FR,weekday_MI,weekday_MO,weekday_SA,weekday_SO
0,-2.041805,-0.545104,-0.469117,403,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,-2.041805,-0.545104,-0.469117,359,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,-2.041805,-0.545104,-0.469117,357,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,-2.041805,-0.545104,-0.469117,346,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,-2.041805,-0.545104,-0.469117,230,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Split X and y: X holds the independent variables (features), y the target variable
X = packages_st.drop("packages", axis=1)
y = packages_st[["packages"]]

# Fit the model. XGBRegressor was chosen because it was the most successful model in previous testing.
# NOTE(review): the output stored below this cell reports n_estimators=100, not 134 - re-run to refresh it.
model = XGBRegressor(n_estimators = 134)
model.fit(X, y)
# No train-test split here: we already know XGBoost works well for our data and we want to maximise its
# prediction rate, so the model is trained on all of the data.

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [8]:
# import libraries
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

# Check how well the prediction works.
# Note: X and y are the same data the model was fitted on, so these are in-sample (training) scores,
# not an estimate of how well the model generalises.
ypred = model.predict(X)
print("R2: {:.2f} %".format(r2_score(y, ypred)*100))
print("MAE: {:.2f}".format(mae(y, ypred)))
print("MSE: {:.2f}".format(mse(y, ypred)))

R2: 99.73 %
MAE: 116.38
MSE: 35608.08


In [15]:
# Smoke test: predicting for a hand-built feature row works.
# The row follows the column order of X: three standardised numeric features first,
# then the month dummies and finally the weekday dummies (22 values in total).
pred = np.zeros((1, 22))
pred[0, :3] = [-2, -0.40, 0.80]  # mean_age, total_pop, distance_to_nearest_city
pred[0, 7] = 1                   # month_January
pred[0, 15] = 1                  # weekday_DI
model.predict(pred)

array([738.1235], dtype=float32)

In [10]:
# Second test: same input as before, but the day is changed from Tuesday to Sunday.
# -> a much smaller output is expected (far fewer packages are delivered on a Sunday than on a Tuesday)
# -> it works
pred2 = np.zeros((1, 22))
pred2[0, :3] = [-2, -0.40, 0.80]  # mean_age, total_pop, distance_to_nearest_city
pred2[0, 7] = 1                   # month_January
pred2[0, 21] = 1                  # weekday_SO
model.predict(pred2)

array([76.500885], dtype=float32)

In [11]:
# defining the functions we later use in the GUI file
# function for transforming the data (standardize, split into X and y, ...)
def transform_data(df):
    """Standardise the numeric features, one-hot encode the string columns and split X from y.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column "packages" and the columns "month" and "weekday".
        Every other column is treated as a numeric feature and standardised.

    Returns
    -------
    X : pandas.DataFrame
        The standardised numeric features followed by the dummy columns, indexed like ``df``.
    y : pandas.DataFrame
        Single-column frame holding "packages", indexed like ``df``.

    Raises
    ------
    KeyError
        If "packages", "month" or "weekday" is missing from ``df``.
    """
    numeric = df.drop(columns=["packages", "month", "weekday"])

    # Keep the caller's index: pd.concat(axis=1) aligns on index labels, so a fresh RangeIndex
    # would yield NaN-padded, misaligned rows for any frame that was filtered or re-indexed.
    scaled = pd.DataFrame(
        preprocessing.scale(numeric),
        columns=numeric.columns,
        index=numeric.index,
    )

    # Treat both object columns and pandas' dedicated string dtype as categorical.
    str_cols = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_string_dtype(dtype)
    ]
    encoded = pd.get_dummies(df, columns=str_cols)

    # Drop exactly the columns that were standardised, so they appear only once in the result.
    remainder = encoded.drop(columns=numeric.columns)
    combined = pd.concat([scaled, remainder], axis=1)

    X = combined.drop("packages", axis=1)
    y = combined[["packages"]]
    return X, y

# function for the ML part    
def predict_packages(X, y, pred):
    """Fit a fresh XGBRegressor on (X, y) and return the per-day prediction for the first row of pred.

    X and y are the training features and target, pred holds the feature row(s) to predict for.
    Only the prediction for the first row is used.
    """
    regressor = XGBRegressor()
    regressor.fit(X, y)
    monthly_total = regressor.predict(pred)[0]
    # The Post aggregated the packages of all identical weekdays within a month, so the value is
    # divided by 4 to get the prediction for a single day of that month.
    return monthly_total / 4

# activation function
# as only predictions >= 0 make sense, we apply the rectified linear function (ReLU) to the prediction
# pred < 0: output 0, pred >= 0: output pred
def relu(x):
    """Clamp a prediction at zero: return x when it is positive, otherwise 0.0."""
    return x if x > 0.0 else 0.0

In [12]:
# Check that our functions work -> they do.
# The CSV is read again; presumably because an earlier cell replaced `packages` with its one-hot
# encoded version, while transform_data expects the raw frame with its string columns.
packages = pd.read_csv("../data/packages_final.csv", usecols=["mean_age", "total_pop", "month", "weekday", "packages", "distance_to_nearest_city"])

# hand-built feature row in the column order of X: index 7 is month_January, index 21 is weekday_SO
pred2 = np.array([[-2, -1.40, 0.80, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

X, y = transform_data(packages)
a = predict_packages(X, y, pred2)
print(a)
# the raw prediction is negative here, so relu clamps it to 0.0
relu(a)

-32.456085205078125


0.0