# Sarter Notebook

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import haversine as hs

## Reading Data

In [68]:
# read train data
df_orders = pd.read_csv("orders.csv", sep=";")
df_products = pd.read_csv("product_attributes.csv", sep=",")
df_dists = pd.read_csv("cities_data.csv", sep=";")
df_ordersXproducts = pd.read_csv('ordersXproducts.csv')
del df_ordersXproducts['Unnamed: 0']

# read test data
df_test = pd.read_csv("test.csv", sep=";")

## Merge tables

In [69]:
df_coord = df_dists[['city_from_name', 'city_from_coord']].drop_duplicates().reset_index().drop(columns=['index']).copy()

In [70]:
df_alt = df_dists.copy()

c2 = df_alt['city_from_name'].values.copy()
c1 = df_alt['city_to_name'].values.copy()
c4 = df_alt['city_from_coord'].values.copy()
c3 = df_alt['city_to_coord'].values.copy()

df_alt['city_from_name'] = c1
df_alt['city_to_name'] = c2
df_alt['city_from_coord'] = c3
df_alt['city_to_coord'] = c4

df_dists = pd.concat([df_dists, df_alt]).reset_index().drop(columns=['index'])

In [71]:
df = pd.merge(df_ordersXproducts, df_dists, how='left', left_on=['origin_port', 'logistic_hub'], right_on=['city_from_name', 'city_to_name'])
del df['city_from_name']
del df['city_to_name']
del df['city_from_coord']
del df['city_to_coord']
df = df.rename(columns={'distance': 'dist_origin_hub'})

df = pd.merge(df, df_dists, how='left', left_on=['logistic_hub', 'customer'], right_on=['city_from_name', 'city_to_name'])
del df['city_from_name']
del df['city_to_name']
del df['city_from_coord']
del df['city_to_coord']
df = df.rename(columns={'distance': 'dist_hub_customer'})

df = pd.merge(df, df_dists, how='left', left_on=['origin_port', 'customer'], right_on=['city_from_name', 'city_to_name'])
del df['city_from_name']
del df['city_to_name']
del df['city_from_coord']
del df['city_to_coord']
df = df.rename(columns={'distance': 'dist_origin_customer'})

df = pd.merge(df, df_coord, how='left', left_on=['origin_port'], right_on=['city_from_name'])
del df['city_from_name']
df = df.rename(columns={'city_from_coord': 'origin_port_coord'})

df = pd.merge(df, df_coord, how='left', left_on=['logistic_hub'], right_on=['city_from_name'])
del df['city_from_name']
df = df.rename(columns={'city_from_coord': 'logistic_hub_coord'})

df = pd.merge(df, df_coord, how='left', left_on=['customer'], right_on=['city_from_name'])
del df['city_from_name']
df = df.rename(columns={'city_from_coord': 'customer_coord'})

In [72]:
for i in range(len(df)):
    if (not pd.isna(df.at[i, 'dist_origin_hub'])) and (not pd.isna(df.at[i, 'dist_hub_customer'])):
        df.at[i, 'dist_origin_customer'] = df.at[i, 'dist_origin_hub'] + df.at[i, 'dist_hub_customer']

In [76]:
df.isnull().sum()

order_id                   0
origin_port                0
3pl                        0
customs_procedures         0
logistic_hub            1049
customer                   0
product_id                 0
units                      0
late_order                 0
weight                   118
material_handling        118
dist_origin_hub         1049
dist_hub_customer       1049
dist_origin_customer       0
origin_port_coord          0
logistic_hub_coord      1049
customer_coord          1112
dtype: int64

In [74]:
x = df[~df['logistic_hub'].isna()]
indexes = x[x['dist_hub_customer'].isna()].index

for i in indexes:
    df.at[i, 'dist_hub_customer'] = 0

In [75]:
x = df[df['dist_origin_customer'].isna()]
indexes = x.index

for i in indexes:
    df.at[i, 'dist_origin_customer'] = 0

In [78]:
df.to_csv('finaldf.csv')

# Data preprocessing

In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [3]:
df = pd.read_csv('Datasets/ordersXproductsXdistances.csv')
df['late_order'] = df['late_order'].astype(float)
not_useful = ['Unnamed: 0', 'order_id', 'origin_port_coord',
       'logistic_hub_coord', 'dist_origin_hub', 'customer_coord',
       'dist_hub_customer', 'dist_origin_customer']
for col in not_useful:
    del df[col]
df = df.dropna()

In [19]:
len(df)

113109

In [10]:
categorical_columns = ['origin_port', '3pl', 'customs_procedures', 'logistic_hub', 'product_id', 'customer', 'material_handling']
numerical_columns = ['units', 'weight']

target = df['late_order']
data = df.drop(columns=['late_order'])

In [9]:
data

Unnamed: 0,origin_port,3pl,customs_procedures,logistic_hub,customer,product_id,units,weight,material_handling
0,Rotterdam,v_002,DTP,Venlo,Marseille,1692723,583,1778.0,5.0
1,Rotterdam,v_004,CRF,Rome,Marseille,1644308,459,1088.0,3.0
2,Athens,v_002,CRF,Venlo,Paris,1684170,464,505.0,4.0
3,Rotterdam,v_004,CRF,Lille,Milan,1620510,678,1308.0,4.0
4,Barcelona,v_002,CRF,Venlo,Berlin,1699372,353,1465.0,0.0
...,...,...,...,...,...,...,...,...,...
114271,Rotterdam,v_002,CRF,Dusseldorf,Bordeaux,1681376,645,1896.0,3.0
114272,Barcelona,v_004,DTD,Dusseldorf,Berlin,1676942,502,746.0,1.0
114273,Rotterdam,v_002,DTP,Dusseldorf,Rome,1692737,464,572.0,5.0
114274,Barcelona,v_003,DTD,Dusseldorf,Munich,1699974,388,1894.0,1.0


In [21]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])



In [31]:
model = make_pipeline(preprocessor, LogisticRegression(max_iter=50))
cv_results = cross_validate(model, data, target, cv=5)
y.append(cv_results['test_score'].mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [33]:
preprocessor.fit_transform(data).todense()

matrix([[ 0.        ,  0.        ,  0.        , ...,  1.        ,
          1.43394552,  0.99534057],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         -0.37867728, -0.34955836],
        [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         -0.30558766, -1.48590049],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         -0.30558766, -1.35530886],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         -1.41655002,  1.22143952],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         -0.04246499,  0.07925   ]])