# Sarter Notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading Data

In [2]:
# read train data
df_orders = pd.read_csv("orders.csv", sep=";")
df_products = pd.read_csv("product_attributes.csv", sep=",")
df_dists = pd.read_csv("cities_data.csv", sep=";")

# read test data
df_test = pd.read_csv("test.csv", sep=";")

# Clean the data

In [60]:
df_orders["origin_port"] = df_orders["origin_port"].replace(["ATHENAS"], "Athens")
df_orders["origin_port"] = df_orders["origin_port"].replace(["BCN"], "Barcelona")
df_orders["late_order"] = df_orders["late_order"].astype(int)

# Analaize the data

In [61]:
df_orders.head()


Unnamed: 0,order_id,origin_port,3pl,customs_procedures,logistic_hub,customer,product_id,units,late_order
0,366c7a3d298f,Rotterdam,v_002,DTP,Venlo,Marseille,1692723,583,1
1,45f906331e10,Rotterdam,v_004,CRF,Rome,Marseille,1644308,459,0
2,ac84a6e4af0f,Athens,v_002,CRF,Venlo,Paris,1684170,464,1
3,f5e98cb29790,Rotterdam,v_004,CRF,Lille,Milan,1620510,678,0
4,a9e7c9bee35b,Barcelona,v_002,CRF,Venlo,Berlin,1699372,353,0


In [62]:
counts1 = df_orders["late_order"].groupby(df_orders["3pl"]).value_counts()
# For every 3pl, print the percentage of late orders
for i in range(1, 5):
    print(f"v_00{i}", ":", counts1[f"v_00{i}"][0]/(counts1[f"v_00{i}"][0]+counts1[f"v_00{i}"][1]))

v_001 : 0.6179014667168108
v_002 : 0.8005533611227406
v_003 : 0.7497475840184624
v_004 : 0.7430507101217673


In [63]:
counts2 = df_orders["late_order"].groupby(df_orders["customs_procedures"]).value_counts()
print(counts2)
# For every custom_procedures, print the percentage of late orders
for name in ["CRF", "DTD", "DTP"]:
   print(name, ":", counts2[name][0]/(counts2[name][0]+counts2[name][1]))

customs_procedures  late_order
CRF                 0             41274
                    1             10203
DTD                 0             32548
                    1             10936
DTP                 0             13298
                    1              6017
Name: late_order, dtype: int64
CRF : 0.801794976397226
DTD : 0.7485051973139546
DTP : 0.6884804556044525


In [64]:
counts3 = df_orders["customs_procedures"].groupby(df_orders["3pl"]).value_counts()
print(counts3)

3pl    customs_procedures
v_001  CRF                    3686
       DTD                    2960
       DTP                    1331
v_002  CRF                   24595
       DTD                   21035
       DTP                    9307
v_003  CRF                    3103
       DTD                    2658
       DTP                    1172
v_004  CRF                   20093
       DTD                   16831
       DTP                    7505
Name: customs_procedures, dtype: int64


In [None]:
df_orders.describe()

# ML

### Split the data

In [72]:
import sklearn as sk
X = df_orders.drop(['late_order', 'order_id', 'product_id', 'units'], axis=1)
y = np.array(df_orders['late_order'])

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
print(y_test)

[1 0 0 ... 0 0 0]


### SVM

In [73]:
import sklearn.preprocessing as skp

enc = skp.OneHotEncoder(handle_unknown='ignore')
enc.fit(X_test)
enc.categories_
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)
print(X_train_enc)


  (0, 2)	1.0
  (0, 4)	1.0
  (0, 9)	1.0
  (0, 12)	1.0
  (0, 33)	1.0
  (1, 0)	1.0
  (1, 4)	1.0
  (1, 7)	1.0
  (1, 14)	1.0
  (1, 38)	1.0
  (2, 2)	1.0
  (2, 6)	1.0
  (2, 9)	1.0
  (2, 11)	1.0
  (2, 23)	1.0
  (3, 0)	1.0
  (3, 4)	1.0
  (3, 7)	1.0
  (3, 13)	1.0
  (3, 21)	1.0
  (4, 2)	1.0
  (4, 6)	1.0
  (4, 7)	1.0
  (4, 17)	1.0
  (4, 38)	1.0
  :	:
  (91415, 2)	1.0
  (91415, 4)	1.0
  (91415, 7)	1.0
  (91415, 16)	1.0
  (91415, 47)	1.0
  (91416, 0)	1.0
  (91416, 4)	1.0
  (91416, 8)	1.0
  (91416, 18)	1.0
  (91416, 28)	1.0
  (91417, 0)	1.0
  (91417, 6)	1.0
  (91417, 7)	1.0
  (91417, 18)	1.0
  (91417, 27)	1.0
  (91418, 2)	1.0
  (91418, 4)	1.0
  (91418, 7)	1.0
  (91418, 16)	1.0
  (91418, 44)	1.0
  (91419, 2)	1.0
  (91419, 4)	1.0
  (91419, 8)	1.0
  (91419, 17)	1.0
  (91419, 28)	1.0


In [75]:
import sklearn.svm as svm

model = svm.SVC(kernel='linear', C=1.0)
model.fit(X_train_enc, y_train)
print("Trained!!")
y_pred = model.predict(X_test_enc)
print("Predicted!")

accuracy = sk.metrics.accuracy_score(y_test, y_pred)


Trained!!
Predicted!


In [76]:
print(accuracy)

0.7663195659782989


## Save Solution
> **Note:** Submited probabilities must be for the **True** cases.

In [None]:
submission = pd.DataFrame({"order_id": X_test.order_id, "late_order": pred_proba[:,1]})
submission.to_csv("submission_kaggle.csv", index=False)