# Datos

In [8]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# URL: https://drive.google.com/file/d/1fgjyxBxJK9vB-Cygb4jwMHQFMbdUAV-6
!gdown --id 1fgjyxBxJK9vB-Cygb4jwMHQFMbdUAV-6


Downloading...
From: https://drive.google.com/uc?id=1fgjyxBxJK9vB-Cygb4jwMHQFMbdUAV-6
To: /content/e_commerce_shiping_train_preprocessed.csv
100% 1.06M/1.06M [00:00<00:00, 62.8MB/s]


In [9]:
data = pd.read_csv('e_commerce_shiping_train_preprocessed.csv',encoding="utf-16")

# División del dataset

In [11]:
y = data['Reached.on.Time_Y.N']
X = data.drop('Reached.on.Time_Y.N', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1, stratify=data['Reached.on.Time_Y.N'])

# Regresión lineal

Creamos un objeto model de la clase LogisticRegression. Con ésto, estamos generando una expresión matemática con coeficientes sin fijar, que después del entrenamiento con la función "fit" ya sí tendrán valores fijos

In [13]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train,y_train)

print(model.intercept_)   #podemos ver el sesgo
print(model.coef_)        #podemos ver el peso de las características

print('Model score: ', model.score(X_test,y_test))

0.9531502878998763
[-3.88680663e-05 -1.17827205e-02  4.97416221e-03  1.34017187e-05
 -5.91697571e-03  1.13431101e-02  5.65299850e-03 -4.37479316e-05
 -7.71392914e-03  6.55691505e-03 -3.06532412e-03  2.82232596e-03
  1.40001225e-03  9.81441787e-03 -4.39787895e-03 -5.41653892e-03
  3.11859619e-02 -1.20178244e-02 -1.91681375e-02]
Model score:  0.20916603100272835


In [14]:
from sklearn.linear_model import Lasso

model = Lasso(alpha=0.001)
model.fit(X_train,y_train)

print('Model score: ', model.score(X_test,y_test))

Model score:  0.20999576204720816


In [15]:
from sklearn.linear_model import Ridge

model = Ridge(alpha=1)
model.fit(X_train,y_train)

print('Model score: ', model.score(X_test,y_test))

Model score:  0.20916800667486724


# Logistic Regresion

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)
print('Model score: ', model.score(X_test,y_test))

Model score:  0.6445454545454545


# Máquinas de Soporte de Vectores (SVM)

In [17]:
from sklearn.svm import LinearSVC

model = LinearSVC(C=10, loss="hinge")
model.fit(X_train,y_train)
print('Model score: ', model.score(X_test,y_test))

from sklearn.svm import SVC

model = SVC(kernel="poly", degree=3, coef0=1, C=5)
model.fit(X_train,y_train)
print('Model score: ', model.score(X_test,y_test))


model = SVC(kernel="rbf", gamma=5, C=0.0001)
model.fit(X_train,y_train)
print('Model score: ', model.score(X_test,y_test))



Model score:  0.6709090909090909
Model score:  0.6836363636363636
Model score:  0.5968181818181818


# Random Forest

# KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3,
                             weights='uniform',
                             algorithm='ball_tree',p=1)
model.fit(X_train,y_train)
print('Model score: ', model.score(X_test,y_test))

model = KNeighborsClassifier(n_neighbors=7,
                             weights='distance',
                             algorithm='kd_tree',p=2)
model.fit(X_train,y_train)
print('Model score: ', model.score(X_test,y_test))

Model score:  0.645
Model score:  0.6404545454545455


# Comparativa

In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


Creamos los modelos con un diccionario y ejecutamos el entrenamiento de cada uno a través de un bucle for

In [20]:
models = {
    
    "                      Regresión Lineal": LinearRegression(),
    "               Regresión Lineal(Lasso)": Lasso(),
    "               Regresión Lineal(Ridge)": Ridge(),
    "                   Logistic Regression": LogisticRegression(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),    
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),    
    "                   K-Nearest Neighbors": KNeighborsClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                      Regresión Lineal trained.
               Regresión Lineal(Lasso) trained.
               Regresión Lineal(Ridge) trained.
                   Logistic Regression trained.




Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                   K-Nearest Neighbors trained.


Presentación de las métricas

In [21]:
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))

                      Regresión Lineal: 20.92%
               Regresión Lineal(Lasso): 18.48%
               Regresión Lineal(Ridge): 20.92%
                   Logistic Regression: 64.45%
Support Vector Machine (Linear Kernel): 60.05%
   Support Vector Machine (RBF Kernel): 68.50%
                         Random Forest: 66.73%
                     Gradient Boosting: 67.45%
                   K-Nearest Neighbors: 65.18%
