In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions


In [None]:
def get_distance(point1, point2, radius_km=6370):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    point1, point2 : sequence of float
        ``(latitude, longitude)`` pairs in decimal degrees.
    radius_km : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6370, the value this function has always used.

    Returns
    -------
    float
        Distance measured along the surface of the sphere.
    """
    from math import atan2, cos, radians, sin, sqrt

    lat1, lon1 = radians(point1[0]), radians(point1[1])
    lat2, lon2 = radians(point2[0]), radians(point2[1])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # `a` is kept as the first argument so that a NaN input still propagates.
    a = min(max(a, 0.0), 1.0)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return radius_km * c


# Modelos de classificação

## Importando os Dados

In [None]:
tb_atrasos = pd.read_csv("data/tb_atraso_olist.csv")


In [None]:
tb_atrasos.info()


In [None]:
# Count rows before and after dropping incomplete records to report the loss.
pre_limpeza = len(tb_atrasos)
tb_atrasos = tb_atrasos.dropna()
linhas_descartadas = pre_limpeza - len(tb_atrasos)
print(f"Num. linhas descartadas {linhas_descartadas}")


In [None]:
# Row-wise buyer-to-seller distance, computed with get_distance.
tb_atrasos["dist"] = tb_atrasos.apply(
    lambda linha: get_distance(
        point1=(linha["buyer_lat"], linha["buyer_long"]),
        point2=(linha["seller_lat"], linha["seller_long"]),
    ),
    axis="columns",
)


In [None]:
# Parse the three order timestamps into datetime columns.
for coluna_dt, coluna_origem in (
    ("dt_compra", "order_purchase_timestamp"),
    ("dt_real", "order_delivered_customer_date"),
    ("dt_prevista", "order_estimated_delivery_date"),
):
    tb_atrasos[coluna_dt] = pd.to_datetime(tb_atrasos[coluna_origem])

segundos_por_dia = 60 * 60 * 24

# Promised lead time: days between purchase and the estimated delivery date.
prazo_previsto = tb_atrasos["dt_prevista"] - tb_atrasos["dt_compra"]
tb_atrasos["dias_previstos"] = prazo_previsto.dt.total_seconds() / segundos_por_dia

# Delay: days between the estimated and the actual delivery date.
atraso = tb_atrasos["dt_real"] - tb_atrasos["dt_prevista"]
tb_atrasos["dias_atraso"] = atraso.dt.total_seconds() / segundos_por_dia

# Binary target: 1 when the delivery was at least one full day late, else 0.
tb_atrasos["atraso_bin"] = (tb_atrasos["dias_atraso"] >= 1).astype(int)

In [None]:
tb_atrasos.columns

In [None]:
tb_atrasos.head()


In [None]:
tb_atrasos.groupby("atraso_bin")[["dist", "dias_previstos"]].mean()

In [None]:
tb_atrasos[["dist", "dias_previstos"]].quantile([0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1])

In [None]:
# 99th-percentile cut-offs for both features.
q99_dist = np.quantile(tb_atrasos["dist"], q=0.99)
q99_dias = np.quantile(tb_atrasos["dias_previstos"], q=0.99)

# Keep only the rows strictly below both cut-offs.
abaixo_q99_dist = tb_atrasos["dist"].lt(q99_dist)
abaixo_q99_dias = tb_atrasos["dias_previstos"].lt(q99_dias)
mask_outlier = abaixo_q99_dist & abaixo_q99_dias
tb_atrasos_nout = tb_atrasos.loc[mask_outlier].copy()

In [None]:
tb_atrasos_nout.shape[0] - tb_atrasos.shape[0]

In [None]:
colunas_modelo = ["dist", "dias_previstos"]

# Standardise both features of `tb_atrasos`, then hold out 25% of the rows.
scaler = StandardScaler()
X_escalado = scaler.fit_transform(tb_atrasos[colunas_modelo])
X = pd.DataFrame(X_escalado, columns=colunas_modelo)
y = tb_atrasos["atraso_bin"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


O que foi feito de errado?

# Regressão Logística (Modelo Baseline)

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
log_fit = LogisticRegression()
log_fit.fit(X_train, y_train)


In [None]:
plot_decision_regions(
    np.array(X_train), np.array(y_train), log_fit, scatter_kwargs={"alpha": 1, "s": 1}
)


In [None]:
# Sweep `dist` over its training range with `dias_previstos` pinned at its
# training median.
dist_treino = X_train["dist"]
vetor_dist = np.linspace(dist_treino.min(), dist_treino.max(), num=200)
vetor_peso = [X_train["dias_previstos"].median()] * len(vetor_dist)
tb_simul_dist = pd.DataFrame({"dist": vetor_dist, "dias_previstos": vetor_peso})

# The last predict_proba column is stored as the delay probability.
prob_log_dist = log_fit.predict_proba(tb_simul_dist)
tb_simul_dist["prob_atraso_log"] = prob_log_dist[:, -1]


In [None]:
# Mirror sweep: vary `dias_previstos` over its training range with `dist`
# pinned at its training median.
dias_treino = X_train["dias_previstos"]
vetor_peso = np.linspace(dias_treino.min(), dias_treino.max(), num=200)
vetor_dist = [X_train["dist"].median()] * len(vetor_peso)
tb_simul_peso = pd.DataFrame({"dist": vetor_dist, "dias_previstos": vetor_peso})

# The last predict_proba column is stored as the delay probability.
prob_log_peso = log_fit.predict_proba(tb_simul_peso)
tb_simul_peso["prob_atraso_log"] = prob_log_peso[:, -1]


In [None]:
# One panel per simulated feature, both showing the logistic-regression output.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
ax_dist, ax_dias = ax

sns.scatterplot(data=tb_simul_dist, x="dist", y="prob_atraso_log", ax=ax_dist)
ax_dist.set_title("Impacto da Distância sobre Prob. Atraso")

sns.scatterplot(data=tb_simul_peso, x="dias_previstos", y="prob_atraso_log", ax=ax_dias)
ax_dias.set_title("Impacto da Estimativa sobre Prob. Atraso")

fig.suptitle("Simulações usando Modelo Baseline (Reg. Log.)")


In [None]:
# Results table for the test split: true labels plus one column per model.
pred_test = log_fit.predict(X_test)
tb_p_test = y_test.to_frame(name="y_real").assign(pred_reglog=pred_test)


In [None]:
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)


In [None]:
# Baseline metrics on the test split, printed one per line.
metricas_baseline = (
    ("Acurácia", accuracy_score),
    ("ROC-AUC Score", roc_auc_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
)
for rotulo, metrica in metricas_baseline:
    print(f"{rotulo}: {metrica(tb_p_test['y_real'], tb_p_test['pred_reglog'])}")


In [None]:
f1_score(y_train, log_fit.predict(X_train))

In [None]:
def calcular_erros(nome_coluna_pred, nome_coluna_score=None):
    """Print test-set classification metrics for one column of ``tb_p_test``.

    Reads the notebook-level ``tb_p_test`` frame and compares its ``y_real``
    column against the requested prediction column.

    Parameters
    ----------
    nome_coluna_pred : str
        Column of ``tb_p_test`` holding the predicted class labels.
    nome_coluna_score : str, optional
        Column of ``tb_p_test`` holding a continuous score for the positive
        class (e.g. a ``predict_proba`` column). ROC-AUC is a ranking metric,
        so when this is given it is computed from the score. When omitted,
        ROC-AUC is computed from ``nome_coluna_pred`` exactly as before, which
        reduces the curve to a single operating point.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    y_real = tb_p_test["y_real"]
    y_pred = tb_p_test[nome_coluna_pred]
    y_score = y_pred if nome_coluna_score is None else tb_p_test[nome_coluna_score]

    print(f"Acurácia: {accuracy_score(y_real, y_pred)}")
    print(f"ROC-AUC Score: {roc_auc_score(y_real, y_score)}")
    print(f"Precision: {precision_score(y_real, y_pred)}")
    print(f"Recall: {recall_score(y_real, y_pred)}")
    print(f"F1-Score: {f1_score(y_real, y_pred)}")


O que aconteceu com o modelo de regressão? Como poderíamos melhorá-lo?

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

![title](knn.png)

## Hiperparâmetros
* **n_neighbors** : Número de vizinhos
* **weights** : Metodologia de ponderação dos vizinhos (devo penalizar vizinhos mais distantes?).
* **metric** : Função de distância utilizada para 'escolher' vizinhos mais próximos.

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.DistanceMetric.html#sklearn.metrics.DistanceMetric
![title](metrica_distancia.png)

In [None]:
knn_fit = KNeighborsClassifier(n_neighbors=1)
knn_fit.fit(X_train, y_train)


In [None]:
plot_decision_regions(
    np.array(X_train), np.array(y_train), knn_fit, scatter_kwargs={"alpha": 0, "s": 1}
)


In [None]:
tb_simul_dist.columns

In [None]:
# Score both simulation grids with the fitted kNN model, passing only the two
# feature columns and keeping the last predict_proba column.
colunas_simul = ["dist", "dias_previstos"]
for tb_simul in (tb_simul_dist, tb_simul_peso):
    prob_knn = knn_fit.predict_proba(tb_simul[colunas_simul])
    tb_simul["prob_atraso_knn"] = prob_knn[:, -1]


In [None]:
# Overlay kNN and logistic-regression probabilities on each simulation grid.
# Left panel sweeps `dist`, right panel sweeps `dias_previstos`; the titles
# match the panels and each series is labelled so the overlay is readable.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.scatterplot(data=tb_simul_dist, x="dist", y="prob_atraso_knn", ax=ax[0], label="kNN")
sns.scatterplot(data=tb_simul_dist, x="dist", y="prob_atraso_log", ax=ax[0], label="Reg. Log.")
sns.scatterplot(data=tb_simul_peso, x="dias_previstos", y="prob_atraso_knn", ax=ax[1], label="kNN")
sns.scatterplot(data=tb_simul_peso, x="dias_previstos", y="prob_atraso_log", ax=ax[1], label="Reg. Log.")
ax[0].set_title("Impacto da Distância sobre Prob. Atraso")
ax[1].set_title("Impacto da Estimativa sobre Prob. Atraso")
for eixo in ax:
    # The y axis carries two models, so a column name would be misleading.
    eixo.set_ylabel("Prob. Atraso")
    eixo.legend()
fig.suptitle("Simulações: kNN vs. Modelo Baseline (Reg. Log.)");


In [None]:
tb_p_test["pred_knn"] = knn_fit.predict(X_test)
calcular_erros("pred_knn")

In [None]:
# Test-set F1 for k = 1..9 with the default (uniform) neighbour weighting.
for n_vizinhos in range(1, 10):
    knn_fit = KNeighborsClassifier(n_neighbors=n_vizinhos).fit(X_train, y_train)
    f1_teste = f1_score(y_test, knn_fit.predict(X_test))
    print(f"{n_vizinhos}-NN F1 = {f1_teste}")


In [None]:
# Test-set F1 for k = 1..19 with distance-weighted neighbours.
for n_vizinhos in range(1, 20):
    knn_fit = KNeighborsClassifier(n_neighbors=n_vizinhos, weights="distance").fit(
        X_train, y_train
    )
    f1_teste = f1_score(y_test, knn_fit.predict(X_test))
    print(f"{n_vizinhos}-NN F1 = {f1_teste}")


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive 5-fold search over k, neighbour weighting and distance metric,
# selecting on F1.
parameter_grid = dict(
    n_neighbors=range(1, 20),
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan", "chebyshev"],
)
knn_fit = KNeighborsClassifier()
knn_opt = GridSearchCV(knn_fit, parameter_grid, scoring="f1", cv=5)
knn_opt.fit(X_train, y_train)

In [None]:
knn_opt.best_estimator_

In [None]:
plot_decision_regions(
    np.array(X_train), np.array(y_train), knn_opt, scatter_kwargs={"alpha": 0, "s": 1}
);

In [None]:
tb_p_test["pred_knn"] = knn_opt.predict(X_test)
calcular_erros("pred_knn")

# SVM

In [None]:
from sklearn.svm import SVC

![title](svm.jpg)

## Hiperparâmetros

https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

* **C**: parâmetro de regularização (inverso da força de regularização): valores **menores** de C regularizam mais e **reduzem o overfitting** do modelo, simplificando a superfície de decisão
* **kernel**: função utilizada para representar superfícies não lineares
* **gamma**: o quão *fechadas* as curvas do kernel podem ser

In [None]:
# Fit the SVC on the first 5000 training rows only.
n_amostras_svc = 5000
X_train_svc = X_train.iloc[:n_amostras_svc]
y_train_svc = y_train.iloc[:n_amostras_svc]

svm_fit = SVC(class_weight="balanced")
svm_fit.fit(X_train_svc, y_train_svc)

In [None]:
plot_decision_regions(
    np.array(X_train_svc), np.array(y_train_svc), svm_fit, scatter_kwargs={"alpha": 0, "s": 1}
)

In [None]:
tb_p_test["pred_svm"] = svm_fit.predict(X_test)
calcular_erros("pred_svm")

In [None]:
### EXERCICIO
# MONTAR UM OTIMIZADOR DE HIPERPARAMETROS PARA O SVC ACIMA

# Árvores de Decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

## Hiperparâmetros

https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680

* **max_depth**: profundidade máxima da árvore
* **min_samples_leaf**: número mínimo de amostras em cada nó final (folha) da árvore
* **min_samples_split**: número mínimo de amostras que um nó interno precisa ter para poder ser dividido

### Fit ingênuo

In [None]:
# Unconstrained tree. The seed is fixed (same value as the train/test split)
# so the train and test scores below are identical on every re-run.
tree_fit = DecisionTreeClassifier(random_state=42)
tree_fit.fit(X_train, y_train)

In [None]:
f1_score(y_train, tree_fit.predict(X_train))

In [None]:
f1_score(y_test, tree_fit.predict(X_test))

O que aconteceu??

### Reduzindo Overfitting

Devemos reduzir o overfitting de uma árvore de decisão utilizando os hiperparâmetros dela.

In [None]:
# Constrained tree: depth capped at 3 and every leaf must hold at least 25%
# of the training rows. The seed is fixed so re-runs give the same tree.
tree_fit = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=0.25,
    class_weight="balanced",
    random_state=42,
)
tree_fit.fit(X_train, y_train)

In [None]:
plot_decision_regions(
    np.array(X_train), np.array(y_train), tree_fit, scatter_kwargs={"alpha": 0, "s": 1}
);

In [None]:
# Draw the fitted tree with named features and classes. The target is 0/1
# and class names are given in ascending class order. The trailing `;`
# suppresses the list of annotation objects that plot_tree returns.
fig, ax_arvore = plt.subplots(figsize=(10, 10))
plot_tree(
    tree_fit,
    feature_names=list(X_train.columns),
    class_names=["sem atraso", "atraso"],
    filled=True,
    ax=ax_arvore,
);

In [None]:
# Score both simulation grids with the fitted decision tree, passing only the
# two feature columns and keeping the last predict_proba column.
colunas_simul = ["dist", "dias_previstos"]
for tb_simul in (tb_simul_dist, tb_simul_peso):
    prob_tree = tree_fit.predict_proba(tb_simul[colunas_simul])
    tb_simul["prob_atraso_tree"] = prob_tree[:, -1]

In [None]:
# Overlay decision-tree and logistic-regression probabilities on each
# simulation grid. Each series is labelled and the figure title names both
# models being compared.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.scatterplot(data=tb_simul_dist, x="dist", y="prob_atraso_tree", ax=ax[0], label="Árvore de Decisão")
sns.scatterplot(data=tb_simul_dist, x="dist", y="prob_atraso_log", ax=ax[0], label="Reg. Log.")
sns.scatterplot(data=tb_simul_peso, x="dias_previstos", y="prob_atraso_tree", ax=ax[1], label="Árvore de Decisão")
sns.scatterplot(data=tb_simul_peso, x="dias_previstos", y="prob_atraso_log", ax=ax[1], label="Reg. Log.")
ax[0].set_title("Impacto da Distância sobre Prob. Atraso")
ax[1].set_title("Impacto da Estimativa sobre Prob. Atraso")
for eixo in ax:
    # The y axis carries two models, so a column name would be misleading.
    eixo.set_ylabel("Prob. Atraso")
    eixo.legend()
fig.suptitle("Simulações: Árvore de Decisão vs. Modelo Baseline (Reg. Log.)");

In [None]:
tb_p_test["pred_tree"] = tree_fit.predict(X_test)
calcular_erros("pred_tree")

In [None]:
# Candidate values, evenly spaced and truncated to integers.
max_depth = [int(x) for x in np.linspace(1, 50, 5)]
min_samples_leaf = [int(x) for x in np.linspace(1, 20, 3)]
min_samples_split = [int(x) for x in np.linspace(2, 40, 3)]
parameter_grid = {
    "max_depth": max_depth,
    "min_samples_leaf": min_samples_leaf,
    "min_samples_split": min_samples_split,
    "class_weight": ["balanced", None],
}
# Seed the base estimator so every candidate is fitted deterministically and
# the selected configuration does not change between re-runs.
tree_fit = DecisionTreeClassifier(random_state=42)
tree_opt = GridSearchCV(estimator=tree_fit, param_grid=parameter_grid, scoring="f1", cv=5)
tree_opt.fit(X_train, y_train)

In [None]:
tree_opt.best_estimator_

In [None]:
tb_p_test["pred_tree"] = tree_opt.predict(X_test)
calcular_erros("pred_tree")

# Ensembles

https://scikit-learn.org/stable/modules/ensemble.html

![title](ensemble.png)

## Bagging

In [None]:
from sklearn.ensemble import RandomForestClassifier

![title](rf.png)

### Hiperparâmetros

https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

* **n_estimators**: número de árvores de decisão
* **max_samples**: número de pontos em cada amostra


Parâmetros do classificador bagged
* **max_depth**: profundidade máxima da árvore (**CRITICO PARA ENSEMBLES!!**)
* **min_samples_leaf**: número mínimo de amostras em cada nó final (folha) da árvore
* **min_samples_split**: número mínimo de amostras que um nó interno precisa ter para poder ser dividido

In [None]:
# Forest of 1000 depth-8 trees. Bootstrap sampling is random, so the seed is
# fixed for reproducible metrics; n_jobs=-1 fits the trees on all cores.
rf_fit = RandomForestClassifier(
    n_estimators=1000,
    class_weight="balanced",
    max_depth=8,
    random_state=42,
    n_jobs=-1,
)
rf_fit.fit(X_train, y_train)

In [None]:
plot_decision_regions(
    np.array(X_train), np.array(y_train), rf_fit, scatter_kwargs={"alpha": 0., "s": 1}
);

In [None]:
tb_p_test["rf_fit"] = rf_fit.predict(X_test)
calcular_erros("rf_fit")

In [None]:
# Many very shallow trees (10000 trees of depth 2). The seed is fixed for
# reproducible metrics; n_jobs=-1 fits the trees on all cores.
rf_fit = RandomForestClassifier(
    n_estimators=10000,
    max_depth=2,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
rf_fit.fit(X_train, y_train)

In [None]:
plot_decision_regions(
    np.array(X_train), np.array(y_train), rf_fit, scatter_kwargs={"alpha": 0., "s": 1}
);

In [None]:
tb_p_test["rf_fit"] = rf_fit.predict(X_test)
calcular_erros("rf_fit")

## Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

https://machinelearningmastery.com/gentle-introduction-gradient-boosting-algorithm-machine-learning/

![title](boosting.png)

### Hiperparâmetros

https://medium.com/all-things-ai/in-depth-parameter-tuning-for-gradient-boosting-3363992e9bae

* **learning_rate**: reduz a contribuição de cada modelo para reduzir overfitting
* **n_estimators**: número de modelos em série

Parâmetros do classificador boosted

* **max_depth**: profundidade máxima da árvore (**CRITICO PARA ENSEMBLES!!**)
* **min_samples_leaf**: número mínimo de amostras em cada nó final (folha) da árvore
* **min_samples_split**: número mínimo de amostras que um nó interno precisa ter para poder ser dividido

In [None]:
# 100 boosting stages of depth-8 trees, with a fixed seed so re-runs give the
# same model.
gb_fit = GradientBoostingClassifier(n_estimators=100, max_depth=8, random_state=42)
gb_fit.fit(X_train, y_train)

In [None]:
plot_decision_regions(
    np.array(X_train), np.array(y_train), gb_fit, scatter_kwargs={"alpha": 0., "s": 1}
);

In [None]:
tb_p_test["pred_gb"] = gb_fit.predict(X_test)
calcular_erros("pred_gb")

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
learning_rate = np.linspace(0.05, 0.8, 20)
n_estimators = [int(x) for x in np.linspace(200, 1000, 20)]
max_depth = range(1, 6)
parameter_grid = {
    "max_depth": max_depth,
    "n_estimators": n_estimators,
    "learning_rate": learning_rate,
}
# Only 5 of the 2000 combinations are sampled, so both the sampler and the
# estimator are seeded; otherwise each run would evaluate different
# candidates and report a different "best" model.
gb_fit = GradientBoostingClassifier(random_state=42)
gb_opt = RandomizedSearchCV(
    estimator=gb_fit,
    param_distributions=parameter_grid,
    scoring="f1",
    cv=5,
    n_iter=5,
    random_state=42,
)
gb_opt.fit(X_train, y_train)

In [None]:
# Show the search results as a ranked table instead of a raw dict of arrays.
(
    pd.DataFrame(gb_opt.cv_results_)
    .sort_values("rank_test_score")
    [["params", "mean_test_score", "std_test_score", "rank_test_score", "mean_fit_time"]]
)