In [16]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import warnings

warnings.filterwarnings("ignore")


pg_host = 'localhost'
pg_port = 15432
pg_user = 'postgres'
pg_pass = 'postgres'
SEED = 20
np.random.seed(SEED)

engine = create_engine(f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/postgres')

df_data = pd.read_sql_table('vendas_carros', engine)
df_data.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano,modelo
0,30941.02,0,24,35085.22134,31
1,40557.96,0,26,12622.05362,33
2,89627.5,1,18,11440.79806,22
3,95276.14,1,9,43167.32682,14
4,117384.68,0,10,12770.1129,16


### Baseline Dummy Classifier

In [17]:
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GroupKFold, cross_validate, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

def print_metrics (x):
    return f"Accuracy: {round(x['test_score'].mean(), 2)} | std: {round(x['test_score'].std(), 2)}"

df_X = df_data[["preco", "idade_do_modelo", "km_por_ano"]]
df_Y = df_data["vendido"]


train_x, test_x, train_y, test_y = train_test_split(df_X, df_Y, test_size=0.25, stratify=df_Y)


dummy = DummyClassifier()
result = cross_validate(dummy, df_X, df_Y, cv=10, return_train_score=False)
print("Dummy Classifier: ", print_metrics(result))

model_dt = DecisionTreeClassifier(max_depth=2)
result_model_dt = cross_validate(model_dt, df_X, df_Y, cv=10, return_train_score=False)
print("Decision Tree: ", print_metrics(result_model_dt))

cv = GroupKFold(n_splits=10)
model_dt_kfold = DecisionTreeClassifier(max_depth=2)
result_modelo_dec_tree_kfold = cross_validate(model_dt_kfold, df_X, df_Y, cv=cv, groups=df_data.modelo, return_train_score=False)
print("Decision Tree KFold: ", print_metrics(result_modelo_dec_tree_kfold))

Dummy Classifier:  Accuracy: 0.58 | std: 0.0
Decision Tree:  Accuracy: 0.76 | std: 0.01
Decision Tree KFold:  Accuracy: 0.76 | std: 0.02


## Refinando Modelos
### 1 - Decision Tree Classifier

Parâmetros de interesse:
- `criterion`: função de avaliação da qualidade da divisão.
- `splitter`: estratégia utilizada para dividir o nó de decisão.
- `max_depth`: profundidade máxima da árvore.
- `min_samples_split`: número mínimo de amostras para dividir um nó.
- `min_samples_leaf`: número mínimo de amostras para ser um nó folha.


In [18]:
parametros = {
    'max_depth': [3, 5],
    'min_samples_leaf': [32, 64, 128],
    'min_samples_split': [32, 64, 128],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(DecisionTreeClassifier(), parametros, cv=GroupKFold(n_splits=10), n_jobs=-1)
grid_search.fit(df_X, df_Y, groups=df_data.modelo)
resultados = pd.DataFrame(grid_search.cv_results_)
# resultados.sort_values("mean_test_score", ascending=False).head()
better_parms = grid_search.best_params_
model_dt = DecisionTreeClassifier(**better_parms)
result_model_dt = cross_validate(model_dt, df_X, df_Y, cv=10, return_train_score=False)
print("Decision Tree: ", print_metrics(result_model_dt))

Decision Tree:  Accuracy: 0.79 | std: 0.01


### 2 - Classificador Linear SVC (Support Vector Classifier)

Parâmetros de interesse:
- `C`: parâmetro de regularização.
- `loss`: função de perda.
- `penalty`: norma de regularização.
- `dual`: formulação primal ou dual.
- `tol`: critério de parada.

In [19]:
hyper_parms_linear_svc = {
    'penalty': ['l1', 'l2'],
    'dual': [False], 
    'loss': ['squared_hinge'],
    'C': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 50.0, 100, 1000.0],
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'max_iter': [1000]
}
grid_search = GridSearchCV(LinearSVC(), hyper_parms_linear_svc, cv=GroupKFold(n_splits=10), n_jobs=-1)
grid_search.fit(df_X, df_Y, groups=df_data.modelo)
resultados = pd.DataFrame(grid_search.cv_results_)
# resultados.sort_values("mean_test_score", ascending=False).head()
better_parms = grid_search.best_params_
model_linear_svc = LinearSVC(**better_parms)
result_model_linear_svc = cross_validate(model_linear_svc, df_X, df_Y, cv=10, return_train_score=False)
print("Linear SVC: ", print_metrics(result_model_linear_svc))

Linear SVC:  Accuracy: 0.71 | std: 0.01


## 3 - Classificador SVC (Support Vector Classifier)

- O SVC é um classificador binário apropriado para classificação de dados não lineares. 
- Ele pode ser visto como uma generalização do SVM para o caso não linear. 
- O SVC é um classificador que utiliza uma função de kernel para transformar o espaço de entrada em um espaço de maior dimensionalidade, onde os dados podem ser separados por um hiperplano.

In [20]:


hyper_parms_svc = [
    {
        'svc__kernel': ['linear', 'rbf'],
    },
]

pipe_knn = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])

grid_search = GridSearchCV(
                            estimator=pipe_knn, 
                            param_grid=hyper_parms_svc, 
                            cv=GroupKFold(n_splits=10), 
                            n_jobs=-1
)

grid_search.fit(df_X, df_Y, groups=df_data.modelo)
resultados = pd.DataFrame(grid_search.cv_results_)
better_parms = {k.replace('svc__', ''): v for k, v in grid_search.best_params_.items()}
model_svc = SVC(**better_parms)
result_model_svc = cross_validate(model_svc, df_X, df_Y, cv=10, return_train_score=False)
print("SVC: ", print_metrics(result_model_svc))


SVC:  Accuracy: 0.77 | std: 0.01


### Classificador KNN (K-Nearest Neighbors)

In [22]:

parms_grid_knn = {
    'knn__n_neighbors': np.linspace(3, 15, 5, dtype=int),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
    }

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe_knn = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])

knn_grid_search = GridSearchCV(
                                        estimator=pipe_knn, 
                                        param_grid=parms_grid_knn, 
                                        cv=cv,
                                        scoring='recall',
                                        verbose=1,
                                        n_jobs=-1
)

knn_grid_search.fit(df_X, df_Y)
resultados = pd.DataFrame(knn_grid_search.cv_results_)
# resultados.sort_values("mean_test_score", ascending=False).head()
better_parms =   {k.replace('knn__', ''): v for k, v in knn_grid_search.best_params_.items()}
model_knn = KNeighborsClassifier(**better_parms)
result_model_knn = cross_validate(model_knn, df_X, df_Y, cv=10, return_train_score=False)
print("KNN: ", print_metrics(result_model_knn))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
KNN:  Accuracy: 0.76 | std: 0.01


## Classificador de Regressão Logística

In [25]:
from sklearn.linear_model import LogisticRegression


max_iter =np.linspace(100, 300, 5, dtype=int)
c = [0.001, 0.01, 0.1, 1, 10]

parms_grid_reg_log = [
    {
    'logisticregression__solver': ['newton-cg', 'lbfgs'],
    'logisticregression__penalty': ['l2'],
    'logisticregression__max_iter': max_iter,
    'logisticregression__C': c 
    },
    {
    'logisticregression__solver': ['liblinear'],
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__max_iter': max_iter,
    'logisticregression__C': c 
    }
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe_logistic_regression = Pipeline([('scaler', StandardScaler()), ('logisticregression', LogisticRegression())])
log_reg_grid_search = GridSearchCV(
                                        estimator=pipe_logistic_regression, 
                                        param_grid=parms_grid_reg_log, 
                                        cv=cv,
                                        scoring='recall',
                                        verbose=1,
                                        n_jobs=-1
)

log_reg_grid_search.fit(df_X, df_Y)
# print(log_reg_grid_search.best_params_)
df_reg_log_results =  pd.DataFrame(log_reg_grid_search.cv_results_)
#df_reg_log_results.loc[[log_reg_grid_search.best_index_]]
better_parms =   {k.replace('logisticregression__', ''): v for k, v in log_reg_grid_search.best_params_.items()}
model_knn = LogisticRegression(**better_parms)
result_model_knn = cross_validate(model_knn, df_X, df_Y, cv=10, return_train_score=False)
print("Logistic Regression: ", print_metrics(result_model_knn))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Logistic Regression:  Accuracy: 0.65 | std: 0.01
