In [3]:
%%capture
!pip install catboost

# Objetivo 1: Predicción de Ventas y Rentabilidad Basadas en el Stock Disponible y Otras Variables

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.svm import SVC
# Module-level SVC classifier instance created with gamma="auto".
# NOTE(review): this sits in the middle of the import block and is not
# referenced in the visible part of the notebook - confirm it is still needed.
svc = SVC(gamma="auto")
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

In [3]:
# Folder that holds the raw CSV files. The location is machine-specific, so it
# is declared once here: edit this single line to run the notebook elsewhere.
DATA_DIR = '/home/insightlab/Documents/MercadoLibre/arquivos'

# All three files are read with the latin-1 encoding.
df_full_devices = pd.read_csv(DATA_DIR + '/full_devices.csv', encoding='latin1')
df_items_titles_test = pd.read_csv(DATA_DIR + '/items_titles_test.csv', encoding='latin1')
df_ofertas_relampago = pd.read_csv(DATA_DIR + '/ofertas_relampago.csv', encoding='latin1')

### Métricas de evaluación

In [None]:
def accuracy(y_true, y_pred):
    """Return the share of samples counted as true positives or true negatives.

    The four counts come from ``calcule_confusion_matrix_values``; the
    denominator is the number of rows in ``y_true`` (``y_true.shape[0]``).
    """
    tp, tn, _fp, _fn = calcule_confusion_matrix_values(y_true, y_pred)
    n_samples = y_true.shape[0]
    correct = tp + tn
    return correct / n_samples

def recall(y_true, y_pred):
    """Return the recall, TP / (TP + FN), of ``y_pred`` against ``y_true``.

    The counts come from ``calcule_confusion_matrix_values``.

    Returns 0.0 (as a NumPy float, the same type the division produces) when
    there are no actual positive samples, instead of dividing 0 by 0, which
    would yield ``nan`` together with a RuntimeWarning.
    """
    true_positive, true_negative, false_positive, false_negative = calcule_confusion_matrix_values(y_true, y_pred)
    actual_positives = true_positive + false_negative
    if actual_positives == 0:
        # Undefined recall: follow the common convention of reporting 0.
        return np.float64(0.0)
    recall_rate = true_positive / actual_positives
    return recall_rate

def precision(y_true, y_pred):
    """Return the precision, TP / (TP + FP), of ``y_pred`` against ``y_true``.

    The counts come from ``calcule_confusion_matrix_values``.

    Returns 0.0 (as a NumPy float, the same type the division produces) when
    no sample was predicted positive, instead of dividing 0 by 0, which would
    yield ``nan`` together with a RuntimeWarning.
    """
    true_positive, true_negative, false_positive, false_negative = calcule_confusion_matrix_values(y_true, y_pred)
    predicted_positives = true_positive + false_positive
    if predicted_positives == 0:
        # Undefined precision: follow the common convention of reporting 0.
        return np.float64(0.0)
    precision_rate = true_positive / predicted_positives
    return precision_rate

def f1score(y_true, y_pred):
    """Return the F1 score: the harmonic mean of precision and recall.

    Computed as 2 * (p * r) / (p + r), with ``p`` and ``r`` obtained from
    ``precision`` and ``recall``.

    Returns 0.0 (as a NumPy float) when precision and recall are both zero,
    instead of dividing 0 by 0. A ``nan`` in either input still propagates to
    the result.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    if p + r == 0:
        # Harmonic mean is undefined here; report 0 like the other metrics.
        return np.float64(0.0)
    return 2*(p*r)/(p+r)

### K-fold

In [None]:
def my_KFold(X, n_splits=3):
  """Yield ``(train_index, test_index)`` pairs for consecutive, unshuffled folds.

  The ``X.shape[0]`` sample positions are cut into ``n_splits`` contiguous
  test folds. When the sample count is not divisible by ``n_splits``, the
  first ``n_samples % n_splits`` folds receive one extra element each. For
  every fold, the training indices are all positions outside the test fold.
  """
  total = X.shape[0]
  base_size, extra = divmod(total, n_splits)
  every_position = np.arange(0, total)

  fold_start = 0
  for fold in range(0, n_splits):
    # The leading `extra` folds are one element larger than the rest.
    fold_size = base_size + 1 if fold < extra else base_size
    fold_stop = fold_start + fold_size

    held_out = np.arange(fold_start, fold_stop)
    kept = np.setdiff1d(every_position, held_out)

    yield kept, held_out

    # The next fold begins exactly where this one ended.
    fold_start = fold_stop

In [None]:
def train_model_with_KFold(model, X_train, y_train, n_splits=10):
  """Fit ``model`` on every K-fold split and score it on the held-out fold.

  Parameters
  ----------
  model : object exposing ``fit(X, y)`` and ``predict(X)``
      Refitted on each fold's training portion.
  X_train, y_train : objects indexable by an integer index array
      Indexed as ``X_train[train_index]``; presumably NumPy arrays.
  n_splits : int, default 10
      Number of folds requested from ``my_KFold``.

  Returns
  -------
  pandas.DataFrame
      One row per fold with the columns ``Fold`` (1-based), ``Accuracy``,
      ``Recall``, ``Precision`` and ``F1_Score``. The frame is empty, with
      the same columns, when no fold is produced.
  """
  columns = ["Fold", "Accuracy", "Recall", "Precision", "F1_Score"]
  rows = []

  folds = my_KFold(X_train, n_splits=n_splits)
  for fold_number, (train_index, validation_index) in enumerate(folds, start=1):
    model.fit(X_train[train_index], y_train[train_index])
    y_validation_predictions = model.predict(X_train[validation_index])
    y_validation = y_train[validation_index]
    rows.append([
        fold_number,
        accuracy(y_validation, y_validation_predictions),
        recall(y_validation, y_validation_predictions),
        precision(y_validation, y_validation_predictions),
        f1score(y_validation, y_validation_predictions),
    ])

  # Build the frame once from the collected rows. Concatenating onto an empty
  # frame inside the loop is quadratic, gives every row the index label 0 and
  # relies on pandas' deprecated handling of empty frames in concat.
  return pd.DataFrame(rows, columns=columns)

### Modelo de evaluación

In [None]:
def diplay_model_evaluation(y_train, y_train_predictions, y_test, y_test_predictions):
  """Display a table of accuracy, recall, precision and F1 for train and test.

  One row is produced for the training data ('Treino') and one for the test
  data ('Teste'), each computed from the corresponding true/predicted labels
  with ``accuracy``, ``recall``, ``precision`` and ``f1score``. The table is
  rendered with ``display``; nothing is returned.

  The misspelled function name is kept as-is so existing calls keep working.
  """
  columns = ["Dados", "Accuracy", "Recall", "Precision", "F1_Score"]
  splits = [
      ('Treino', y_train, y_train_predictions),
      ('Teste', y_test, y_test_predictions),
  ]

  # Same four metrics for both splits, so compute them in one loop instead of
  # two copy-pasted sections.
  rows = [
      [
          label,
          accuracy(y_actual, y_predicted),
          recall(y_actual, y_predicted),
          precision(y_actual, y_predicted),
          f1score(y_actual, y_predicted),
      ]
      for label, y_actual, y_predicted in splits
  ]

  # Build the frame once, rather than concatenating onto an empty frame, which
  # gives both rows the index label 0 and relies on deprecated pandas
  # behaviour.
  df = pd.DataFrame(rows, columns=columns)

  display(df)

### Matriz de confusión

In [None]:
def calcule_confusion_matrix_values(y_true, y_pred):
  """Count the confusion-matrix cells for labels encoded as 0 and 1.

  Parameters
  ----------
  y_true, y_pred : array-like
      True and predicted labels. 1-D inputs are treated as column vectors;
      for 2-D inputs only the first column is counted.

  Returns
  -------
  tuple
      ``(TP, TN, FP, FN)`` where
      TP = true label is 1 and prediction equals it,
      TN = true label is 0 and prediction equals it,
      FP = true label is 0 and prediction differs,
      FN = true label is 1 and prediction differs.
      Empty inputs give ``(0, 0, 0, 0)``.

  Raises
  ------
  ValueError
      If the two inputs cannot be broadcast to a common shape.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)

  # Shape adjustment: use (n_samples, 1) column vectors so a 1-D array and a
  # column vector are compared element by element rather than broadcast to
  # an (n, n) grid.
  if y_true.ndim == 1:
      y_true = y_true.reshape(y_true.shape[0], 1)

  if y_pred.ndim == 1:
      y_pred = y_pred.reshape(y_pred.shape[0], 1)

  # Fail loudly on incompatible shapes before comparing.
  y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

  # Boolean masks replace the per-element np.vectorize lambdas: same counts,
  # no Python-level loop, and empty inputs are handled.
  agree = y_true == y_pred
  actual_positive = y_true == 1
  actual_negative = y_true == 0

  # Count down the rows and keep the first column's total.
  TP = np.sum(agree & actual_positive, axis=0)[0]
  TN = np.sum(agree & actual_negative, axis=0)[0]
  FP = np.sum(~agree & actual_negative, axis=0)[0]
  FN = np.sum(~agree & actual_positive, axis=0)[0]

  return TP, TN, FP, FN