In [3]:
import pandas as pd
import numpy as np

# importando libs que usarei na criação de ambos os modelos
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Classificação
Vamos começar com um modelo de classificação, usarei um dataset que contêm dados de transações que são, ou não, fraudulentas.

In [4]:
from sklearn.tree import DecisionTreeClassifier  # modelo de classificação 
from sklearn.metrics import (
    # métricas usadas em modelos de classificação
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

In [9]:
credit_card_df = pd.read_csv('synthetic_financial_data.csv')
credit_card_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   transaction_id           10000 non-null  int64  
 1   customer_id              10000 non-null  int64  
 2   merchant_id              10000 non-null  int64  
 3   amount                   10000 non-null  float64
 4   transaction_time         10000 non-null  object 
 5   is_fraudulent            10000 non-null  int64  
 6   card_type                10000 non-null  object 
 7   location                 10000 non-null  object 
 8   purchase_category        10000 non-null  object 
 9   customer_age             10000 non-null  int64  
 10  transaction_description  10000 non-null  object 
dtypes: float64(1), int64(5), object(5)
memory usage: 859.5+ KB


In [7]:
credit_card_df.select_dtypes(include='object').nunique()

transaction_time           10000
card_type                      4
location                      50
purchase_category              6
transaction_description      100
dtype: int64

### Pre-Processamento

Vou excluir algumas colunas não importantes para nosso modelo, como estou focando na parte final do nosso modelo, as métricas, não usarei nada mirabolante, como feature engineering ou coisa do tipo.

In [10]:
columns_to_drop = [
    'transaction_id',
    'customer_id',
    'merchant_id',
    'transaction_description',
    'location',
]

credit_card_df.drop(columns_to_drop, axis=1, inplace=True)

In [11]:
credit_card_df.select_dtypes(include='object').nunique()

transaction_time     10000
card_type                4
purchase_category        6
dtype: int64

In [14]:
credit_card_df['transaction_time'] = pd.to_datetime(credit_card_df['transaction_time'])

credit_card_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   amount             10000 non-null  float64       
 1   transaction_time   10000 non-null  datetime64[ns]
 2   is_fraudulent      10000 non-null  int64         
 3   card_type          10000 non-null  object        
 4   purchase_category  10000 non-null  object        
 5   customer_age       10000 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 468.9+ KB


### Criação dos Pipelines

In [26]:
target = 'is_fraudulent'
object_cols = [col for col in credit_card_df.columns if credit_card_df[col].dtypes == 'object' and credit_card_df[col].name != target]
numeric_cols = [col for col in credit_card_df.columns if credit_card_df[col].dtypes in ['int64', 'float64'] and credit_card_df[col].name != target]
datetime_cols = [col for col in credit_card_df.columns if credit_card_df[col].dtypes == 'datetime64[ns]' and credit_card_df[col].name != target]

In [27]:
X = credit_card_df[object_cols + numeric_cols + datetime_cols]
y = credit_card_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [32]:
object_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('OH', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median'))
])

column_transform = ColumnTransformer([
    ('object_pipeline', object_pipe, object_cols),
    ('numeric_pipeline', numeric_pipeline, numeric_cols),
])

In [34]:
X_train_transform = column_transform.fit_transform(X_train)
X_test_transform = column_transform.transform(X_test)

### Criando o Modelo

In [36]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transform, y_train)

In [38]:
clf_y_pred = clf.predict(X_test_transform)
clf_y_pred

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

### Calculando Métricas

In [41]:
print(f"Accuracy {accuracy_score(y_pred=clf_y_pred, y_true=y_test)}")
print(f"Precision {precision_score(y_pred=clf_y_pred, y_true=y_test)}")
print(f"Recall {recall_score(y_pred=clf_y_pred, y_true=y_test)}")
print(f"F1-Score {f1_score(y_pred=clf_y_pred, y_true=y_test)}")

Accuracy 0.4972
Precision 0.5047095761381476
Recall 0.5066981875492513
F1-Score 0.5057019268580417


Podemos ver que nossa Accurary está em torno de 49%, a taxa de acertos foi abaixo da metade, o Precision - nos da uma ideia dos Falsos Positivos - tá em torno de 50%, ou seja, metade dos nossos dados foi previsto de forma errada, foram de falsos positivos, a mesma coisa para o recall, e como o F1 é a junção dos dois, o resultado é semelhante.

# Regressão
Usaremos um Dataset de vinhos, e vamos usar para prever a qualidade de um determinado vinho baseado das suas condições.

In [45]:
from sklearn.linear_model import LinearRegression  # modelo que usaremos
from sklearn.metrics import (
    # importante métricas
    mean_squared_error,
    mean_absolute_error,
)

In [43]:
wine_df = pd.read_csv('winequality-red.csv')
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


Como esse Dataset não tem dados faltantes e nem categóricos, não irei perder tempo fazendo pipelines para tratar os dados.

In [44]:
X = wine_df.drop('quality', axis=1)
y = wine_df['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
lrg = LinearRegression()
lrg.fit(X_train, y_train)

In [48]:
lrg_y_pred = lrg.predict(X_test)
lrg_y_pred

array([6.27767399, 6.34311776, 6.22656082, 6.16815286, 5.33516373,
       5.52747452, 5.49141438, 5.71528287, 4.23618184, 5.997418  ,
       5.1649321 , 6.61123436, 5.70930404, 5.78137731, 4.90804978,
       6.20333624, 4.87942756, 5.00097028, 5.38482906, 4.99258116,
       6.62895555, 4.99197316, 5.94857716, 6.16986294, 5.81586128,
       5.03466621, 6.24449622, 6.09696474, 6.19603406, 5.26811531,
       5.15882175, 5.41941072, 5.18847913, 6.19335595, 4.9692581 ,
       4.99464769, 6.07146115, 5.76344231, 5.53923516, 5.37239942,
       5.36853019, 5.42683571, 5.76885916, 5.73168876, 5.37853955,
       5.23009979, 4.76029036, 5.54603664, 5.63177687, 5.98789492,
       5.63310264, 5.23216039, 6.29155162, 6.55288904, 6.2573297 ,
       6.05063668, 5.0002599 , 5.43271097, 5.27457275, 5.83102261,
       5.4094192 , 5.33620748, 5.87332156, 4.97318222, 5.08427667,
       4.79430263, 5.41842205, 5.19730801, 6.33851934, 5.23965251,
       5.64806736, 5.67324638, 6.09805516, 5.15720813, 5.73274

In [50]:
print(f"MSE: {mean_squared_error(y_pred=lrg_y_pred, y_true=y_test)}")
print(f"MAE: {mean_absolute_error(y_pred=lrg_y_pred, y_true=y_test)}")
print(f"RMSE: {(mean_squared_error(y_pred=lrg_y_pred, y_true=y_test)) ** 1/2}")  # para calcular o RMSE, apenas tiramos a raiz do MSE

MSE: 0.3997353324303127
MAE: 0.49649435487371335
RMSE: 0.19986766621515634
