In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import mlflow

In [2]:
df = pd.read_csv("carros.csv", sep=';')
df.head()

Unnamed: 0,Consumo,Cilindros,Cilindradas,RelEixoTraseiro,Peso,Tempo,TipoMotor,Transmissao,Marchas,Carburadors,HP
0,21,6,160,39,262,1646,0,1,4,4,110
1,21,6,160,39,2875,1702,0,1,4,4,110
2,228,4,108,385,232,1861,1,1,4,1,93
3,214,6,258,308,3215,1944,1,0,3,1,110
4,187,8,360,315,344,1702,0,0,3,2,175


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Consumo          32 non-null     int64
 1   Cilindros        32 non-null     int64
 2   Cilindradas      32 non-null     int64
 3   RelEixoTraseiro  32 non-null     int64
 4   Peso             32 non-null     int64
 5   Tempo            32 non-null     int64
 6   TipoMotor        32 non-null     int64
 7   Transmissao      32 non-null     int64
 8   Marchas          32 non-null     int64
 9   Carburadors      32 non-null     int64
 10  HP               32 non-null     int64
dtypes: int64(11)
memory usage: 2.9 KB


In [4]:
df['Consumo']

0      21
1      21
2     228
3     214
4     187
5     181
6     143
7     244
8     228
9     192
10    178
11    164
12    173
13    152
14    104
15    104
16    147
17    324
18    304
19    339
20    215
21    155
22    152
23    133
24    192
25    273
26     26
27    304
28    158
29    197
30     15
31    214
Name: Consumo, dtype: int64

In [5]:
lb = ['baixo', 'médio', 'alto']
df.drop(columns='Consumo')
df['Consumo']= pd.qcut(df['Consumo'], q=3, labels=lb)
df.sort_values(by='Consumo')

Unnamed: 0,Consumo,Cilindros,Cilindradas,RelEixoTraseiro,Peso,Tempo,TipoMotor,Transmissao,Marchas,Carburadors,HP
0,baixo,6,160,39,262,1646,0,1,4,4,110
26,baixo,4,1203,443,214,167,0,1,5,2,91
23,baixo,8,350,373,384,1541,0,0,3,4,245
22,baixo,8,304,315,3435,173,0,0,3,2,150
16,baixo,8,440,323,5345,1742,0,0,3,4,230
30,baixo,8,301,354,357,146,0,1,5,8,335
14,baixo,8,472,293,525,1798,0,0,3,4,205
13,baixo,8,2758,307,378,18,0,0,3,3,180
15,baixo,8,460,3,5424,1782,0,0,3,4,215
6,baixo,8,360,321,357,1584,0,0,3,4,245


In [6]:
X = df.drop(columns = 'Consumo')
y = df['Consumo']
X.head()

Unnamed: 0,Cilindros,Cilindradas,RelEixoTraseiro,Peso,Tempo,TipoMotor,Transmissao,Marchas,Carburadors,HP
0,6,160,39,262,1646,0,1,4,4,110
1,6,160,39,2875,1702,0,1,4,4,110
2,4,108,385,232,1861,1,1,4,1,93
3,6,258,308,3215,1944,1,0,3,1,110
4,8,360,315,344,1702,0,0,3,2,175


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.25,
                                                    random_state = 42)

In [8]:
rf = RandomForestClassifier()

In [9]:
rf.fit(X_train, y_train)

In [10]:
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

In [11]:
print('Dados de treino:\n\n',classification_report(y_train, y_train_pred),'\n')
print('Dados de teste:\n\n',classification_report(y_test, y_test_pred))

Dados de treino:

               precision    recall  f1-score   support

        alto       1.00      1.00      1.00         8
       baixo       1.00      1.00      1.00         9
       médio       1.00      1.00      1.00         7

    accuracy                           1.00        24
   macro avg       1.00      1.00      1.00        24
weighted avg       1.00      1.00      1.00        24
 

Dados de teste:

               precision    recall  f1-score   support

        alto       1.00      1.00      1.00         3
       baixo       0.50      1.00      0.67         2
       médio       1.00      0.33      0.50         3

    accuracy                           0.75         8
   macro avg       0.83      0.78      0.72         8
weighted avg       0.88      0.75      0.73         8



In [12]:
df_importances = pd.DataFrame(abs(rf.feature_importances_), X.columns, columns=['Score']).sort_values(by = ['Score'], ascending = False)
df_importances

Unnamed: 0,Score
Peso,0.178091
TipoMotor,0.154403
Cilindros,0.134582
HP,0.114359
Tempo,0.1117
Carburadors,0.11068
RelEixoTraseiro,0.091588
Cilindradas,0.063506
Marchas,0.022793
Transmissao,0.018298


In [13]:
idx_importances = df_importances.reset_index()
idx_importances['index'][:5]

0         Peso
1    TipoMotor
2    Cilindros
3           HP
4        Tempo
Name: index, dtype: object

In [14]:
columns = [i for i in idx_importances['index'][:5]]
X_imp = df[columns]
y = df['Consumo']
X.head()

Unnamed: 0,Cilindros,Cilindradas,RelEixoTraseiro,Peso,Tempo,TipoMotor,Transmissao,Marchas,Carburadors,HP
0,6,160,39,262,1646,0,1,4,4,110
1,6,160,39,2875,1702,0,1,4,4,110
2,4,108,385,232,1861,1,1,4,1,93
3,6,258,308,3215,1944,1,0,3,1,110
4,8,360,315,344,1702,0,0,3,2,175


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_imp, y,
                                                    test_size = 0.25,
                                                    random_state = 42)

In [16]:
logit = LogisticRegression(C=0.5, penalty = 'l2')

In [17]:
std = StandardScaler()

std.fit_transform(X_train)
std.transform(X_test)

array([[-0.65475191, -0.84515425, -0.14189513,  0.52169553, -1.06627771],
       [ 3.18170543, -0.84515425,  0.99326592,  1.18416603,  0.91846879],
       [ 2.00475456, -0.84515425,  0.99326592,  0.52169553,  0.82453795],
       [-0.84482314,  1.18321596, -1.27705618, -1.28353661,  1.11974917],
       [-0.62642757,  1.18321596, -1.27705618, -0.80324549, -0.97600652],
       [-0.60481162,  1.18321596, -0.14189513, -0.33951614, -1.03212104],
       [-0.59512172, -0.84515425,  0.99326592,  3.17157756, -1.07725664],
       [ 0.58108378,  1.18321596, -1.27705618, -1.28353661, -1.02480176]])

In [18]:
logit.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)

In [20]:
print('Dados de treino:\n\n',classification_report(y_train, y_train_pred),'\n')
print(55 * '=','\n')
print('Dados de teste:\n\n',classification_report(y_test, y_test_pred))

Dados de treino:

               precision    recall  f1-score   support

        alto       0.89      1.00      0.94         8
       baixo       1.00      0.67      0.80         9
       médio       0.67      0.86      0.75         7

    accuracy                           0.83        24
   macro avg       0.85      0.84      0.83        24
weighted avg       0.87      0.83      0.83        24
 


Dados de teste:

               precision    recall  f1-score   support

        alto       1.00      0.67      0.80         3
       baixo       0.67      1.00      0.80         2
       médio       0.67      0.67      0.67         3

    accuracy                           0.75         8
   macro avg       0.78      0.78      0.76         8
weighted avg       0.79      0.75      0.75         8



In [21]:
DB_URI = 'sqlite:///mlrunsdb.db'
mlflow.set_tracking_uri(DB_URI)

mlflow.set_experiment(experiment_name='Projeto Carros')

<Experiment: artifact_location='./mlruns/3', creation_time=1668273612271, experiment_id='3', last_update_time=1668273612271, lifecycle_stage='active', name='Projeto Carros', tags={}>

In [22]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score)


def get_metrics(y_test:list, y_pred:list) -> list:
    ac = accuracy_score(y_test, y_pred)
    pr = precision_score(y_test, y_pred, average='weighted')
    rc = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return [ac, pr, rc, f1]

In [23]:
with mlflow.start_run(
    run_name='API',
    description='Projeto Carros com logit',
) as model_run:


    model = Pipeline([('scaler',StandardScaler()),('logit', LogisticRegression(C=0.5, penalty = 'l2'))])
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    ac, pr, rc, f1 = get_metrics(y_test, y_pred)

    params = {
            "c":0.5,
            "penalty":"l2",
            "features":len(X_train.columns),
            "size_train_dataset": len(X_train),
            "size_test_dataset": len(X_test)}

    metrics = {
        "acuracia":ac,
        "precision":pr,
        "recall":rc,
        "f1":f1}

    mlflow.log_params(params=params)
    mlflow.log_metrics(metrics=metrics)

    mlflow.sklearn.log_model(model, "model")

In [24]:
import joblib

file_name = "logit_model_v1.pk1"

joblib.dump(model, file_name)

['logit_model_v1.pk1']