In [331]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np
from imblearn.over_sampling import SMOTE

In [332]:
# Carregar os dados
data = pd.read_csv('air_system_present_year.csv')

In [333]:
#verificando quantidade de caminhões com defeito
df_defeito = data['class'].value_counts()
print(df_defeito)

class
neg    15625
pos      375
Name: count, dtype: int64


In [334]:
X = data.drop(columns=['class'])
y = data['class']

In [335]:
# Substituindo 'na' por NaN para representar valores ausentes
X.replace('na', np.nan, inplace=True)

In [336]:
# Verificar valores ausentes
missing_values = X.isnull().sum()
print(missing_values)

aa_000        0
ab_000    12363
ac_000      926
ad_000     3981
ae_000      690
          ...  
ee_007      192
ee_008      192
ee_009      192
ef_000      762
eg_000      762
Length: 170, dtype: int64


In [337]:
#Taking care of missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X= imputer.transform(X)
X = pd.DataFrame(X)


In [338]:
# Visualizar as primeiras linhas do conjunto de dados com os valores nulos tratados
# 
print(X.head().to_string())

       0         1       2      3    4    5    6    7    8         9          10         11         12         13       14       15         16   17    18   19       20        21         22         23        24        25   26   27      28   29      30     31   32   33   34   35   36       37         38         39        40   41       42     43      44        45         46         47         48        49       50   51         52        53        54        55        56        57        58        59        60        61         62     63      64      65    66         67       68        69        70             71             72             73             74             75             76            77             78        79        80         81         82         83       84       85       86         87         88         89        90   91     92   93          94        95         96     97     98   99      100       101       102        103        104       105       106       107       1

In [339]:
#Visualizar alvo
print(y)

0        neg
1        neg
2        neg
3        neg
4        neg
        ... 
15995    neg
15996    neg
15997    neg
15998    neg
15999    neg
Name: class, Length: 16000, dtype: object


In [340]:
#Encoding the Dependent Variable
le = LabelEncoder()
y = le.fit_transform(y)

In [341]:
# Balanceando as classes com SMOTE
smote = SMOTE()
X_balanced, y_balanced = smote.fit_resample(X, y)
pd.DataFrame(y_balanced).value_counts()

0
0    15625
1    15625
Name: count, dtype: int64

In [342]:
#Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size = 0.2, random_state = 42)

In [343]:
print(X_train)

                0          1             2            3           4    \
17595  7.612265e+05   0.772065  3.580393e+08   439.498794    0.000000   
10368  5.969800e+04   0.000000  2.130706e+09    84.000000  230.000000   
26520  2.901470e+05   0.000000  1.426051e+03  1083.736524    0.000000   
17539  8.988779e+05   0.772065  0.000000e+00   439.498794    0.000000   
10610  8.600000e+01   0.000000  8.000000e+00     8.000000    2.000000   
...             ...        ...           ...          ...         ...   
29802  1.639210e+06  10.033759  3.580393e+08   414.111078    6.428347   
5390   4.980000e+02   0.772065  4.200000e+01    26.000000    0.000000   
860    1.341620e+05   0.000000  3.460000e+02   302.000000    0.000000   
15795  8.000000e+00   0.000000  0.000000e+00     0.000000    0.000000   
23654  7.800477e+05   0.772065  3.580393e+08   439.498794    2.327048   

              5            6              7             8             9    \
17595    0.000000     0.000000       0.000000 

In [344]:
print(y_test)

[1 1 0 ... 1 1 0]


In [345]:
# Definir os modelos
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=2000),
}


In [346]:
# Treinar e avaliar os modelos
for model_name, model in models.items():
    print(f"Treinando e avaliando o modelo: {model_name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Avaliar o desempenho do modelo
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Exibir as métricas de avaliação
    print(f"Acurácia: {accuracy:.2f}")
    print(f"Precisão: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print("\nRelatório de Classificação:\n", report)
    print("="*60)

Treinando e avaliando o modelo: Random Forest
Acurácia: 0.99
Precisão: 0.99
Recall: 1.00
F1-Score: 0.99

Relatório de Classificação:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      3180
           1       0.99      1.00      0.99      3070

    accuracy                           0.99      6250
   macro avg       0.99      0.99      0.99      6250
weighted avg       0.99      0.99      0.99      6250

Treinando e avaliando o modelo: Decision Tree
Acurácia: 0.99
Precisão: 0.99
Recall: 0.99
F1-Score: 0.99

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3180
           1       0.99      0.99      0.99      3070

    accuracy                           0.99      6250
   macro avg       0.99      0.99      0.99      6250
weighted avg       0.99      0.99      0.99      6250

Treinando e avaliando o modelo: Logistic Regression
Acurácia: 0.97
Precisão: 0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
