# tratamento de dados categóricos

In [3]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from ucimlrepo import fetch_ucirepo
car_evaluation = fetch_ucirepo(id=19)
features = car_evaluation.data.features
targets = car_evaluation.data.targets


df_car = pd.concat([features, targets], axis=1)

ordinal_encoder = OrdinalEncoder()

df_car_encoded = df_car.copy()
df_car_encoded[df_car.columns] =ordinal_encoder.fit_transform(df_car[df_car.columns])
df_car_encoded

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3.0,3.0,0.0,0.0,2.0,1.0,2.0
1,3.0,3.0,0.0,0.0,2.0,2.0,2.0
2,3.0,3.0,0.0,0.0,2.0,0.0,2.0
3,3.0,3.0,0.0,0.0,1.0,1.0,2.0
4,3.0,3.0,0.0,0.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...
1723,1.0,1.0,3.0,2.0,1.0,2.0,1.0
1724,1.0,1.0,3.0,2.0,1.0,0.0,3.0
1725,1.0,1.0,3.0,2.0,0.0,1.0,2.0
1726,1.0,1.0,3.0,2.0,0.0,2.0,1.0


## Treinamento 

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
Y = df_car_encoded["class"]
X = df_car_encoded.loc[:, df_car_encoded.columns != "class"]

x_train, x_test, y_train, y_test = train_test_split(X, Y,test_size=0.3)
r_forest = RandomForestClassifier()


In [5]:
#Validação cruzada
scores = cross_val_score(r_forest, x_train, y_train, scoring="accuracy",cv=10)
scores


array([0.96694215, 0.98347107, 0.95041322, 0.9338843 , 0.98347107,
       0.98347107, 0.95867769, 0.97520661, 0.95041322, 0.98333333])

In [6]:
r_forest.fit(x_train, y_train)

some_data = x_test.iloc[:5]
labels = y_test[:5]

print("Previsões: ",r_forest.predict(some_data))
print("labels: ", labels.values)


Previsões:  [2. 2. 2. 2. 3.]
labels:  [2. 2. 2. 2. 3.]


## Avaliando e otimizando o modelo

In [22]:
import plotly.graph_objects as go
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 6, 8, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 300]},
             scoring='accuracy')

In [8]:
final_model = grid_search.best_estimator_
final_model_predictions = final_model.predict(x_test)

In [19]:

accuracy = accuracy_score(y_test, final_model_predictions)
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Test accuracy: {accuracy:.4f}")
print(classification_report(y_test, final_model_predictions))

Best parameters found: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 300}
Test accuracy: 0.9634
              precision    recall  f1-score   support

         0.0       0.91      0.93      0.92       108
         1.0       0.88      0.64      0.74        22
         2.0       0.99      0.99      0.99       373
         3.0       0.89      1.00      0.94        16

    accuracy                           0.96       519
   macro avg       0.91      0.89      0.90       519
weighted avg       0.96      0.96      0.96       519



In [20]:
feature_importances = final_model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

fig = go.Figure(go.Bar(
    x=importance_df['Importance'],
    y=importance_df['Feature'],
    orientation='h'
))
fig.update_layout(title='Feature Importances', xaxis_title='Importance', yaxis_title='Feature')
fig.show()

In [25]:

# Criar o gráfico de dispersão
fig = go.Figure()

# Adicionar os valores reais
fig.add_trace(go.Scatter(y=y_test.values, mode='lines', name='Real Values'))

# Adicionar as previsões do modelo
fig.add_trace(go.Scatter(y=final_model_predictions, mode='lines', name='Predictions'))

# Atualizar layout do gráfico
fig.update_layout(
    title='Real Values vs Predictions',
    xaxis_title='Index',
    yaxis_title='Class',
    legend_title='Legend',
   
)

fig.show()