# tratamento de dados categóricos

In [41]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from ucimlrepo import fetch_ucirepo
car_evaluation = fetch_ucirepo(id=19)
features = car_evaluation.data.features
targets = car_evaluation.data.targets


df_car = pd.concat([features, targets], axis=1)


feature_columns = df_car.columns[:-1]
ordinal_encoder_features = OrdinalEncoder()
df_car_encoded_features = df_car[feature_columns].copy()
df_car_encoded_features[df_car_encoded_features.columns] = ordinal_encoder_features.fit_transform(df_car[feature_columns])
df_car_encoded_features


ordinal_encoder_target = OrdinalEncoder()
df_car['class'] = ordinal_encoder_target.fit_transform(df_car[['class']])


## Treinamento 

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

Y = df_car['class']
X = df_car_encoded_features

x_train, x_test, y_train, y_test = train_test_split(X, Y,test_size=0.3)
r_forest = RandomForestClassifier()


In [43]:
#Validação cruzada
scores = cross_val_score(r_forest, x_train, y_train, scoring="accuracy",cv=10)
scores


array([0.9338843 , 0.96694215, 0.94214876, 0.96694215, 0.96694215,
       0.95867769, 0.97520661, 0.96694215, 0.96694215, 0.96666667])

In [44]:
r_forest.fit(x_train, y_train)

some_data = x_test.iloc[:5]
labels = y_test[:5]

print("Previsões: ",r_forest.predict(some_data))
print("labels: ", labels.values)


Previsões:  [2. 2. 2. 2. 2.]
labels:  [2. 2. 2. 2. 2.]


## Avaliando e otimizando o modelo

In [45]:
import plotly.graph_objects as go
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 6, 8, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 300]},
             scoring='accuracy')

In [46]:
final_model = grid_search.best_estimator_
final_model_predictions = final_model.predict(x_test)

In [47]:

accuracy = accuracy_score(y_test, final_model_predictions)
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Test accuracy: {accuracy:.4f}")
print(classification_report(y_test, final_model_predictions))

Best parameters found: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 100}
Test accuracy: 0.9557
              precision    recall  f1-score   support

         0.0       0.91      0.90      0.91       119
         1.0       1.00      0.73      0.84        22
         2.0       0.97      0.99      0.98       358
         3.0       0.94      0.85      0.89        20

    accuracy                           0.96       519
   macro avg       0.96      0.87      0.91       519
weighted avg       0.96      0.96      0.95       519



In [48]:
feature_importances = final_model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

fig = go.Figure(go.Bar(
    x=importance_df['Importance'],
    y=importance_df['Feature'],
    orientation='h'
))
fig.update_layout(title='Feature Importances', xaxis_title='Importance', yaxis_title='Feature')
fig.show()

In [49]:
#real x values predictions
fig = go.Figure()
fig.add_trace(go.Scatter(y=y_test.values, mode='lines', name='Real Values'))
fig.add_trace(go.Scatter(y=final_model_predictions, mode='lines', name='Predictions'))
fig.update_layout(
    title='Real Values vs Predictions',
    xaxis_title='Index',
    yaxis_title='Class',
    legend_title='Legend',
   
)

fig.show()

In [53]:
new_data = pd.DataFrame({
    'buying': ['vhigh', 'low'],
    'maint': ['vhigh', 'med'],
    'doors': ['2', '4'],
    'persons': ['2', '4'],
    'lug_boot': ['small', 'big'],
    'safety': ['low', 'high']
})


new_data_encoded = ordinal_encoder_features.transform(new_data)

new_predictions_encoded = final_model.predict(new_data_encoded)

# convertendo predições codificadas para dado categórico
new_predictions = ordinal_encoder_target.inverse_transform(new_predictions_encoded.reshape(-1, 1)).ravel()

print("New Data Predictions: ", new_predictions)


new_data['predicted_class'] = new_predictions
print(new_data)

New Data Predictions:  ['unacc' 'vgood']
  buying  maint doors persons lug_boot safety predicted_class
0  vhigh  vhigh     2       2    small    low           unacc
1    low    med     4       4      big   high           vgood
