In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [122]:
def performance_report(model, X_test, y_test, X_train):
    # Previsões sobre o teste
    y_pred = model.predict(X_test)
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Precisão: {accuracy :.2f}')
    # Classification report
    print()
    print('Classification_report:')
    print(classification_report(y_test, y_pred))
    # Feature importance
    importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    print()
    print('Feature importance:')
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Feature Importance')
    plt.gca().invert_yaxis()
    plt.show()


# Leitura de dados

In [123]:
df = pd.read_csv('resultados_instancias_tcc.csv')
df = df.drop(columns='#')
df = df[df['Obj_RF_T_0'] != np.inf]  # Remove infactíveis

In [124]:
df.head()

Unnamed: 0,Instancias,Plantas,Produtos,Periodos,Bin_vars,Setup_time,Setup_cost,Capacidade,Obj_RF_1_0,Time_RF_1_0,...,Obj_RF_4_2,Time_RF_4_2,Obj_RF_4_3,Time_RF_4_3,Obj_RF_6_0,Time_RF_6_0,Obj_RF_6_1,Time_RF_6_1,Obj_RF_T_0,Time_RF_T_0
0,AAA00_12_2_10,2,10,12,240,high,high,tight,68290.89,1.1,...,,,,,65345.74,13.7,64514.48,13.11,63942.18,661.7
1,AAA00_12_2_60,2,60,12,1440,high,high,tight,472379.31,16.1,...,,,,,417894.66,1800.37,416054.0,1201.15,414813.46,1800.3
2,AAA00_12_2_120,2,120,12,2880,high,high,tight,876917.39,54.6,...,,,,,823765.77,1800.51,809997.06,1201.2,808310.31,1800.1
3,AAA00_12_4_10,4,10,12,480,high,high,tight,119691.81,8.3,...,,,,,116811.61,1387.94,116443.66,1200.89,116655.81,1800.3
4,AAA00_12_4_60,4,60,12,2880,high,high,tight,675451.58,126.0,...,,,,,659496.4,1800.13,661304.02,1203.93,677672.79,1800.0


# Construção de targets

In [125]:
obj_columns = [col for col in df.columns if col.startswith('Obj_')]
time_columns = [col for col in df.columns if col.startswith('Time_')]

## Target em função objetivo (BEST)

In [126]:
# Seleciona para cada instância o método com menor função objetivo para a coluna BEST
df['BEST'] = df[obj_columns].idxmin(axis=1)
df['BEST'] = df['BEST'].str.replace('Obj_', '')

In [127]:
df['BEST'].value_counts()

BEST
RF_T_0    221
RF_6_0     73
RF_3_1     60
RF_2_0     52
RF_1_0     48
RF_6_1     45
RF_2_1     36
RF_3_0     33
RF_4_0     30
Name: count, dtype: int64

## Target com compromisso de tempo (BEST_TIME)

In [128]:
# Seleciona para cada instância o método com melhor relação função objetivo e tempo para a coluna BEST_TIME
# O compromisso com tempo 

# Preprocessamento de dados

## Drop Obj_ e Time_

In [129]:
df = df.drop(columns=obj_columns + time_columns + ['Instancias'])

## Encoding de variáveis categóricas

In [130]:
categorical_cols = ['Setup_time', 'Setup_cost', 'Capacidade', 'BEST']

encode_dict = {
    'Setup_time': ['normal', 'high'],
    'Setup_cost': ['normal', 'high'],
    'Capacidade': ['tight', 'normal'],
    'BEST': sorted(df['BEST'].unique())
}

decoding_dict = {}

for col, order in encode_dict.items():
    df[col] = pd.Categorical(df[col], categories=order, ordered=True).codes
    decoding_dict[col] = {i: category for i, category in enumerate(order)}

# Train e test split

In [131]:
df.head()

Unnamed: 0,Plantas,Produtos,Periodos,Bin_vars,Setup_time,Setup_cost,Capacidade,BEST
0,2,10,12,240,1,1,0,8
1,2,60,12,1440,1,1,0,8
2,2,120,12,2880,1,1,0,8
3,4,10,12,480,1,1,0,7
4,4,60,12,2880,1,1,0,4


In [132]:
X, y = df.drop(columns='BEST'), df['BEST']

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2112)

In [134]:
X_train

Unnamed: 0,Plantas,Produtos,Periodos,Bin_vars,Setup_time,Setup_cost,Capacidade
136,2,60,12,1440,0,1,0
257,2,120,12,2880,0,0,0
286,2,60,12,1440,0,0,0
547,6,60,12,4320,0,0,1
134,20,120,12,28800,0,1,0
...,...,...,...,...,...,...,...
57,20,10,12,2400,1,1,0
391,2,60,12,1440,0,1,1
387,20,10,12,2400,0,1,1
167,2,120,12,2880,1,0,0


# Modelos

Num primeiro momento vou testar diversos algoritmos antes de buscar precisão. Ainda tenho algum trabalho de feature engineering para ter tudo afinado.

In [135]:
# Inicialização dos classificadores
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=2112, max_iter=1000),
    'Support Vector Machine': SVC(kernel='rbf', random_state=2112, probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(random_state=2112),
    'Random Forest': RandomForestClassifier(random_state=2112),
    'Naive Bayes': GaussianNB()
}

# Teste dos classificadores em bulk
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)  # Treinos
    y_pred = clf.predict(X_test)  # Previsões
    acc = accuracy_score(y_test, y_pred)  # Precisão
    print(f'{name} Accuracy: {acc:.2f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.41
Support Vector Machine Accuracy: 0.41
K-Nearest Neighbors Accuracy: 0.62
Gradient Boosting Accuracy: 0.62
Random Forest Accuracy: 0.62
Naive Bayes Accuracy: 0.42
