# Relatório ML II - Seleção de Atributos com Filter

## Carregando pacotes

In [1]:
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import (confusion_matrix, 
                             ConfusionMatrixDisplay, 
                             classification_report, 
                             RocCurveDisplay,
                             roc_curve,
                             r2_score,
                             mean_squared_error,
                             auc, accuracy_score)
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from scipy.io.arff import loadarff 

## Carregando banco de dados letter

In [2]:
# Load the ARFF records into a DataFrame (loadarff returns (data, meta)).
df_data = pd.DataFrame(loadarff('dataset_6_letter.arff')[0])
# loadarff yields nominal attributes as bytes; decode them directly instead of
# regex-matching the textual repr of the bytes objects ("b'A'").
df_data['class'] = df_data['class'].str.decode('utf-8')
# Encode the letters as integer codes; codes follow order of first appearance,
# so code 0 is not necessarily 'A'.
df_data['class'] = pd.factorize(df_data['class'])[0]

## Separação das Features do Target

In [3]:
# Feature matrix: every column except the last one.
X = df_data.iloc[:, :-1]
# Target vector: the last column (integer-encoded class).
y = df_data.iloc[:, -1]

## Filter selecionando 12 features

In [4]:
# Drop every feature whose variance does not exceed 4.10.
# NOTE(review): the selector is fitted on the full X, before the train/test split.
filter_variance = VarianceThreshold(threshold=4.10).fit(X)
X_filtered = filter_variance.transform(X)
print(
    f'Total de features: {X.shape[1]}',
    f'Selecionados: {X_filtered.shape[1]}',
    sep='\n',
)

Total de features: 16
Selecionados: 12


In [11]:
def show_selected(filter_variance, feature_names=None):
    """Print the names of the features retained by a fitted selector.

    Parameters
    ----------
    filter_variance : object
        Fitted selector exposing ``get_support()``, which returns a boolean
        mask over the input features.
    feature_names : sequence of str, optional
        Names aligned with the selector's input columns. When omitted, the
        columns of the notebook-level ``X`` are used, so existing
        ``show_selected(filter_variance)`` calls behave as before.

    Returns
    -------
    None
        The selected names are printed, not returned.
    """
    if feature_names is None:
        # Fall back to the notebook-level feature frame.
        feature_names = X.columns
    mask = filter_variance.get_support()
    print(pd.Index(feature_names)[mask])

In [10]:
# Print the column names kept by the selector fitted above.
show_selected(filter_variance)

Index(['y-box', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar',
       'x2ybr', 'xy2br', 'x-ege', 'y-ege'],
      dtype='object')


### Separação da base treinamento da base de teste

In [18]:
# Stratified 80/20 hold-out split of the filtered features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Função de cálculo de métricas

In [19]:
def eval_model(model, data=None):
    """Fit ``model`` on the training split and print test-set metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    data : tuple, optional
        ``(X_train, X_test, y_train, y_test)``. When omitted, the
        notebook-level variables with those names are used, so existing
        ``eval_model(model)`` calls keep working unchanged.

    Returns
    -------
    None
        The classification report, MSE, R2 and accuracy are printed.
    """
    if data is None:
        # Fall back to whichever split currently lives in the notebook namespace.
        data = (X_train, X_test, y_train, y_test)
    train_features, test_features, train_target, test_target = data

    model.fit(train_features, train_target)
    y_pred = model.predict(test_features)

    # zero_division=0 makes the handling of never-predicted classes explicit
    # instead of relying on the notebook-wide warnings filter to hide the
    # undefined-metric warning; the reported value (0.0) is the same.
    print(classification_report(test_target, y_pred, zero_division=0))
    # NOTE(review): MSE and R2 treat the integer class codes as numeric values,
    # so they depend on the arbitrary label encoding; kept for output
    # compatibility, but accuracy and the report above are the meaningful
    # classification metrics.
    print("MSE:", mean_squared_error(test_target, y_pred))
    print("R2_score:", r2_score(test_target, y_pred))
    print("Accuracy score:", accuracy_score(test_target,y_pred))

### KNN

In [20]:
# 1-nearest-neighbour classifier with Euclidean distance.
model = KNeighborsClassifier(
    n_neighbors=1,
    metric='euclidean',
)
eval_model(model)

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       147
           1       0.97      0.90      0.93       161
           2       0.95      0.97      0.96       149
           3       0.88      0.88      0.88       147
           4       0.89      0.92      0.90       155
           5       0.94      0.95      0.95       157
           6       0.85      0.85      0.85       151
           7       0.98      0.98      0.98       158
           8       0.93      0.96      0.94       161
           9       0.94      0.95      0.94       153
          10       0.99      0.98      0.99       158
          11       0.89      0.89      0.89       148
          12       0.90      0.90      0.90       154
          13       0.92      0.94      0.93       151
          14       0.93      0.97      0.95       157
          15       0.95      0.95      0.95       152
          16       0.97      0.96      0.96       157
          17       0.96    

### Árvore de Decisão

In [21]:
# Decision tree using the log-loss (entropy) split criterion.
# random_state is fixed (same seed as train_test_split) because scikit-learn
# permutes the candidate features at every split even with splitter='best',
# so an unseeded tree can differ between runs.
model = DecisionTreeClassifier(
    criterion='log_loss',
    max_depth=100,
    splitter='best',
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       147
           1       0.87      0.84      0.85       161
           2       0.85      0.81      0.83       149
           3       0.79      0.82      0.80       147
           4       0.84      0.86      0.85       155
           5       0.84      0.87      0.85       157
           6       0.76      0.79      0.78       151
           7       0.93      0.89      0.91       158
           8       0.84      0.80      0.82       161
           9       0.88      0.89      0.88       153
          10       0.93      0.94      0.94       158
          11       0.77      0.84      0.81       148
          12       0.81      0.84      0.82       154
          13       0.85      0.87      0.86       151
          14       0.85      0.86      0.85       157
          15       0.93      0.89      0.91       152
          16       0.93      0.90      0.92       157
          17       0.87    

### Floresta Randômica

In [22]:
# Random forest of 10 entropy-criterion trees.
# random_state is fixed (same seed as train_test_split) so that bootstrap
# sampling and per-split feature sampling are reproducible across runs.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=1000,
    n_estimators=10,
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       147
           1       0.91      0.93      0.92       161
           2       0.90      0.91      0.90       149
           3       0.84      0.83      0.84       147
           4       0.87      0.94      0.90       155
           5       0.92      0.92      0.92       157
           6       0.86      0.91      0.88       151
           7       0.95      0.97      0.96       158
           8       0.86      0.91      0.88       161
           9       0.95      0.89      0.92       153
          10       0.96      0.97      0.97       158
          11       0.87      0.85      0.86       148
          12       0.92      0.92      0.92       154
          13       0.91      0.90      0.91       151
          14       0.92      0.96      0.94       157
          15       0.97      0.92      0.95       152
          16       0.93      0.95      0.94       157
          17       0.98    

### Regressão Logística

In [23]:
# L2-regularised logistic regression (C=0.5) fitted with the newton-cg solver.
model = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    C=0.5,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.76      0.73      0.75       147
           1       0.79      0.78      0.79       161
           2       0.51      0.56      0.53       149
           3       0.49      0.39      0.43       147
           4       0.72      0.74      0.73       155
           5       0.76      0.80      0.78       157
           6       0.61      0.70      0.65       151
           7       0.81      0.87      0.84       158
           8       0.68      0.71      0.70       161
           9       0.78      0.83      0.81       153
          10       0.91      0.85      0.88       158
          11       0.55      0.69      0.61       148
          12       0.68      0.69      0.69       154
          13       0.66      0.52      0.58       151
          14       0.70      0.75      0.72       157
          15       0.83      0.72      0.77       152
          16       0.76      0.72      0.74       157
          17       0.76    

### Naive Bayes Gaussiano

In [24]:
# Gaussian Naive Bayes with no hyperparameters overridden.
model = GaussianNB()
eval_model(model)

              precision    recall  f1-score   support

           0       0.82      0.66      0.73       147
           1       0.82      0.72      0.77       161
           2       0.34      0.28      0.31       149
           3       0.54      0.31      0.39       147
           4       0.63      0.79      0.70       155
           5       0.82      0.59      0.69       157
           6       0.60      0.60      0.60       151
           7       0.62      0.86      0.72       158
           8       0.57      0.73      0.64       161
           9       0.69      0.75      0.72       153
          10       0.86      0.87      0.86       158
          11       0.51      0.53      0.52       148
          12       0.57      0.32      0.41       154
          13       0.52      0.69      0.59       151
          14       0.54      0.61      0.57       157
          15       0.99      0.72      0.83       152
          16       0.46      0.62      0.53       157
          17       0.65    

### MLP

In [25]:
# MLP with two hidden layers (16 and 26 units) and tanh activations.
# random_state is fixed (same seed as train_test_split) so weight
# initialisation and batch shuffling are reproducible across runs.
# NOTE(review): learning_rate is only used when solver='sgd'; with the default
# solver ('adam') the 'invscaling' setting has no effect.
model = MLPClassifier(
    activation='tanh',
    hidden_layer_sizes=(16, 26),
    learning_rate='invscaling',
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.88      0.81      0.84       147
           1       0.96      0.87      0.91       161
           2       0.64      0.75      0.69       149
           3       0.80      0.69      0.74       147
           4       0.83      0.84      0.84       155
           5       0.92      0.86      0.89       157
           6       0.72      0.74      0.73       151
           7       0.87      0.94      0.90       158
           8       0.74      0.84      0.79       161
           9       0.88      0.88      0.88       153
          10       0.98      0.88      0.93       158
          11       0.78      0.76      0.77       148
          12       0.71      0.73      0.72       154
          13       0.90      0.79      0.84       151
          14       0.80      0.89      0.84       157
          15       0.93      0.88      0.90       152
          16       0.87      0.90      0.88       157
          17       0.85    

### SVM

In [26]:
# RBF-kernel support vector classifier with C=20.
# decision_function_shape only selects the shape of decision_function output.
model = SVC(
    kernel='rbf',
    C=20,
    decision_function_shape='ovo',
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       147
           1       0.98      0.88      0.93       161
           2       0.99      0.97      0.98       149
           3       0.87      0.89      0.88       147
           4       0.89      0.92      0.90       155
           5       0.92      0.96      0.94       157
           6       0.86      0.91      0.88       151
           7       0.99      0.97      0.98       158
           8       0.90      0.91      0.90       161
           9       0.95      0.95      0.95       153
          10       0.99      0.99      0.99       158
          11       0.90      0.89      0.89       148
          12       0.89      0.92      0.90       154
          13       0.97      0.93      0.95       151
          14       0.91      0.99      0.95       157
          15       0.99      0.96      0.97       152
          16       0.97      0.96      0.97       157
          17       0.99    

## Filter selecionando 8 features

In [13]:
# Drop every feature whose variance does not exceed 5.25.
# NOTE(review): the selector is fitted on the full X, before the train/test split.
filter_variance = VarianceThreshold(threshold=5.25).fit(X)
X_filtered = filter_variance.transform(X)
print(
    f'Total de features: {X.shape[1]}',
    f'Selecionados: {X_filtered.shape[1]}',
    sep='\n',
)

Total de features: 16
Selecionados: 8


In [14]:
# Print the column names kept by the selector fitted above.
show_selected(filter_variance)

Index(['y-box', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'x-ege', 'y-ege'], dtype='object')


### Separação da base treinamento da base de teste

In [28]:
# Stratified 80/20 hold-out split of the filtered features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### KNN

In [29]:
# 1-nearest-neighbour classifier with Euclidean distance.
model = KNeighborsClassifier(
    n_neighbors=1,
    metric='euclidean',
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       147
           1       0.89      0.88      0.89       161
           2       0.91      0.91      0.91       149
           3       0.76      0.87      0.81       147
           4       0.88      0.90      0.89       155
           5       0.92      0.92      0.92       157
           6       0.81      0.83      0.82       151
           7       0.95      0.97      0.96       158
           8       0.83      0.80      0.81       161
           9       0.91      0.91      0.91       153
          10       0.97      0.95      0.96       158
          11       0.85      0.80      0.82       148
          12       0.83      0.84      0.83       154
          13       0.85      0.89      0.87       151
          14       0.88      0.92      0.90       157
          15       0.92      0.93      0.93       152
          16       0.92      0.91      0.92       157
          17       0.95    

### Árvore de Decisão

In [30]:
# Decision tree using the log-loss (entropy) split criterion.
# random_state is fixed (same seed as train_test_split) because scikit-learn
# permutes the candidate features at every split even with splitter='best',
# so an unseeded tree can differ between runs.
model = DecisionTreeClassifier(
    criterion='log_loss',
    max_depth=100,
    splitter='best',
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       147
           1       0.80      0.82      0.81       161
           2       0.82      0.82      0.82       149
           3       0.76      0.79      0.78       147
           4       0.86      0.86      0.86       155
           5       0.83      0.90      0.87       157
           6       0.77      0.77      0.77       151
           7       0.94      0.90      0.92       158
           8       0.70      0.71      0.71       161
           9       0.90      0.87      0.88       153
          10       0.92      0.92      0.92       158
          11       0.75      0.78      0.76       148
          12       0.83      0.80      0.81       154
          13       0.83      0.83      0.83       151
          14       0.78      0.84      0.81       157
          15       0.91      0.89      0.90       152
          16       0.88      0.92      0.90       157
          17       0.88    

### Floresta Randômica

In [31]:
# Random forest of 10 entropy-criterion trees.
# random_state is fixed (same seed as train_test_split) so that bootstrap
# sampling and per-split feature sampling are reproducible across runs.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=1000,
    n_estimators=10,
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       147
           1       0.91      0.88      0.89       161
           2       0.89      0.91      0.90       149
           3       0.80      0.90      0.85       147
           4       0.84      0.89      0.87       155
           5       0.86      0.93      0.89       157
           6       0.78      0.87      0.83       151
           7       0.92      0.92      0.92       158
           8       0.77      0.77      0.77       161
           9       0.91      0.91      0.91       153
          10       0.94      0.94      0.94       158
          11       0.87      0.86      0.86       148
          12       0.83      0.81      0.82       154
          13       0.87      0.84      0.86       151
          14       0.86      0.92      0.89       157
          15       0.94      0.89      0.92       152
          16       0.92      0.93      0.92       157
          17       0.94    

### Regressão Logística

In [32]:
# L2-regularised logistic regression (C=0.5) fitted with the newton-cg solver.
model = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    C=0.5,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.67      0.64      0.66       147
           1       0.78      0.68      0.73       161
           2       0.33      0.40      0.36       149
           3       0.27      0.18      0.22       147
           4       0.65      0.67      0.66       155
           5       0.71      0.82      0.76       157
           6       0.58      0.79      0.67       151
           7       0.83      0.85      0.84       158
           8       0.49      0.61      0.55       161
           9       0.75      0.82      0.78       153
          10       0.94      0.84      0.89       158
          11       0.17      0.18      0.17       148
          12       0.46      0.36      0.40       154
          13       0.63      0.46      0.53       151
          14       0.37      0.48      0.42       157
          15       0.81      0.71      0.76       152
          16       0.48      0.55      0.51       157
          17       0.64    

### Naive Bayes Gaussiano

In [33]:
# Gaussian Naive Bayes with no hyperparameters overridden.
model = GaussianNB()
eval_model(model)

              precision    recall  f1-score   support

           0       0.75      0.63      0.68       147
           1       0.90      0.61      0.73       161
           2       0.31      0.25      0.27       149
           3       0.67      0.31      0.42       147
           4       0.59      0.73      0.65       155
           5       0.85      0.66      0.74       157
           6       0.60      0.57      0.59       151
           7       0.69      0.88      0.78       158
           8       0.49      0.43      0.46       161
           9       0.69      0.76      0.72       153
          10       0.88      0.86      0.87       158
          11       0.15      0.11      0.13       148
          12       0.47      0.34      0.39       154
          13       0.42      0.70      0.52       151
          14       0.55      0.52      0.53       157
          15       0.84      0.72      0.77       152
          16       0.33      0.58      0.42       157
          17       0.71    

### MLP

In [34]:
# MLP with two hidden layers (16 and 26 units) and tanh activations.
# random_state is fixed (same seed as train_test_split) so weight
# initialisation and batch shuffling are reproducible across runs.
# NOTE(review): learning_rate is only used when solver='sgd'; with the default
# solver ('adam') the 'invscaling' setting has no effect.
model = MLPClassifier(
    activation='tanh',
    hidden_layer_sizes=(16, 26),
    learning_rate='invscaling',
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       147
           1       0.86      0.81      0.83       161
           2       0.60      0.67      0.63       149
           3       0.65      0.60      0.62       147
           4       0.73      0.75      0.74       155
           5       0.86      0.83      0.85       157
           6       0.65      0.77      0.71       151
           7       0.91      0.91      0.91       158
           8       0.68      0.69      0.68       161
           9       0.88      0.83      0.85       153
          10       0.89      0.89      0.89       158
          11       0.70      0.70      0.70       148
          12       0.68      0.64      0.66       154
          13       0.74      0.78      0.76       151
          14       0.79      0.78      0.78       157
          15       0.84      0.85      0.84       152
          16       0.74      0.85      0.79       157
          17       0.82    

### SVM

In [35]:
# RBF-kernel support vector classifier with C=20.
# decision_function_shape only selects the shape of decision_function output.
model = SVC(
    kernel='rbf',
    C=20,
    decision_function_shape='ovo',
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       147
           1       0.92      0.88      0.90       161
           2       0.88      0.90      0.89       149
           3       0.84      0.83      0.83       147
           4       0.89      0.88      0.88       155
           5       0.92      0.92      0.92       157
           6       0.75      0.91      0.82       151
           7       0.95      0.96      0.95       158
           8       0.72      0.75      0.73       161
           9       0.95      0.89      0.92       153
          10       0.99      0.95      0.97       158
          11       0.87      0.84      0.85       148
          12       0.81      0.84      0.82       154
          13       0.84      0.88      0.86       151
          14       0.84      0.90      0.87       157
          15       0.95      0.90      0.93       152
          16       0.90      0.94      0.92       157
          17       0.94    

## Filter selecionando 4 features

In [16]:
# Drop every feature whose variance does not exceed 6.5.
# NOTE(review): the selector is fitted on the full X, before the train/test split.
filter_variance = VarianceThreshold(threshold=6.5).fit(X)
X_filtered = filter_variance.transform(X)
print(
    f'Total de features: {X.shape[1]}',
    f'Selecionados: {X_filtered.shape[1]}',
    sep='\n',
)
# Stratified 80/20 hold-out split of the filtered features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Total de features: 16
Selecionados: 4


In [17]:
# Print the column names kept by the selector fitted above.
show_selected(filter_variance)

Index(['y-box', 'x2bar', 'x2ybr', 'y-ege'], dtype='object')


### KNN

In [37]:
# 1-nearest-neighbour classifier with Euclidean distance.
model = KNeighborsClassifier(
    n_neighbors=1,
    metric='euclidean',
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.51      0.44      0.47       147
           1       0.27      0.25      0.26       161
           2       0.44      0.50      0.47       149
           3       0.46      0.44      0.45       147
           4       0.41      0.45      0.43       155
           5       0.52      0.54      0.53       157
           6       0.45      0.44      0.45       151
           7       0.55      0.56      0.56       158
           8       0.28      0.32      0.29       161
           9       0.59      0.51      0.55       153
          10       0.66      0.58      0.62       158
          11       0.28      0.27      0.28       148
          12       0.34      0.38      0.36       154
          13       0.38      0.41      0.40       151
          14       0.54      0.45      0.49       157
          15       0.55      0.61      0.58       152
          16       0.51      0.51      0.51       157
          17       0.64    

### Árvore de Decisão

In [38]:
# Decision tree using the log-loss (entropy) split criterion.
# random_state is fixed (same seed as train_test_split) because scikit-learn
# permutes the candidate features at every split even with splitter='best',
# so an unseeded tree can differ between runs.
model = DecisionTreeClassifier(
    criterion='log_loss',
    max_depth=100,
    splitter='best',
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.48      0.76      0.59       147
           1       0.35      0.39      0.37       161
           2       0.38      0.59      0.47       149
           3       0.50      0.50      0.50       147
           4       0.49      0.55      0.52       155
           5       0.59      0.59      0.59       157
           6       0.48      0.58      0.52       151
           7       0.55      0.58      0.57       158
           8       0.37      0.29      0.33       161
           9       0.61      0.71      0.66       153
          10       0.71      0.67      0.69       158
          11       0.37      0.26      0.30       148
          12       0.43      0.51      0.47       154
          13       0.45      0.54      0.49       151
          14       0.60      0.48      0.54       157
          15       0.70      0.65      0.68       152
          16       0.56      0.59      0.58       157
          17       0.71    

### Floresta Randômica

In [39]:
# Random forest of 10 entropy-criterion trees.
# random_state is fixed (same seed as train_test_split) so that bootstrap
# sampling and per-split feature sampling are reproducible across runs.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=1000,
    n_estimators=10,
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.56      0.63      0.59       147
           1       0.41      0.34      0.37       161
           2       0.45      0.59      0.51       149
           3       0.53      0.49      0.51       147
           4       0.52      0.54      0.53       155
           5       0.64      0.56      0.60       157
           6       0.46      0.53      0.49       151
           7       0.58      0.60      0.59       158
           8       0.37      0.33      0.35       161
           9       0.66      0.70      0.68       153
          10       0.72      0.69      0.71       158
          11       0.32      0.23      0.27       148
          12       0.42      0.53      0.47       154
          13       0.47      0.60      0.53       151
          14       0.60      0.53      0.56       157
          15       0.66      0.66      0.66       152
          16       0.52      0.57      0.54       157
          17       0.68    

### Regressão Logística

In [40]:
# L2-regularised logistic regression (C=0.5) fitted with the newton-cg solver.
model = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    C=0.5,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.23      0.35      0.28       147
           1       0.07      0.03      0.04       161
           2       0.09      0.11      0.10       149
           3       0.21      0.11      0.14       147
           4       0.22      0.14      0.17       155
           5       0.30      0.36      0.33       157
           6       0.36      0.52      0.42       151
           7       0.37      0.23      0.28       158
           8       0.15      0.11      0.13       161
           9       0.43      0.44      0.43       153
          10       0.47      0.63      0.54       158
          11       0.08      0.03      0.05       148
          12       0.10      0.08      0.09       154
          13       0.26      0.25      0.25       151
          14       0.21      0.30      0.25       157
          15       0.38      0.39      0.39       152
          16       0.14      0.25      0.18       157
          17       0.48    

### Naive Bayes Gaussiano

In [41]:
# Gaussian Naive Bayes with no hyperparameters overridden.
model = GaussianNB()
eval_model(model)

              precision    recall  f1-score   support

           0       0.18      0.34      0.24       147
           1       0.13      0.09      0.10       161
           2       0.17      0.14      0.16       149
           3       0.00      0.00      0.00       147
           4       0.04      0.01      0.02       155
           5       0.43      0.31      0.36       157
           6       0.26      0.34      0.30       151
           7       0.40      0.01      0.02       158
           8       0.20      0.03      0.05       161
           9       0.38      0.55      0.45       153
          10       0.33      0.81      0.47       158
          11       0.00      0.00      0.00       148
          12       0.04      0.03      0.03       154
          13       0.27      0.71      0.39       151
          14       0.43      0.08      0.13       157
          15       0.62      0.34      0.44       152
          16       0.08      0.11      0.09       157
          17       0.50    

### MLP

In [42]:
# MLP with two hidden layers (16 and 26 units) and tanh activations.
# random_state is fixed (same seed as train_test_split) so weight
# initialisation and batch shuffling are reproducible across runs.
# NOTE(review): learning_rate is only used when solver='sgd'; with the default
# solver ('adam') the 'invscaling' setting has no effect.
model = MLPClassifier(
    activation='tanh',
    hidden_layer_sizes=(16, 26),
    learning_rate='invscaling',
    random_state=42,
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.34      0.63      0.45       147
           1       0.32      0.20      0.25       161
           2       0.41      0.36      0.38       149
           3       0.56      0.32      0.41       147
           4       0.49      0.45      0.47       155
           5       0.50      0.43      0.47       157
           6       0.36      0.56      0.44       151
           7       0.67      0.47      0.55       158
           8       0.44      0.30      0.36       161
           9       0.63      0.51      0.56       153
          10       0.66      0.70      0.68       158
          11       0.32      0.18      0.23       148
          12       0.30      0.34      0.32       154
          13       0.37      0.48      0.42       151
          14       0.47      0.48      0.47       157
          15       0.73      0.59      0.65       152
          16       0.33      0.42      0.37       157
          17       0.62    

### SVM

In [43]:
# RBF-kernel support vector classifier with C=20.
# decision_function_shape only selects the shape of decision_function output.
model = SVC(
    kernel='rbf',
    C=20,
    decision_function_shape='ovo',
)
eval_model(model)

              precision    recall  f1-score   support

           0       0.33      0.61      0.43       147
           1       0.33      0.22      0.26       161
           2       0.44      0.56      0.50       149
           3       0.63      0.31      0.42       147
           4       0.49      0.49      0.49       155
           5       0.54      0.52      0.53       157
           6       0.44      0.61      0.51       151
           7       0.76      0.47      0.58       158
           8       0.43      0.30      0.36       161
           9       0.62      0.65      0.64       153
          10       0.74      0.73      0.74       158
          11       0.49      0.22      0.31       148
          12       0.30      0.53      0.38       154
          13       0.43      0.66      0.52       151
          14       0.60      0.54      0.57       157
          15       0.76      0.60      0.67       152
          16       0.45      0.50      0.47       157
          17       0.76    