### Trabalho realizado por: 
- Daniel Luís, 56362
- João Gonçalo Santos, 57103
- Paulo Bolinhas, 56300
- Rui Martins, 56283

### Carregar Data Set

In [18]:
# Import necessary libraries
import pandas as pd
# Load the dataset
biodeg_data = pd.read_csv('biodegradable_a.csv', sep=',')

### Fazer a respetiva divisão das colunas entre valores contínuos e categóricos

In [19]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

columns_names = biodeg_data.columns
all_data_values = biodeg_data.values

categorical_cols = [23,24, 28, 41]
continuous_cols = [col for col in range(42) if col not in categorical_cols]

#data by type
categorical_data = biodeg_data.iloc[:, categorical_cols]
continuous_data = biodeg_data.iloc[:, continuous_cols]

### Lidar com os valores em falta

Análise das colunas para determinar a quantidade de valores em falta, para escolher uma estratégia:

In [20]:
q1 = continuous_data.quantile(0.25)
q3 = continuous_data.quantile(0.75)
iqr = q3 - q1

outliers = []

num_lines = len(continuous_data)
print("Total number of lines:", num_lines)

missing_values = []
for col in continuous_data.columns:
    # calcular o numero de valores em falta
    num_missing = continuous_data[col].isnull().sum()
    missing_values.append(num_missing)
    pct_missing = num_missing / num_lines * 100
    pct_missing = round(pct_missing, 2)
    if (pct_missing > 20):
        print("Column", col, "has", pct_missing, "% missing values.")

    # calcular o numero de outliers
    num_outliers = ((continuous_data[col] < (q1[col] - 1.5 * iqr[col])) | (continuous_data[col] > (q3[col] + 1.5 * iqr[col]))).sum()
    outliers.append(num_outliers)
    pct_outliers = num_outliers / num_lines * 100
    pct_outliers = round(pct_outliers, 2)
    if (pct_outliers > 20):
        print("Column", col, "has", pct_outliers, "% outliers.")

Total number of lines: 4564
Column Psi_i_1d has 23.4 % outliers.
Column SpMax_B has 29.75 % missing values.


Verifica-se que apenas duas colunas ultrapassam as métricas analisadas, por isso, o algoritmo a utilizar para lidar com valores em falta de dados contínuos é o mean.

### Função pré-definida para calcular e devolver as várias métricas de avaliação relevantes relativos a um problema de classficação

In [21]:
from sklearn.metrics import confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score

def present_reg_statistics(y_test, preds):
    print("The Precision is: %7.4f" % precision_score(y_test, preds, pos_label='RB'))
    print("The Recall is: %7.4f" % recall_score(y_test, preds, pos_label='RB'))
    print("The F1 score is: %7.4f" % f1_score(y_test, preds, pos_label='RB'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, preds)))

### Análise dos modelos de scaling para KNN + Simple Cross Validation vs K Fold Validation + Tunning de hiperparâmetros + Comportamento do classificador usando todos os dados disponíveis

Avaliar qual o scaler que obtém melhores resultados para o classificador KNeighborsClassifier, testando com Simple Cross Validation vs K Fold Validation, e, também, fazendo uso da técnica de tunning de hiperparâmetros.

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.model_selection import KFold

scalers = {
    "StandardScaler": StandardScaler(), 
    "MinMaxScaler": MinMaxScaler(), 
    "PowerTransformer": PowerTransformer()
}

#Foi retirado o Simple Cross Validator, de forma a cumprir o requisito imposto relativo às 12páginas máx
#Basta decomentar a linha abaixo e comentar a seguinte para verificar simple cross validation.

# validators = ["Simple","K"]
validators = ["K"]

for validator in validators:
    print("\n",f"------------------ VALIDATOR {validator} ------------------\n")

    X = biodeg_data.values[:, :-1]
    y = biodeg_data["Biodegradable"].values

    if validator == "Simple":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)
    else:
        kf = KFold(n_splits=10, shuffle=True, random_state=23)
        kf.get_n_splits(X)

        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

    for scaler_name, scaler in scalers.items():
        print("\n",f"--- SCALING WITH {scaler_name}:\n")
            
        #Scale only the continous data
        Xt_train_cont = pd.DataFrame(scaler.fit_transform(X_train[:, continuous_cols]), columns=columns_names[continuous_cols])
        Xt_test_cont = pd.DataFrame(scaler.transform(X_test[:, continuous_cols]), columns=columns_names[continuous_cols])
    
        #Get the categorical data (not scaled)
        Xt_train_cat = pd.DataFrame(X_train[:, categorical_cols[:-1]], columns=columns_names[categorical_cols[:-1]])
        Xt_test_cat = pd.DataFrame(X_test[:, categorical_cols[:-1]], columns=columns_names[categorical_cols[:-1]])

        # Join continuous and categorical data
        Xt_train = pd.concat([Xt_train_cont, Xt_train_cat], axis=1)
        Xt_test = pd.concat([Xt_test_cont, Xt_test_cat], axis=1)
        
        #Original order with new data
        # Get the original column order
        original_column_order = list(biodeg_data.columns[:-1])
        
        # Reorder the columns in the preprocessed data to match the original order
        Xt_train = Xt_train[original_column_order]
        Xt_test = Xt_test[original_column_order]
        
        #Re-convert in numpy.ndarray
        Xt_train = Xt_train.values
        Xt_test = Xt_test.values

        #Handle missing values
        imputer = SimpleImputer(strategy='mean')
        Xt_train_imp_cont = imputer.fit_transform(Xt_train[:, continuous_cols])
        Xt_test_imp_cont = imputer.fit_transform(Xt_test[:, continuous_cols])
        
        imputer = SimpleImputer(strategy='most_frequent')
        Xt_train_imp_cat = imputer.fit_transform(Xt_train[:, categorical_cols[:-1]])
        Xt_test_imp_cat = imputer.fit_transform(Xt_test[:, categorical_cols[:-1]])
        
        # concatenate the continuous and categorical arrays
        Xt_train_imp = np.concatenate([Xt_train_imp_cont, Xt_train_imp_cat], axis=1)
        Xt_test_imp = np.concatenate([Xt_test_imp_cont, Xt_test_imp_cat], axis=1)

        # retrieve the original column order
        original_order = np.concatenate([continuous_cols, categorical_cols[:-1]])
        sorted_indices = np.argsort(original_order)

        #reorder the concatenated arrays into the original order 
        Xt_train = Xt_train_imp[:, sorted_indices]
        Xt_test = Xt_test_imp[:, sorted_indices]
        
        #----------------
        # define the values of k to try
        k_values = range(3, 8, 2)

        # define the types of weights to use
        weights_list = ["uniform", "distance"]

        for weights in weights_list:
            print("\n", f"- KNN with {weights}")
            for k in k_values:
                knn = KNeighborsClassifier(n_neighbors=k, weights=weights)
                # fit & predict
                knn.fit(Xt_train, y_train)
                preds = knn.predict(Xt_test)

                # print evaluation metrics
                print("\n", f"NN - {k}")
                accuracy = accuracy_score(y_test, preds)
                present_reg_statistics(y_test, preds)
                print(f"Accuracy: {accuracy}")
                


 ------------------ VALIDATOR K ------------------


 --- SCALING WITH StandardScaler:


 - KNN with uniform

 NN - 3
The Precision is:  0.9673
The Recall is:  0.9872
The F1 score is:  0.9772
The Matthews correlation coefficient is:  0.8344
This is the Confusion Matrix
    0    1
0  53   13
1   5  385
Accuracy: 0.9605263157894737

 NN - 5
The Precision is:  0.9699
The Recall is:  0.9923
The F1 score is:  0.9810
The Matthews correlation coefficient is:  0.8622
This is the Confusion Matrix
    0    1
0  54   12
1   3  387
Accuracy: 0.9671052631578947

 NN - 7
The Precision is:  0.9650
The Recall is:  0.9897
The F1 score is:  0.9772
The Matthews correlation coefficient is:  0.8336
This is the Confusion Matrix
    0    1
0  52   14
1   4  386
Accuracy: 0.9605263157894737

 - KNN with distance

 NN - 3
The Precision is:  0.9673
The Recall is:  0.9846
The F1 score is:  0.9759
The Matthews correlation coefficient is:  0.8257
This is the Confusion Matrix
    0    1
0  53   13
1   6  384
Accur

In [24]:
tree_entropia = DecisionTreeClassifier(criterion="entropy")
tree_entropia.fit(Xt_train, y_train)
y_pred_entropia = tree_entropia.predict(Xt_test)

tree_gini = DecisionTreeClassifier(criterion="gini")
tree_gini.fit(Xt_train, y_train)
y_pred_gini = tree_gini.predict(Xt_test)

present_reg_statistics(y_test, y_pred_entropia)
present_reg_statistics(y_test, y_pred_gini)


ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Feature Selection
#### Correlation

Top X atributos mais correlacionados à variàvel y - Variante: Correlation.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import explained_variance_score

N = Xt_train.shape[0]

y_train = np.array(y_train)
y_test = np.array(y_test)

mapping = {'RB': 0, 'NRB': 1}

# Use the map method to replace the values in y_train
y_train_mapped = np.array([mapping[label] for label in y_train])
y_test_mapped = np.array([mapping[label] for label in y_test])

# converter y para float
y_train_float = y_train_mapped.astype(float)
y_test_float = y_test_mapped.astype(float)

#append the y to the X matrix
v=np.hstack((y_train_float.reshape((N,1)), Xt_train))

# converter de object para float64
v = v.astype(np.float64)

#compute and view the correlation matrix (note that we are interested only on the first column (0) 
#which shows the correlation between each variable and the label y.
corr_matrix_y = np.corrcoef(v.T)[0, 1:]

#38 - tested and got the best values
top_X_corr_vars = np.argsort(abs(corr_matrix_y))[::-1][:38]



### Stepwise Feature selection

Top X atributos mais correlacionados à variàvel y - Variante: Stepwise methods, com duas variantes, nomeadamente direction="forward" vs "backward".

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

N,M=Xt_train.shape
#----Forward----
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

lmr = LinearRegression()
sfs_forward = SequentialFeatureSelector(lmr, n_features_to_select=39, direction='forward', cv=5)
sfs_forward.fit(Xt_train, y_train_float)
# get the relevant columns
features_forward = sfs_forward.get_support()
Features_selected_forward =np.arange(M)[features_forward]
print("Forward - The features selected are columns: ", Features_selected_forward)

nXt_train_stepWise_forward=sfs_forward.transform(Xt_train)
nXt_test_stepWise_forward=sfs_forward.transform(Xt_test)

#----Backward----
lmr = LinearRegression()
sfs_backward = SequentialFeatureSelector(lmr, n_features_to_select=39, direction='backward', cv=5)
sfs_backward.fit(Xt_train, y_train_float)
# get the relevant columns
features_backward = sfs_backward.get_support()
Features_selected_backward =np.arange(M)[features_backward]
print("Backward - The features selected are columns: ", Features_selected_backward)

nXt_train_stepWise_backward=sfs_backward.transform(Xt_train)
nXt_test_stepWise_backward=sfs_backward.transform(Xt_test)


KeyboardInterrupt: 

Verificar se o uso de feature selection é positivo, ou não, relativamente ao uso de todos os dados, para o classificador KNeighborsClassifier com utilização da configuração utilizada que deu origem, até aqui, à "melhor" performance obtida.

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#Best scale
X = biodeg_data.values[:, :-1]
y = biodeg_data["Biodegradable"].values

#Xt_train, Xt_test are ready to use, beacause are scaled with the last and best performance scaler PowerTransformer() + cross validation
Xt_train_topX_corr = Xt_train[:, top_X_corr_vars]
Xt_test_topX_corr = Xt_test[:, top_X_corr_vars]

#Correlation & Step Wise
feature_selection_option = [[Xt_train_topX_corr, Xt_test_topX_corr, "KNN with Correlation Feature selection"], 
                            [nXt_train_stepWise_forward, nXt_test_stepWise_forward, "KNN with Forward Stepwise Feature selection"],
                            [nXt_train_stepWise_backward, nXt_test_stepWise_backward, "KNN with Backward Stepwise Feature selection"]]

for option in feature_selection_option:
    print(option[2], "\n")
    #Best config of KNN (n_neighbors=5, weights="uniform")  ------ AQUI METER MELHOR CONFIG (VER COM TABELA EXCEL)
    knn = KNeighborsClassifier(n_neighbors=5, weights="uniform")
    # Fit & predict with the features selected as the current option
    knn.fit(option[0], y_train)
    preds = knn.predict(option[1])
    accuracy = accuracy_score(y_test, preds)

    # Print evaluation metrics for the current option
    present_reg_statistics(y_test, preds)
    print("Accuracy: ", accuracy, "\n")


KNN with Correlation Feature selection 

The Precision is:  0.9773
The Recall is:  0.9923
The F1 score is:  0.9847
The Matthews correlation coefficient is:  0.8909
This is the Confusion Matrix
    0    1
0  57    9
1   3  387
Accuracy:  0.9736842105263158 

KNN with Forward Stepwise Feature selection 

The Precision is:  0.9797
The Recall is:  0.9923
The F1 score is:  0.9860
The Matthews correlation coefficient is:  0.9003
This is the Confusion Matrix
    0    1
0  58    8
1   3  387
Accuracy:  0.9758771929824561 

KNN with Backward Stepwise Feature selection 

The Precision is:  0.9797
The Recall is:  0.9897
The F1 score is:  0.9847
The Matthews correlation coefficient is:  0.8916
This is the Confusion Matrix
    0    1
0  58    8
1   4  386
Accuracy:  0.9736842105263158 



### Outros modelos classificadores e comparação entre todas as features ou as X melhores selecionadas ##
Logistic Regression, Decision Tree e SVC com utilização da técnica de tunning de hiperparâmetros para cada um dos classificadores, individualmente. Utilização, para posterior comparação, de todas as variantes de features, isto é, para todas elas, para features selecionadas via métodos de Correlation e features selecionadas via Stepwise methods.

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Filter out ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to test
logistic_params = {'C': [0.01, 0.1, 1, 10]}

tree_params = {'max_depth': range(1, 11), 'min_samples_split': range(2, 11)}

svm_params = {'C': [1, 10, 100, 1e3],'gamma': [1e-1, 1e-2, 1e-3, 1e-4]}

models = {"Logistic Regression": (LogisticRegression(), logistic_params), 
            "Decision Tree": (DecisionTreeClassifier(random_state=123), tree_params),
            "SVC": (SVC(random_state=123), svm_params)}

X_train_options = [[Xt_train, Xt_test, "All data"],
                    [Xt_train_topX_corr, Xt_test_topX_corr, "Correlation Feature selection"], 
                    [nXt_train_stepWise_forward, nXt_test_stepWise_forward, "Forward Stepwise Feature selection"],
                    [nXt_train_stepWise_backward, nXt_test_stepWise_backward, "Backward Stepwise Feature selection"]]

for X_train_option in X_train_options:
    print("---- Using ", X_train_option[2], "----\n")
    
    # Iterate over the models and their hyperparameters
    for model_name, (model, params) in models.items():
        grid = GridSearchCV(model, params, cv=5)
        print(f"- {model_name}:")
    
        grid.fit(X_train_option[0], y_train)
        # Print the best hyperparameters and the corresponding evaluation metrics
        print("Best Hyperparameters:", grid.best_params_)
        preds = grid.predict(X_train_option[1])
        accuracy = accuracy_score(y_test, preds)
        present_reg_statistics(y_test, preds)
        print("Accuracy:", accuracy, "\n")

---- Using  All data ----

- Logistic Regression:
Best Hyperparameters: {'C': 10}
The Precision is:  0.9529
The Recall is:  0.9846
The F1 score is:  0.9685
The Matthews correlation coefficient is:  0.7649
This is the Confusion Matrix
    0    1
0  47   19
1   6  384
Accuracy: 0.9451754385964912 

- Decision Tree:
Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 10}
The Precision is:  0.9721
The Recall is:  0.9821
The F1 score is:  0.9770
The Matthews correlation coefficient is:  0.8370
This is the Confusion Matrix
    0    1
0  55   11
1   7  383
Accuracy: 0.9605263157894737 

- SVC:
Best Hyperparameters: {'C': 1, 'gamma': 0.1}
The Precision is:  0.9847
The Recall is:  0.9897
The F1 score is:  0.9872
The Matthews correlation coefficient is:  0.9104
This is the Confusion Matrix
    0    1
0  60    6
1   4  386
Accuracy: 0.9780701754385965 

---- Using  Correlation Feature selection ----

- Logistic Regression:
Best Hyperparameters: {'C': 1}
The Precision is:  0.9529
The Reca

KeyboardInterrupt: 