# Projeto de Engenharia do Conhecimento 2023/2024

*Projeto by: Renato Ferreira (58238), Pedro Lopes(58196), Simão Quintas (58190)*

### Index

1. Feature selection
    1. Using correlation
    2. Using stepwise methods
    3. Random Forests for Feature Selection
2. Principal Components analysis
    1. Linear PCA
    2. Kernel PCA
3. Model Tuning


## 1. Feature selection

In [44]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score, matthews_corrcoef, confusion_matrix, make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

Start the imputer and get the data

In [94]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5, weights="uniform")

data = pd.read_csv('proj-data.csv', na_values='?')

Let's drop the rows and columns with a large number of NA values and get dummies for the columns with strings

In [95]:
# Remover as colunas que indicam se algo foi medido ou não, a coluna com a indentificação e colunas com muitos valores ausentes
data.drop(data.filter(like='measured').columns, axis=1, inplace=True)
data.drop('[record identification]', axis=1, inplace=True)

hyperthyroid_conditions = ['A', 'B', 'C', 'D']
hypothyroid_conditions = ['E', 'F', 'G', 'H']
binding_protein = ['I', 'J']
general_health = ['K']
replacement_therapy = ['L', 'M', 'N']
discordant = ['R']
none = ['-']

for i in range(len(data)):
    if data.at[i, "diagnoses"] in hyperthyroid_conditions :
        data.at[i, "diagnoses"] = 1
    elif data.at[i, "diagnoses"] in hypothyroid_conditions :
        data.at[i, "diagnoses"] = 2
    elif data.at[i, "diagnoses"] in binding_protein :
        data.at[i, "diagnoses"] = 3
    elif data.at[i, "diagnoses"] in general_health :
        data.at[i, "diagnoses"] = 4
    elif data.at[i, "diagnoses"] in replacement_therapy :
        data.at[i, "diagnoses"] = 5
    elif data.at[i, "diagnoses"] in discordant :
        data.at[i, "diagnoses"] = 6
    elif data.at[i, "diagnoses"] in none :
        data.at[i, "diagnoses"] = 7 
    else:
        data.at[i, "diagnoses"] = 8 

data.replace('f', 0, inplace=True)
data.replace('t', 1, inplace=True)

data

Unnamed: 0,age:,sex:,on thyroxine:,query on thyroxine:,on antithyroid medication:,sick:,pregnant:,thyroid surgery:,I131 treatment:,query hypothyroid:,...,hypopituitary:,psych:,TSH:,T3:,TT4:,T4U:,FTI:,TBG:,referral source:,diagnoses
0,29,F,0,0,0,0,0,0,0,1,...,0,0,0.3,,,,,,other,7
1,29,F,0,0,0,0,0,0,0,0,...,0,0,1.6,1.9,128.0,,,,other,7
2,36,F,0,0,0,0,0,0,0,0,...,0,0,,,,,,26.0,other,7
3,60,F,0,0,0,0,0,0,0,0,...,0,0,,,,,,26.0,other,7
4,77,F,0,0,0,0,0,0,0,0,...,0,0,,,,,,21.0,other,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7333,56,M,0,0,0,0,0,0,0,0,...,0,0,,,64.0,0.83,77.0,,SVI,7
7334,22,M,0,0,0,0,0,0,0,0,...,0,0,,,91.0,0.92,99.0,,SVI,7
7335,69,M,0,0,0,0,0,0,0,0,...,0,0,,,113.0,1.27,89.0,,SVI,3
7336,47,F,0,0,0,0,0,0,0,0,...,0,0,,,75.0,0.85,88.0,,other,7


Obter os valores da feature matrix tratados e da target variable

In [101]:

X = data.iloc[:,:-1]

# Remover linhas com poucos valores medidos
X.dropna(axis=1, thresh=5500, inplace=True) # 5500 porque é ~75% do número total
X.drop('sex:',axis=1,inplace=True)
X.drop('referral source:',axis=1,inplace=True)
columns_to_check = ['TSH:', 'T3:', 'TT4:', 'T4U:', 'FTI:', 'TBG:']
existing_columns = [col for col in columns_to_check if col in data.columns]

if existing_columns:
    data.dropna(thresh=3, subset=existing_columns, inplace=True)

y = data.iloc[: , -1:]
y = y.astype('int')

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=0)

#SCALER
scaler = StandardScaler()
scaler.fit(X_train)
Xt_train=scaler.fit_transform(X_train)
Xt_test=scaler.fit_transform(X_test)

#IMPUTER
imputer = KNNImputer(weights="uniform")
imputer.fit(X_train)
Xt_train = imputer.transform(X_train)
Xt_test = imputer.transform(X_test)

#FEATURE SELECTION
N,M = Xt_train.shape


rfr=RandomForestRegressor(random_state=0)
sel = SelectFromModel(estimator=rfr,threshold=0.015)
y_train = y_train.squeeze().ravel()
y_test = y_test.squeeze().ravel()
sel.fit(Xt_train, y_train)

print("Default threshold: ", sel.threshold_)

features=sel.get_support()
Features_selected =np.arange(M)[features]

print("The features selected are columns: ", Features_selected)

nX_train=sel.transform(Xt_train)
nX_test=sel.transform(Xt_test)

score = make_scorer(matthews_corrcoef)

def  present_statistics(TRUTH_nfold, PREDS_nfold):
    print("These are the training set statistics:")
    print("The Accuracy is: %7.4f" % accuracy_score(TRUTH_nfold, PREDS_nfold))
    print("The Precision is: %7.4f" % precision_score(TRUTH_nfold, PREDS_nfold, average='weighted'))
    print("The Recall is: %7.4f" % recall_score(TRUTH_nfold, PREDS_nfold, average='weighted'))
    print("The F1 score is: %7.4f" % f1_score(TRUTH_nfold, PREDS_nfold, average='weighted'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(TRUTH_nfold, PREDS_nfold))
    print(pd.DataFrame(confusion_matrix(TRUTH_nfold, PREDS_nfold)))


Default threshold:  0.015
The features selected are columns:  [ 0  1 15 16 17 18]


## 3. Model Tuning

For this example we are going to use Support Vector Classifiers, but any model learned so far can be used

We are going to use first [Scikit-Learn's GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html), an implementation of extensive parameter search. In its basic form it just requires:
* a bare bones model constructor 
* a dictionary containing the parameters to search for. The keys of the dictionary should correspond to the parameter to test and the values to a list of possible values to test
* a scoring function defining what is the criterion to select and rank the best models
* GridSearchCV uses by default 5-Fold Cross validation, but other validation criteria can be used

The result of GridSearchCV is a structure that contains the fitted models that can then be used for learning and application

Tet's try it with the C and gamma values for support vector classification

In [None]:
from time import time
#from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import scipy.stats as stats

#make the dictionary with the testing parameters
#gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
#Cs = [1, 10, 100, 1e3, 1e4, 1e5]
#param_grid = {'gamma': gammas, 'C': Cs}
depths = [3, 5, 10, 15]
m_sampl_split = [2, 5, 9]
prune_a = [0.0, 0.0001, 0.001, 0.01]
param_grid = {'max_depth': depths, 'min_samples_split': m_sampl_split, 'ccp_alpha': prune_a}

#define the model and do the grid search
#clf = SVC() # RBF (Gaussian) by default
clf = DecisionTreeClassifier(criterion='log_loss', random_state=23)
gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring="f1")

start = time()
gs=gs.fit(X_train, y_train)
print(
    'GridSearchCV took %.2f seconds for %d candidate parameter settings.'
    % ((time() - start), len(gs.cv_results_['params']))
)

Let's identify the best element parameters [best according to the scoring function, in this case it is the F1 score]

In [None]:
#print('best gamma: %7.4f' % gs.best_estimator_.gamma)
#print('best C: %3.2f' %  gs.best_estimator_.C)
print('best maximum depth: %2.0f' % gs.best_estimator_.max_depth)
print('best minimum samples to split a node: %2.0f' %  gs.best_estimator_.min_samples_split)
print('best minimal cost pruning parameter: %1.4f' % gs.best_estimator_.ccp_alpha)

Just for sake of completion, we can use the best estimator model (the one with the optimized parameters) for prediction on the test set.

In [None]:
preds=gs.best_estimator_.predict(X_test)
print('F1 : %7.4f' % f1_score(y_test, preds))
print('number of leaves:', gs.best_estimator_.get_n_leaves())

GridSearchCV gives you a number of statistics on the tests it runs:

In [None]:
for i in gs.cv_results_.keys(): print(i)

We can print the results in a nice Pandas Data Frame

In [None]:
grid_res = pd.DataFrame(gs.cv_results_)
grid_res.sort_values(by=['rank_test_score'], ascending=True, inplace=True) #sort the tested models by score
grid_res[['params', 'rank_test_score', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'std_fit_time']] #show only mean and std of the test score

we can check if the 2nd best model produces different results 

In [None]:
print('max_depth:', grid_res.loc[1, 'param_max_depth'],
      'min_samples_split:', grid_res.loc[1, 'param_min_samples_split'],
      'ccp_alpha:', '{:.2e}'.format(grid_res.loc[1, 'param_ccp_alpha']))
clf = DecisionTreeClassifier(criterion='log_loss', random_state=23,
                             max_depth=grid_res.loc[1, 'param_max_depth'],
                             min_samples_split=grid_res.loc[1, 'param_min_samples_split'],
                             ccp_alpha=grid_res.loc[1, 'param_ccp_alpha'])
clf.fit(X_train, y_train)
preds=clf.predict(X_test)
print('F1 : %7.4f' % f1_score(y_test, preds))
print('number of leaves:', clf.get_n_leaves())

Let's try now the RandomizedSearchCV and compare to the previous one.

In [None]:
# configure randomized search (by default also 5-fold CV)
# notice the loguniform distributions

param_dist = {
#    'C': stats.loguniform(1, 1e5),
#    'gamma': stats.loguniform(1e-7, 1e-1),
    'max_depth': stats.randint(3, 16),
    'min_samples_split': stats.randint(2, 10),
    'ccp_alpha': stats.loguniform(1e-5, 0.01)
}

n_iter_search = 15
rs = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=n_iter_search
)

start = time()
rs = rs.fit(X_train, y_train)
print(
    'RandomizedSearchCV took %.2f seconds for %d candidates parameter settings'
    % ((time() - start), n_iter_search)
)

In [None]:
print('best maximum depth: %2.0f' % rs.best_estimator_.max_depth)
print('best minimum samples to split a node: %2.0f' %  rs.best_estimator_.min_samples_split)
print('best minimal cost pruning parameter: %1.4f' % rs.best_estimator_.ccp_alpha)

Now we can use the best estimator model (the one with the optimized parameters) for prediction

In [None]:
rs1 = rs.best_estimator_
rs1.fit(X_train, y_train)
preds=rs1.predict(X_test)
print('F1 : %7.4f' % f1_score(y_test, preds))
print('number of leaves:', rs1.get_n_leaves())

In [None]:
rand_res = pd.DataFrame(rs.cv_results_)
rand_res.sort_values(by=['rank_test_score'], ascending= True, inplace=True) #sort the tested models by score
rand_res[['params', 'rank_test_score', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'std_fit_time']] #show only mean and std of the test score

checking the 2nd best model 

In [None]:
print('max_depth:', rand_res['param_max_depth'].iat[1],
      ', min_samples_split:', rand_res['param_min_samples_split'].iat[1],
      ', ccp_alpha:', '{:.2e}'.format(rand_res['param_ccp_alpha'].iat[1]))
clf = DecisionTreeClassifier(criterion='log_loss', random_state=23,
                             max_depth=rand_res['param_max_depth'].iat[1],
                             min_samples_split=rand_res['param_min_samples_split'].iat[1],
                             ccp_alpha=rand_res['param_ccp_alpha'].iat[1])
clf.fit(X_train, y_train)
preds=clf.predict(X_test)
print('F1 : %7.4f' % f1_score(y_test, preds))
print('number of leaves:', clf.get_n_leaves())