# Projeto de Engenharia do Conhecimento 2023/2024

*Projeto by: Renato Ferreira (58238), Pedro Lopes (58196), Simão Quintas (58190)*

## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef, make_scorer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler

Let's drop the rows and columns with a large number of NA values

In [None]:
# Load the raw dataset; '?' marks a missing value in the CSV.
data = pd.read_csv('proj-data.csv', na_values='?')

# Remove the columns that only flag whether something was measured, the record
# identifier and the referral source.
data = data.drop(
    columns=[
        *data.filter(like='measured').columns,
        '[record identification]',
        'referral source:',
    ]
)

# Diagnosis letters grouped by condition family. Later cells reuse these lists,
# so they stay defined at notebook level.
hyperthyroid_conditions = ['A', 'B', 'C', 'D']
hypothyroid_conditions = ['E', 'F', 'G', 'H']
binding_protein = ['I', 'J']
general_health = ['K']
replacement_therapy = ['L', 'M', 'N']
discordant = ['R']
none = ['-']

# Group position (1-based) becomes the class code; any label that is not in a
# group (including a missing value) falls into class 8.
diagnosis_groups = [
    hyperthyroid_conditions,
    hypothyroid_conditions,
    binding_protein,
    general_health,
    replacement_therapy,
    discordant,
    none,
]
diagnosis_codes = {
    letter: code
    for code, group in enumerate(diagnosis_groups, start=1)
    for letter in group
}

# Vectorised lookup instead of a row-by-row .at loop. The diagnoses must be
# encoded BEFORE the flag replacement below, otherwise the 'F' and 'M'
# diagnosis letters would be rewritten to 0/1.
data["diagnoses"] = data["diagnoses"].map(diagnosis_codes).fillna(8).astype(int)

# Encode the boolean flags (f/t) and the sex column (F/M) as 0/1.
data = data.replace({'f': 0, 't': 1, 'F': 0, 'M': 1})

# Preview only; the full frame is too large to be a useful cell output.
data.head()

Obter os valores da feature matrix tratados e da target variable, removendo as colunas com poucos valores.

In [None]:
# Number of missing entries per column, restricted to the columns that have at
# least one. Computed with a single vectorised pass instead of testing every
# cell of the frame in a Python loop.
missingValues = {
    column: int(count)
    for column, count in data.isna().sum().items()
    if count > 0
}

for column, count in missingValues.items():
    print(column, str(count), "missing values!")

# Feature matrix: every column except the last one.
X = data.iloc[:, :-1]

# Target: the last column, kept as a one-column DataFrame of integers.
y = data.iloc[:, -1:].astype('int')

Vamos avaliar a importância de T3 e de TBG, de forma a avaliar se os removemos ou não.

## Feature Selection

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Flatten the one-column label frames into 1-D arrays, the shape the
# estimators below are fitted with.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# SCALER
# The scaler learns mean/variance from the training split only and the SAME
# transformation is applied to the test split. Re-fitting on the test split
# would leak test statistics and scale the two splits inconsistently.
scaler = StandardScaler()
Xt_train = scaler.fit_transform(X_train)
Xt_test = scaler.transform(X_test)

# FEATURE SELECTION
N, M = Xt_train.shape

# Keep the features whose random-forest importance reaches the 0.02 threshold.
rfr = RandomForestRegressor(random_state=0)
sel = SelectFromModel(estimator=rfr, threshold=0.02)
sel.fit(Xt_train, y_train)

# NOTE(review): the label says "Default" but the value printed is the explicit
# threshold passed above.
print("Default threshold: ", sel.threshold_)

features = sel.get_support()
Features_selected = np.arange(M)[features]

print("The features selected are columns: ", Features_selected)

# Reduced feature matrices containing only the selected columns.
nX_train = sel.transform(Xt_train)
nX_test = sel.transform(Xt_test)

# Matthews correlation coefficient wrapped as a scorer object.
score = make_scorer(matthews_corrcoef)

In [None]:
# Fit a random forest on the scaled training data to rank the features.
rfc = RandomForestClassifier(random_state=123)
rfc.fit(Xt_train, y_train)

# Forest-level importances, plus their spread across the individual trees.
importances = rfc.feature_importances_
per_tree_importances = np.array([tree.feature_importances_ for tree in rfc.estimators_])
std = per_tree_importances.std(axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = Xt_train.shape[1]
bar_positions = range(n_features)

# Textual ranking.
print("Feature Importances:")
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d: Feature %d (%f ± %f)" % (rank, feature_idx, importances[feature_idx], std[feature_idx]))

# Bar chart of the same ranking, with the per-tree spread as error bars.
plt.figure(figsize=(10, 6))
plt.title("Feature Importances with Error Bars")
plt.bar(bar_positions, importances[indices], color="b", yerr=std[indices], align="center")
plt.xticks(bar_positions, X.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.xlabel("Features")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

# Métodos

In [None]:
def present_statistics(y_test, preds):
    """Print precision, accuracy, recall, F1 and MCC of `preds` against `y_test`.

    Precision, recall and F1 use weighted averaging over the classes.
    Nothing is returned; the report is written to standard output.
    """
    # Each metric is computed lazily, right before its line is printed.
    report_lines = (
        ("The Precision is: %7.4f", lambda: precision_score(y_test, preds, average='weighted')),
        ("The Accuracy is: %7.4f", lambda: accuracy_score(y_test, preds)),
        ("The Recall is: %7.4f", lambda: recall_score(y_test, preds, average='weighted')),
        ("The F1 score is: %7.4f", lambda: f1_score(y_test, preds, average='weighted')),
        ("The Matthews correlation coefficient is: %7.4f", lambda: matthews_corrcoef(y_test, preds)),
    )

    print("Statistics:")
    for template, compute in report_lines:
        print(template % compute())
    print("-------------------------------------------------------------")

## Decision Tree

### Testar o modelo Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

# Seeded so that the four variants below are comparable and the numbers are
# reproducible after a kernel restart.
tree_model = DecisionTreeClassifier(random_state=0)

# Replaces every missing value with the constant -1. Later cells reuse this
# imputer under the same name.
imputer = SimpleImputer(strategy='constant', fill_value=-1)

# (title, training features, test features, impute missing values?)
tree_variants = [
    ("Com valores Nan, sem scaler:", X_train, X_test, False),
    ("Sem valores Nan, sem scaler:", X_train, X_test, True),
    ("Com valores Nan, com scaler:", Xt_train, Xt_test, False),
    ("Sem valores Nan, com scaler:", Xt_train, Xt_test, True),
]

for title, train_features, test_features, impute in tree_variants:
    print(title)

    if impute:
        # Fit the imputer on the training split and apply it to both splits.
        X_train_imputed = imputer.fit_transform(train_features)
        X_test_imputed = imputer.transform(test_features)
        train_features, test_features = X_train_imputed, X_test_imputed

    # Train on this variant and report the test-set metrics.
    tree_model.fit(train_features, y_train)
    tree_preds = tree_model.predict(test_features)
    present_statistics(y_test, tree_preds)

Os resultados ao utilizar valores Nan são melhores.

In [None]:
# 1-D views of the train/test labels, in the shape the estimators expect.
y_train_flat, y_test_flat = (np.ravel(labels) for labels in (y_train, y_test))

## KNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=3)

# KNN cannot handle missing values, so both variants are imputed with the
# constant imputer defined earlier; only the scaling differs.
knn_variants = (
    ("Usando um scaler:", Xt_train, Xt_test),
    ("Sem usar scaler:", X_train, X_test),
)

for title, train_features, test_features in knn_variants:
    print(title)

    # Imputer fitted on the training split, then applied to both splits.
    X_train_imputed = imputer.fit_transform(train_features)
    X_test_imputed = imputer.transform(test_features)

    knn_model.fit(X_train_imputed, y_train_flat)

    # Both variants are scored against the same flattened labels.
    knn_preds = knn_model.predict(X_test_imputed)
    present_statistics(y_test_flat, knn_preds)

Os resultados são melhores ao usar um scaler.

## SVC

In [None]:
from sklearn.svm import SVC

svc_model = SVC()

# Same model evaluated on scaled and unscaled features; missing values are
# filled by the shared imputer in both cases.
for suffix, train_features, test_features in (
    ("usando scaler:", Xt_train, Xt_test),
    ("sem scaler:", X_train, X_test),
):
    print(svc_model, suffix)

    # Imputer is fitted on the training split only.
    X_train_imputed = imputer.fit_transform(train_features)
    X_test_imputed = imputer.transform(test_features)

    svc_model.fit(X_train_imputed, y_train_flat)

    svc_preds = svc_model.predict(X_test_imputed)
    present_statistics(y_test_flat, svc_preds)

## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

gaus_model = GaussianNB()

# Unscaled features first, then the scaled ones; missing values are filled by
# the shared imputer in both runs.
for suffix, train_features, test_features in (
    ("sem scaler:", X_train, X_test),
    ("com scaler:", Xt_train, Xt_test),
):
    print(gaus_model, suffix)

    # Imputer is fitted on the training split only.
    X_train_imputed = imputer.fit_transform(train_features)
    X_test_imputed = imputer.transform(test_features)

    gaus_model.fit(X_train_imputed, y_train_flat)

    gaus_preds = gaus_model.predict(X_test_imputed)
    present_statistics(y_test_flat, gaus_preds)

## LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

logr_model = LogisticRegression(max_iter=1000)

# Scaled features first, then the unscaled ones; missing values are filled by
# the shared imputer in both runs.
for suffix, train_features, test_features in (
    ("com scaler:", Xt_train, Xt_test),
    ("sem scaler:", X_train, X_test),
):
    print(logr_model, suffix)

    # Imputer is fitted on the training split only.
    X_train_imputed = imputer.fit_transform(train_features)
    X_test_imputed = imputer.transform(test_features)

    # Train, predict and report.
    logr_model.fit(X_train_imputed, y_train_flat)

    logr_preds = logr_model.predict(X_test_imputed)
    present_statistics(y_test_flat, logr_preds)

Os melhores modelos são o Decision Tree, KNeighbors e LogisticRegression

## Model Tuning

### Decision Tree Classifier

In [None]:
# Hyperparameter grid explored for the decision tree.
param_grid = {
    'max_depth': [None, *range(3, 30)],
    'min_samples_split': [*range(2, 15)],
    'min_samples_leaf': [*range(2, 15)],
    'max_features': [None],
    'criterion': ['gini', 'entropy']
}

# Seeded: an unseeded tree gives a different search result on every run, so
# the reported "best" hyperparameters could not be reproduced.
tree_model = DecisionTreeClassifier(random_state=0)

# The grid is large, so the candidate fits are spread across all CPU cores
# (n_jobs=-1); this does not change the result.
grid_search = GridSearchCV(
    estimator=tree_model,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Best configuration, refitted on the whole training split by GridSearchCV.
best_tree_model = grid_search.best_estimator_

tree_preds = best_tree_model.predict(X_test)

present_statistics(y_test, tree_preds)

# Cross-validate the selected configuration on the training split.
cv_scores = cross_val_score(best_tree_model, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())
print("Standard Deviation of Cross-Validation Score:", cv_scores.std())

### KNeighbors

In [None]:
# Hyperparameter grid explored for k-nearest neighbours.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

knn_model = KNeighborsClassifier()

# Tune on the same metric (macro F1) that the cross-validation below reports
# and that the decision-tree search uses, so the models are selected and
# compared on a single criterion.
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5, scoring='f1_macro')

# KNN needs complete data: fill missing values in the scaled features, with
# the imputer fitted on the training split only.
X_train_imputed = imputer.fit_transform(Xt_train)
X_test_imputed = imputer.transform(Xt_test)

grid_search.fit(X_train_imputed, y_train_flat)

print("Best Parameters:", grid_search.best_params_)

# Best configuration, refitted on the whole training split by GridSearchCV.
best_knn_model = grid_search.best_estimator_

knn_preds = best_knn_model.predict(X_test_imputed)

present_statistics(y_test_flat, knn_preds)

# Cross-validate the selected configuration on the training split.
cv_scores = cross_val_score(best_knn_model, X_train_imputed, y_train_flat, cv=5, scoring='f1_macro')

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())
print("Standard Deviation of Cross-Validation Score:", cv_scores.std())

### Logistic Regression

In [None]:
# Hyperparameter grid explored for logistic regression.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['liblinear']
}

logreg_model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed

# Tune on the same metric (macro F1) that the cross-validation below reports
# and that the decision-tree search uses, so the models are selected and
# compared on a single criterion.
grid_search = GridSearchCV(estimator=logreg_model, param_grid=param_grid, cv=5, scoring='f1_macro')

# Fill missing values in the scaled features, with the imputer fitted on the
# training split only.
X_train_imputed = imputer.fit_transform(Xt_train)
X_test_imputed = imputer.transform(Xt_test)

grid_search.fit(X_train_imputed, y_train_flat)

print("Best Parameters:", grid_search.best_params_)

# Best configuration, refitted on the whole training split by GridSearchCV.
best_logreg_model = grid_search.best_estimator_

logreg_preds = best_logreg_model.predict(X_test_imputed)

present_statistics(y_test_flat, logreg_preds)

# Cross-validate the selected configuration on the training split.
cv_scores = cross_val_score(best_logreg_model, X_train_imputed, y_train_flat, cv=5, scoring='f1_macro')

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())
print("Standard Deviation of Cross-Validation Score:", cv_scores.std())

Escolhemos o Decision Tree com os parâmetros:
'criterion': 'gini', 
'max_depth': 17, 
'max_features': None, 
'min_samples_leaf': 3, 
'min_samples_split': 10, por ter sido aquele com melhor cross-validation score e com menor standard deviation

# O2

## Idade

In [None]:
from sklearn.metrics import r2_score
# OBJECTIVE 2: AGE

data_age = pd.read_csv('proj-data.csv', na_values='?')

# Drop the unused columns, then the rows with no age (the regression target).
# The index is reset so that row labels are contiguous again after the drop.
data_age = (
    data_age
    .drop(
        columns=[
            *data_age.filter(like='measured').columns,
            '[record identification]',
            'referral source:',
        ]
    )
    .dropna(subset=['age:'])
    .reset_index(drop=True)
)

# Encode the diagnosis letters as group codes 1-7 using the group lists
# defined in the first data cell; any other label (or a missing one) becomes 8.
# A label-based lookup does not depend on the row index, unlike iterating
# range(len(...)) with .at, which breaks once rows have been dropped.
diagnosis_groups = [
    hyperthyroid_conditions,
    hypothyroid_conditions,
    binding_protein,
    general_health,
    replacement_therapy,
    discordant,
    none,
]
diagnosis_codes = {
    letter: code
    for code, group in enumerate(diagnosis_groups, start=1)
    for letter in group
}
data_age["diagnoses"] = data_age["diagnoses"].map(diagnosis_codes).fillna(8).astype(int)

# Encode the boolean flags (f/t) and the sex column (F/M) as 0/1. This must
# run after the diagnosis encoding, which also uses the letters 'F' and 'M'.
data_age = data_age.replace({'f': 0, 't': 1, 'F': 0, 'M': 1})

# The first column is the target (age); everything else is a feature.
X_age = data_age.iloc[:, 1:]
y_age = data_age.iloc[:, :1]

X_train, X_test, y_train, y_test = train_test_split(X_age, y_age, test_size=0.25, random_state=0)

# Scale with statistics learned from the training split only, and apply the
# same transformation to the test split.
scaler = StandardScaler()
Xt_train = scaler.fit_transform(X_train)
Xt_test = scaler.transform(X_test)

### SVR

In [None]:
from sklearn.svm import SVR

# Flatten the one-column targets first, so the regressor is fitted on a 1-D
# array rather than on a column-vector DataFrame.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

model = SVR()

print("02AGE SVR sem valores Nan, com scaler:")

# Fill missing values in the scaled features, with the imputer fitted on the
# training split only.
X_train_imputed = imputer.fit_transform(Xt_train)
X_test_imputed = imputer.transform(Xt_test)

model.fit(X_train_imputed, y_train_flat)

preds = model.predict(X_test_imputed)
print("R2 Score:", r2_score(y_test_flat, preds))

## Best Features

In [None]:
# Fit the random-forest regressor created in the feature-selection cell on
# the imputed age features. The target is flattened to 1-D, the shape the
# regressor expects (a column vector triggers a conversion warning).
rfr.fit(X_train_imputed, np.ravel(y_train))

# Forest-level importances, plus their spread across the individual trees.
importances = rfr.feature_importances_
std = np.std([t.feature_importances_ for t in rfr.estimators_], axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = X_train_imputed.shape[1]

# Textual ranking.
print("Feature Importances:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d: Feature %d (%f ± %f)" % (rank, feature_idx, importances[feature_idx], std[feature_idx]))

# Bar chart of the same ranking, with the per-tree spread as error bars.
plt.figure(figsize=(10, 6))
plt.bar(range(n_features), importances[indices], color="b", yerr=std[indices], align="center")
plt.xticks(range(n_features), X_age.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.xlabel("Features")
plt.ylabel("Importance")
plt.title("Feature Importances with Error Bars")
plt.tight_layout()
plt.show()

## Sexo

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# OBJECTIVE 2: SEX

data_sex = pd.read_csv('proj-data.csv', na_values='?')

# Drop the unused columns, then the rows with no sex (the classification
# target), and reset the index so row labels are contiguous again.
data_sex = (
    data_sex
    .drop(
        columns=[
            *data_sex.filter(like='measured').columns,
            '[record identification]',
            'referral source:',
        ]
    )
    .dropna(subset=['sex:'])
    .reset_index(drop=True)
)

# Encode the diagnosis letters as group codes 1-7 using the group lists
# defined in the first data cell; any other label (or a missing one) becomes 8.
diagnosis_groups = [
    hyperthyroid_conditions,
    hypothyroid_conditions,
    binding_protein,
    general_health,
    replacement_therapy,
    discordant,
    none,
]
diagnosis_codes = {
    letter: code
    for code, group in enumerate(diagnosis_groups, start=1)
    for letter in group
}
data_sex["diagnoses"] = data_sex["diagnoses"].map(diagnosis_codes).fillna(8).astype(int)

# Encode the boolean flags (f/t) and the sex column (F/M) as 0/1. This must
# run after the diagnosis encoding, which also uses the letters 'F' and 'M'.
data_sex = data_sex.replace({'f': 0, 't': 1, 'F': 0, 'M': 1})

# Features: everything except the target. drop() returns a new frame, so
# data_sex keeps its 'sex:' column for the target extraction below (an
# in-place drop on an iloc slice depends on pandas copy/view semantics).
X_sex = data_sex.drop(columns='sex:')

y_sex = data_sex[['sex:']]  # Target kept as a one-column DataFrame

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sex, y_sex, test_size=0.25, random_state=0)

# 1-D label arrays, the shape the classifiers are fitted with
y_train_flat = y_train.to_numpy().ravel()
y_test_flat = y_test.to_numpy().ravel()

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fresh preprocessing objects for this experiment: standardisation followed
# by mean imputation. Note that `imputer` now refers to the mean imputer.
scaler = StandardScaler()
imputer = SimpleImputer(strategy='mean')

print("02SEX KNN tree usando um scaler:")

# Training split: both transformers learn their statistics here.
X_train_scaled = scaler.fit_transform(X_train)
X_train_imputed = imputer.fit_transform(X_train_scaled)

# Test split: the already-fitted transformers are only applied.
X_test_scaled = scaler.transform(X_test)
X_test_imputed = imputer.transform(X_test_scaled)

# Train a 7-nearest-neighbours classifier and report its test-set metrics.
knn_model = KNeighborsClassifier(n_neighbors=7).fit(X_train_imputed, y_train_flat)
knn_preds = knn_model.predict(X_test_imputed)
present_statistics(y_test_flat, knn_preds)

## Best Features

In [None]:
# Fit a random forest on the imputed training data to rank the features.
rfc = RandomForestClassifier(random_state=123)
rfc.fit(X_train_imputed, y_train_flat)

# Forest-level importances, plus their spread across the individual trees.
importances = rfc.feature_importances_
per_tree_importances = np.array([tree.feature_importances_ for tree in rfc.estimators_])
std = per_tree_importances.std(axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = X_train_imputed.shape[1]
bar_positions = range(n_features)

# Textual ranking.
print("Feature Importances:")
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d: Feature %d (%f ± %f)" % (rank, feature_idx, importances[feature_idx], std[feature_idx]))

# Bar chart of the same ranking, with the per-tree spread as error bars.
plt.figure(figsize=(10, 6))
plt.title("Feature Importances with Error Bars")
plt.bar(bar_positions, importances[indices], color="b", yerr=std[indices], align="center")
plt.xticks(bar_positions, X_sex.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.xlabel("Features")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

# Célula final

In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef, make_scorer


def present_statistics(y_test, preds):
    """Print precision, accuracy, recall, F1 and MCC of `preds` against `y_test`.

    Precision, recall and F1 use weighted averaging over the classes.
    Nothing is returned; the report is written to standard output.
    """
    # Each metric is computed lazily, right before its line is printed.
    report_lines = (
        ("The Precision is: %7.4f", lambda: precision_score(y_test, preds, average='weighted')),
        ("The Accuracy is: %7.4f", lambda: accuracy_score(y_test, preds)),
        ("The Recall is: %7.4f", lambda: recall_score(y_test, preds, average='weighted')),
        ("The F1 score is: %7.4f", lambda: f1_score(y_test, preds, average='weighted')),
        ("The Matthews correlation coefficient is: %7.4f", lambda: matthews_corrcoef(y_test, preds)),
    )

    print("Statistics:")
    for template, compute in report_lines:
        print(template % compute())
    print("-------------------------------------------------------------")

# Training set plus the held-out test features and labels ('?' marks a
# missing value in the CSV files).
data = pd.read_csv('proj-data.csv', na_values='?')
testData = pd.read_csv('proj-test-data.csv', na_values='?')
testClass = pd.read_csv('proj-test-class.csv', na_values='?')

# Diagnosis letters grouped by condition family.
hyperthyroid_conditions = ['A', 'B', 'C', 'D']
hypothyroid_conditions = ['E', 'F', 'G', 'H']
binding_protein = ['I', 'J']
general_health = ['K']
replacement_therapy = ['L', 'M', 'N']
discordant = ['R']
none = ['-']

# Group position (1-based) becomes the class code.
diagnosis_groups = [
    hyperthyroid_conditions,
    hypothyroid_conditions,
    binding_protein,
    general_health,
    replacement_therapy,
    discordant,
    none,
]
diagnosis_codes = {
    letter: code
    for code, group in enumerate(diagnosis_groups, start=1)
    for letter in group
}


def encode_diagnoses(labels):
    """Map raw diagnosis labels to group codes 1-7; any other label (or NaN) becomes 8."""
    return labels.map(diagnosis_codes).fillna(8).astype(int)


def prepare_features(frame):
    """Drop the columns the model does not use and encode the f/t and F/M values as 0/1."""
    unused_columns = [
        *frame.filter(like='measured').columns,
        '[record identification]',
        'referral source:',
    ]
    return frame.drop(columns=unused_columns).replace({'f': 0, 't': 1, 'F': 0, 'M': 1})


# The diagnoses are encoded BEFORE the flag replacement in prepare_features,
# otherwise the 'F' and 'M' diagnosis letters would be rewritten to 0/1.
data["diagnoses"] = encode_diagnoses(data["diagnoses"])
testClass["diagnoses"] = encode_diagnoses(testClass["diagnoses"])

data = prepare_features(data)
testData = prepare_features(testData)

# Features are every column except the last one; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1:].astype('int')

# NOTE(review): these hyperparameters differ from the ones reported in the
# "Model Tuning" conclusion (gini / max_depth 17 / min_samples_leaf 3 /
# min_samples_split 10) - confirm which set is the intended final choice.
# random_state is fixed so the trained tree, and therefore the predictions
# below, are reproducible.
tree_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=27,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=6,
    random_state=0,
)
tree_model.fit(X, y.to_numpy().ravel())

# Make predictions on the held-out test set and compare with the true labels.
tree_preds = tree_model.predict(testData)
testClass_flat = testClass['diagnoses'].astype(int).values.flatten()
print("Valores do teste:")
print(testClass_flat)
print("Valores da previsão:")
print(tree_preds)
present_statistics(testClass_flat, tree_preds)


Valores do teste:
[7 7 7 7 4 7 7 7 7 7 1 7 7 7 7 7 7 2 7 2]
Valores da previsão:
[7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 2]
Statistics:
The Precision is:  0.7737
The Accuracy is:  0.8500
The Recall is:  0.8500
The F1 score is:  0.7981
The Matthews correlation coefficient is:  0.4695
-------------------------------------------------------------


  data.replace('f', 0, inplace=True)
  data.replace('t', 1, inplace=True)
  data.replace('M', 1, inplace=True)
  testData.replace('f', 0, inplace=True)
  testData.replace('t', 1, inplace=True)
  testData.replace('M', 1, inplace=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
