In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import f_oneway
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, make_scorer
import re
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [7]:
data=pd.read_excel('dades_27_08_2024.xlsx')

In [8]:
print(data.isna().sum())
print(f'{(data.isna().any(axis=1).sum()/data.shape[0])*100:.2f}% of rows have at least one Na value')

Referència                      0
Cost                            0
Exitus                          0
Especialitat d'acte mèdic       0
Risc                            0
Àrea assistencial               0
Tipus de praxi                128
Centre docent                   0
Tipologia d'acte mèdic        252
Àmbit                          55
Consentiment informat           0
País de naixement             491
Província de naixement        746
País llicenciatura             89
Especialitat                  529
Centre                         51
Codi nivell                  2423
Codi diagnòstic               666
Codi procediment mèdic        819
Seqüeles                        0
Reclamants                      0
Pacients                        0
Edat pacient                 2340
Reclamants_Exitus               0
Category                        0
dtype: int64
97.13% of rows have at least one Na value


In [None]:
#Mapping yes and no to 1 and 0 in binary features
data['Risc'] = data['Risc'].map({'No': 0, 'Sí': 1})
data['Centre docent'] = data['Centre docent'].map({'No': 0, 'Sí': 1})
data['Consentiment informat'] = data['Consentiment informat'].map({'No': 0, 'Sí': 1})

#Dropping Cost feature to avoid information leakage
data.drop(columns=['Cost'], inplace=True)

In [12]:
# Check correlation and p-value of each numeric column to 'Category'
from scipy.stats import pearsonr

if 'Category' in data.columns:
    numeric_cols = data.select_dtypes(include='number').columns
    results = []
    for col in numeric_cols:
        if col != 'Category':
            valid = data[[col, 'Category']].dropna()
            if len(valid) > 1:
                corr, pval = pearsonr(valid[col], valid['Category'])
                results.append((col, corr, pval))
    print(f"{'Column':<20} {'Correlation':>12} {'P-value':>12}")
    for col, corr, pval in results:
        print(f"{col:<20} {corr:>12.4f} {pval:>12.4g}")
else:
    print("'Category' column not found in the dataset.")


Column                Correlation      P-value
Exitus                     0.0065       0.7083
Especialitat d'acte mèdic      -0.0066       0.7032
Risc                       0.4149   6.435e-138
Àrea assistencial          0.0335      0.05362
Centre docent              0.0089       0.6094
Consentiment informat       0.0348      0.04528
Centre                    -0.0566     0.001217
Seqüeles                   0.0138       0.4279
Reclamants                 0.2101    2.495e-34
Pacients                   0.0590    0.0006818
Edat pacient               0.0773      0.01611
Reclamants_Exitus          0.0533     0.002179


In [13]:
# Check correlation between categorical columns and 'Category' using ANOVA F-test

cat_cols = data.select_dtypes(include=['object', 'category']).columns
anova_results = []
for col in cat_cols:
    if col != 'Category':
        groups = [group['Category'].dropna() for name, group in data.groupby(col)]
        if len(groups) > 1:
            f_stat, p_val = f_oneway(*groups)
            anova_results.append((col, f_stat, p_val))

print(f"{'Column':<30} {'F-statistic':>12} {'P-value':>12}")
for col, f_stat, p_val in anova_results:
    print(f"{col:<30} {f_stat:>12.4f} {p_val:>12.4g}")

  res = hypotest_fun_out(*samples, **kwds)


Column                          F-statistic      P-value
Referència                              inf            0
Tipus de praxi                      10.2238     3.75e-05
Tipologia d'acte mèdic               1.6406      0.02165
Àmbit                                3.0332       0.0483
País de naixement                    0.7097       0.9251
Província de naixement               0.9382       0.6288
País llicenciatura                   0.7962       0.7513
Especialitat                         3.2117    4.113e-11
Codi nivell                          0.5699       0.7232
Codi diagnòstic                      1.2417    4.348e-05
Codi procediment mèdic               1.1716      0.00564


In [14]:
#What to do with columns with NAs:
#Drop columns with NAs that have low statistical correlation with cost (Ambit, Pais de naixement, Província de naixement, Paìs de llicentiatura, Codi nivell, Edad pacient)
cols_to_drop = ['Àmbit', 'País de naixement', 'Província de naixement', 'País llicenciatura', 'Codi nivell', 'Edat pacient','Referència']
data.drop(columns=cols_to_drop, inplace=True, errors='ignore')

In [15]:
#Check Nas again
print(data.isna().sum())
print(f'{(data.isna().any(axis=1).sum()/data.shape[0])*100:.2f}% of rows have at least one Na value')

#Dropping all rows with NAs
data.dropna(inplace=True)
print(f'After dropping NAs, {data.shape[0]} rows remain in the dataset.')

Exitus                         0
Especialitat d'acte mèdic      0
Risc                           0
Àrea assistencial              0
Tipus de praxi               128
Centre docent                  0
Tipologia d'acte mèdic       252
Consentiment informat          0
Especialitat                 529
Centre                        51
Codi diagnòstic              666
Codi procediment mèdic       819
Seqüeles                       0
Reclamants                     0
Pacients                       0
Reclamants_Exitus              0
Category                       0
dtype: int64
36.62% of rows have at least one Na value
After dropping NAs, 2098 rows remain in the dataset.


In [16]:
#Check for duplicate rows
duplicates = data.duplicated().sum()
print(f'Number of duplicate rows: {duplicates}')

# Display duplicate rows
duplicates_df = data[data.duplicated(keep=False)]
display(duplicates_df)

# Drop duplicate rows from the dataset
data.drop_duplicates(inplace=True)
print('Duplicate rows have been removed.')

#Removing - within the Codi procediment mèdic column
data = data[~data['Codi procediment mèdic'].astype(str).str.contains('-')]
data = data.reset_index(drop=True)

Number of duplicate rows: 11


Unnamed: 0,Exitus,Especialitat d'acte mèdic,Risc,Àrea assistencial,Tipus de praxi,Centre docent,Tipologia d'acte mèdic,Consentiment informat,Especialitat,Centre,Codi diagnòstic,Codi procediment mèdic,Seqüeles,Reclamants,Pacients,Reclamants_Exitus,Category
207,0,13,1,35,AMB CIRURGIA,0,PROGRAMAT,0,"CIRURGIA PLÀSTICA, ESTÈTICA I REPARADORA",11977.0,H02,8.3,86,0,1,0,1
424,0,13,0,35,AMB CIRURGIA,0,PROGRAMAT,0,"CIRURGIA PLÀSTICA, ESTÈTICA I REPARADORA",11977.0,E881,86.83,86,1,1,0,0
483,0,13,0,35,AMB CIRURGIA,0,PROGRAMAT,0,"CIRURGIA PLÀSTICA, ESTÈTICA I REPARADORA",11977.0,N642,85.54,86,0,1,0,0
951,0,13,1,35,AMB CIRURGIA,0,PROGRAMAT,0,"CIRURGIA PLÀSTICA, ESTÈTICA I REPARADORA",11977.0,H02,8.3,86,0,1,0,1
1006,0,13,0,35,AMB CIRURGIA,0,PROGRAMAT,1,"CIRURGIA PLÀSTICA, ESTÈTICA I REPARADORA",12065.0,Z411,86.83,86,1,1,0,0
1181,0,13,0,35,AMB CIRURGIA,0,PROGRAMAT,0,"CIRURGIA PLÀSTICA, ESTÈTICA I REPARADORA",11977.0,N642,85.54,86,0,1,0,0
1812,0,13,0,35,AMB CIRURGIA,0,PROGRAMAT,0,"CIRURGIA PLÀSTICA, ESTÈTICA I REPARADORA",11977.0,E881,86.83,86,1,1,0,0
2158,0,5,1,3,AMB CIRURGIA,0,PROGRAMAT,0,APARELL DIGESTIU,11977.0,Y640,99.29,153,5,5,0,2
2159,0,5,1,3,AMB CIRURGIA,0,PROGRAMAT,0,APARELL DIGESTIU,11977.0,Y640,99.29,153,5,5,0,2
2160,0,5,1,3,AMB CIRURGIA,0,PROGRAMAT,0,APARELL DIGESTIU,11977.0,Y640,99.29,153,5,5,0,2


Duplicate rows have been removed.


In [17]:
#Removing outliers cost from the dataset
#data = data[~data['Cost'].isin(outliers['Cost'])]
#Resetting index after removing outliers
#data = data.reset_index(drop=True)

#Startinng the pre-processing for the model
X=data.drop(columns=['Category'])
print(X.shape)
y=data['Category']

X['Codi procediment mèdic']=X['Codi procediment mèdic'].astype('float').astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X.columns)

#Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
#categorical_cols.remove('Codi diagnòstic')  # Excluding 'Codi diagnòstic' from categorical columns to encode it through ordinal encoding
numerical_cols = X_train.select_dtypes(include=['number']).columns.tolist()



(2073, 16)
Index(['Exitus', 'Especialitat d'acte mèdic', 'Risc', 'Àrea assistencial',
       'Tipus de praxi', 'Centre docent', 'Tipologia d'acte mèdic',
       'Consentiment informat', 'Especialitat', 'Centre', 'Codi diagnòstic',
       'Codi procediment mèdic', 'Seqüeles', 'Reclamants', 'Pacients',
       'Reclamants_Exitus'],
      dtype='object')


In [20]:
#Order of diagnostic codes through function
#def sort_diagnostic_codes(codes):
#    import re
#    def code_key(code):
#        match = re.match(r"([A-Za-z]+)([0-9]+)", code)
#        if match:
#            letter, number = match.groups()
#            return (letter, int(number))
#        else:
#            return (code, 0)
#    return sorted(codes, key=code_key)

#icd10_codes = sort_diagnostic_codes(X['Codi diagnòstic'].unique().tolist())

#Creating the encoding for categorical columns
#ordinal_pipe=Pipeline(steps=[
#    ('ordinal', OrdinalEncoder(categories=[icd10_codes]))
#])

ohe_pipe=Pipeline(steps=[
    ('onehot', OneHotEncoder(categories='auto', handle_unknown='ignore'))
])

numerical_pipe=Pipeline(steps=[
    ('scale', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', ohe_pipe, categorical_cols),
#        ('ordinal', ordinal_pipe, ['Codi diagnòstic']),
        ('numerical', numerical_pipe, numerical_cols)
    ],
    remainder='passthrough', verbose_feature_names_out=False
)

pipe=Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model',DecisionTreeClassifier())
])

In [25]:
#Defining the models and hyperparameters for GridSearchCV
param_grid=[
    {'model':[DecisionTreeClassifier()],
     'model__max_depth': [5, 10, 20],
     'model__min_samples_split': [2, 5, 10],
     'model__min_samples_leaf': [1, 2, 4]},

    {'model':[RandomForestClassifier()],
     'model__n_estimators': [100, 200, 500],
     'model__max_depth': [10, 20, 30],
     'model__min_samples_split': [2, 5, 10],
     'model__min_samples_leaf': [1, 2, 4]},

    {'model':[XGBClassifier()],
     'model__n_estimators': [100, 200],
     'model__max_depth': [3, 6],
     'model__learning_rate': [0.05, 0.1]},

    {'model':[LGBMClassifier()],
     'model__n_estimators': [100, 200],
     'model__num_leaves': [31, 50],
     'model__learning_rate': [0.05, 0.1]},

    {'model':[SVC()],
     'model__C': [0.1, 1, 10],
     'model__kernel': ['linear', 'rbf']}
]

#Defining the scoreing metrics to evaluate the models
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score, average='weighted'),
    'precision': make_scorer(precision_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

grid = GridSearchCV(pipe, param_grid, scoring=scoring, refit='accuracy', cv=5)

In [None]:
#Fitting the models
grid.fit(X_train, y_train)

# Save best result
best_model = grid.best_estimator_
best_score = -grid.best_score_ 
print(f'Best model: {best_model}')
print(f'Best score: {best_score:.2f}')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001755 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002932 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000960 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000562 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001949 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 38
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 37
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.339775
[LightGBM] [Info] Start training from score -3.822626




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 36
[LightGBM] [Info] Start training from score -0.328211
[LightGBM] [Info] Start training from score -2.199490
[LightGBM] [Info] Start training from score -2.192710
[LightGBM] [Info] Start training from score -3.318721
[LightGBM] [Info] Start training from score -3.857718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000601 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 35
[LightGBM] [Info] Start training from score -0.328965
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1327, number of used features: 37
[LightGBM] [Info] Start training from score -0.327918
[LightGBM] [Info] Start training from score -2.200243
[LightGBM] [Info] Start training from score -2.193464
[LightGBM] [Info] Start training from score -3.340528
[LightGBM] [Info] Start training from score -3.823380


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Best model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('categorical',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Tipus de praxi',
                                                   "Tipologia d'acte mèdic",
                                                   'Especialitat',
                                                   'Codi diagnòstic']),
                                                 ('numerical',
                                                  Pipeline(steps=[('scale',
                                                                   StandardScaler())]),
                                                  ['Exitus',
                                                   "Especialitat d'act

In [27]:

print(f'Best score: {best_score}')

Best score: -0.7285880682852255


In [29]:
#Apply the best performing model to the test set and check the scores
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {accuracy:.2f}')
precision=precision_score(y_test, y_pred, average='weighted')
print(f'Test set precision: {precision:.2f}')
recall=recall_score(y_test, y_pred, average='weighted')
print(f'Test set recall: {recall:.2f}')
f1=f1_score(y_test, y_pred, average='weighted')
print(f'Test set F1 score: {f1:.2f}')

Test set accuracy: 0.76
Test set precision: 0.64
Test set recall: 0.76
Test set F1 score: 0.67


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
