In [None]:
import math
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline


In [None]:
# IMPORT DATASET
# Read the raw export, then keep only the candidate predictors plus the
# target column 'ORR' (listed last).
df = pd.read_csv('datasets/MALAVA_2023-09-20_MP.csv')

selected_columns = [
    'Et√†', 'ECOG PS', 'SESSO', 'BMI',
    'Uso concomitante di metformina', 'Pregresso trattamento con BCG',
    'CHT neoadiuvante', 'CISTECTOMIA', 'ISTOLOGIA',
    'Metastatico alla diagnosi', 'CHT I linea', 'Cicli CHT prima linea',
    'ORR CT I linea',
    'Metastasi linfonodali', 'Metastasi ossee', 'Metastasi viscerali',
    'Metastasi epatiche', 'Metastasi polmonari',
    'GLOBULI BIANCHI BASALI', 'NEUTROFILI BASALI', 'LINFOCITI BASALI',
    'MONOCITI BASALI', 'EOSINOFILI BASALI', 'PIASTRINE BASALI',
    'PLR BASALE', 'NLR BASALE', 'SII BASALE', 'LDH BASALE',
    'Creatininemia basale',
    "Steroidi all'inizio", "PPI all'inizio", "Antibiotici all'inizio", "COXi",
    'ORR',
]
data = df[selected_columns]

data

In [None]:
# Keep only the patients with a recorded ORR; rows whose target is missing
# are removed.
has_orr = data['ORR'].notna()
data = data[has_orr]
data

## ------------------------------<br><br>TRAIN/TEST SPLIT & IMPUTATION 

In [None]:
# Percentage of missing values per column
missing_pct = data.isna().sum() / len(data) * 100
print(missing_pct)

# THRESHOLD = 20% of missing values --> LDH (77%) and metformin use (27%)
# are excluded.
too_sparse = ['Uso concomitante di metformina', 'LDH BASALE']
data = data.drop(too_sparse, axis=1)

In [None]:
# separate covariates and target
# The target is removed by name rather than by position, so the split stays
# correct even if 'ORR' is not the last column of `data`.
y = data['ORR']
data = data.drop(['ORR'], axis=1)

In [None]:
# Hold-out split stratified on the target: 80% training - 20% test set.
# random_state is fixed so the same split is produced on every run.
X_tr, X_t, y_tr, y_t = train_test_split(
    data,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

## IMPUTATION

In [None]:
# IMPUTATION

# The imputer is fitted on the training split only; the fitted imputer is
# then applied to both the training and the test split.
imputer = IterativeImputer()
imputer.fit(X_tr)

X = imputer.transform(X_tr)
X1 = imputer.transform(X_t)

# Wrap the imputed values back into DataFrames labelled with the covariate names
X_train, X_test = (
    pd.DataFrame(values, columns=data.columns) for values in (X, X1)
)
X_train

In [None]:
# ROUND THE VALUES IMPUTED THAT NEED TO BE INTEGERS
#
# These columns are rounded to the nearest integer and cast to int64 in both
# splits. The int64 dtype is also what the feature-transformation step uses
# to recognise them as categorical.
integer_columns = [
    'ECOG PS',
    'SESSO',
    'Pregresso trattamento con BCG',
    'CHT neoadiuvante',
    'CISTECTOMIA',
    'ISTOLOGIA',
    'Metastatico alla diagnosi',
    'CHT I linea',
    'Cicli CHT prima linea',
    'ORR CT I linea',
    'Metastasi linfonodali',
    'Metastasi ossee',
    'Metastasi viscerali',
    'Metastasi epatiche',
    'Metastasi polmonari',
    "Steroidi all'inizio",
    "PPI all'inizio",
    "Antibiotici all'inizio",
    'COXi',
]

# One column at a time, so each column is replaced together with its new dtype.
for frame in (X_train, X_test):
    for column in integer_columns:
        frame[column] = frame[column].round().astype('int64')

## FEATURE TRANSFORMATION

In [None]:

# Columns stored as int64 are treated as categorical and float64 columns as
# numerical. Column positions are derived from the training frame and applied
# to both splits.
train_dtypes = X_train.dtypes

categorical_features_indices = np.where(train_dtypes == 'int64')[0]
numerical_features_indices = np.where(train_dtypes == 'float64')[0]

X_train_categorical, X_test_categorical = (
    frame.iloc[:, categorical_features_indices] for frame in (X_train, X_test)
)
X_train_numerical, X_test_numerical = (
    frame.iloc[:, numerical_features_indices] for frame in (X_train, X_test)
)


In [None]:
# Work on a copy of the numerical training features and draw one histogram
# per column to inspect their distributions.
X = X_train_numerical.copy()
X.hist(layout=(4,4), figsize=(15,12))

In [None]:
# Compare every numerical feature with its log(x + 1) transform: the new
# 'log <name>' columns are appended after the original ones and all of them
# are plotted as histograms.
log_columns = {
    'log {c}'.format(c=name): X[name].apply(lambda value: math.log(value + 1))
    for name in X.columns
}
x_log = X.assign(**log_columns)

x_log.hist(layout=(4,6), figsize=(15,12))


# Observation: log NLR looks closer to a normal distribution than raw NLR.
# NO LOGARITHMIC TRANSFORMATIONS NEEDED

In [None]:
# Continue with the untransformed numerical features (X does not contain the
# 'log ...' columns, which only exist in x_log).
X_train_numerical = X.copy()

### Standardization

In [None]:
# The scaler is fitted on the training numerical features only and then used
# to transform both splits.
scaler = StandardScaler()
scaler.fit(X_train_numerical)

# --- training set ---
X_tr_scaled = pd.DataFrame(
    scaler.transform(X_train_numerical),  # scaling on the training set
    columns=X_train_numerical.columns,
)
# Give the categorical block the same row labels before concatenating side by
# side, since concat(axis=1) matches rows by index label.
X_train_categorical.index = X_tr_scaled.index
X_train_scaled = pd.concat([X_tr_scaled, X_train_categorical], axis=1)

# --- test set ---
X_t_scaled = pd.DataFrame(
    scaler.transform(X_test_numerical),
    columns=X_test_numerical.columns,
)
X_test_categorical.index = X_t_scaled.index
X_test_scaled = pd.concat([X_t_scaled, X_test_categorical], axis=1)

In [None]:
# Give the targets the same row labels as the feature frames. This is a plain
# label overwrite (no reindexing), so it relies on the targets and the feature
# rows being in the same order.
y_tr.index = X_train.index
y_t.index = X_test.index


In [None]:
# english feature names
#
# The names are assigned by position. X_train_scaled / X_test_scaled were built
# by concatenating the scaled numerical columns first and the categorical
# columns after them, so the labels below follow that same order.
numerical_names = [
    'Age',
    'BMI',
    'Leukocytes at baseline',
    'Neutrophils at baseline',
    'Lymphocytes at baseline',
    'Monocytes at baseline',
    'Eosinophils at baseline',
    'Platelets at baseline',
    'PLR',
    'NLR',
    'SII',
    'Creatinine at baseline',
]

categorical_names = [
    'ECOG PS',
    'Sex',
    'Previous BCG treatment',
    'Neoadjuvant CHT',
    'Cystectomy',
    'Histology',
    'Metastatic at diagnosis',
    '1st line CHT',
    'Tot cycles 1st line CHT',
    'ORR 1st line CHT',
    'Lymph nodes metastases',
    'Bone metastases',
    'Visceral metastases',
    'Liver metastases',
    'Lung metastases',
    'Steroids',
    'PPI',
    'Antibiotics',
    'COXi',
]

new_columns = numerical_names + categorical_names

for frame in (X_train_scaled, X_test_scaled):
    frame.columns = new_columns

## -----------------------------------------<br><br>FEATURE CORRELATION 

In [None]:

# Pairwise correlation of the (scaled) training features. It is computed once
# and reused for both the heatmap and the redundancy check below.
corr = X_train_scaled.corr()

path = 'results/classification/ORR/Correlation_matrix.png'
# Create the output folder if needed so savefig does not fail on a fresh checkout
Path(path).parent.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(30, 20))
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, fmt='.2f', annot=True, annot_kws={'fontsize': 15})
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':20}, pad=12);
heatmap.set_xticklabels(heatmap.get_xmajorticklabels(), fontsize = 20)
heatmap.set_yticklabels(heatmap.get_ymajorticklabels(), fontsize = 20)
fig = heatmap.get_figure()
fig.savefig(path,format='png')

# Redundancy check on absolute correlations: keep only the strict upper
# triangle (k=1 excludes the diagonal) so each pair is inspected once, then
# list every column that has a correlation above 0.80 with an earlier column.
cor_matrix = corr.abs()

upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))

to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.80)]
print(to_drop)

# no features are eliminated after correlation analysis

In [None]:
# Save the data
# The output folder is created first so the notebook also runs on a fresh
# checkout where it does not exist yet.
output_dir = Path("data/classification/ORR")
output_dir.mkdir(parents=True, exist_ok=True)

X_train_scaled.to_csv(output_dir / "X_train.csv")
X_test_scaled.to_csv(output_dir / "X_test.csv")

y_tr.to_csv(output_dir / "y_train.csv")
y_t.to_csv(output_dir / "y_test.csv")
