# CARDIOVASCULAR DATA SET EXPLORATION

Imports

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier  
from sklearn.svm import LinearSVC

## General exploration

### Reading the csv file

In [2]:
'../../raw_data/CVD_cleaned.csv'

data = pd.read_csv(path)

NameError: name 'path' is not defined

### Preview

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.shape

In [None]:
data.info()

### Checking if the data is cleaned

Does the data have duplicated rows?

In [None]:
data.duplicated().sum()

It looks like the data is not really cleaned so we have to do some work on it

In [None]:
dirty_shape = data.shape

data.drop_duplicates(inplace=True)

cleaned_shape = data.shape

print(f'The shape of the data with duplicated rows is {dirty_shape}')
print(f'The shape of the data without duplicated rows is {cleaned_shape}')
print(f'We have deleted {dirty_shape[0] - cleaned_shape[0]} rows')

Does the data have null values?

In [None]:
data.isnull().sum()

The data does not have null values so we can operate with it. First we are going to split the data set in categorical features and numerical features in order to explore them.

In [None]:
num_data = data.select_dtypes(exclude=['object'])
cat_data = data.select_dtypes(include=['object'])

Let's start with the numerical features

In [None]:
num_data.corr()

In [None]:
sns.heatmap(num_data.corr())

The Weigth and the BMI are highly correlated so we are going to drop the BMI feature and stay with the weight one, because it's more basic.

In [None]:
num_data.drop(columns=['BMI'],inplace=True)
data.drop(columns=['BMI'],inplace=True)

Now let's check the categorical features and see how many unique values they have and how many of them they have.

In [None]:
cat_data.nunique()

In [None]:
for column in cat_data.columns:
    print(f'{column} and its values:\n{cat_data[column].value_counts()}')

cat_data.value_counts().sum()

Some of the classes are unbalanced so it may be a problem when training the model

In [None]:
num_data.columns

In [None]:
fix,axes = plt.subplots(nrows=2,ncols=3,figsize=(15,10))

for n,ax in zip(num_data.columns,axes.flat):
    
    ax.set_title(f"{n}")
    sns.histplot(num_data[n],bins=10,ax=ax,kde=True)

## Preprocessing

First we process the numerical data

In [None]:
# scaler = StandardScaler()
# num_features = data.select_dtypes(exclude=['object']).columns
# data[num_features] = scaler.fit_transform(data[num_features])

First we process the categorical data with ordinal values

In [None]:
ordinal_features = ['General_Health','Checkup','Age_Category','Diabetes']
dict = {}
for column in ordinal_features:
    dict[column] = data[column].unique()
dict

In [None]:
general_health_sorted = ['Poor','Fair','Good','Very Good','Excellent']
checkup_sorted = ['Never','5 or more years ago','Within the past 5 years','Within the past 2 years','Within the past year']
diabetes_sorted = ['Yes','No, pre-diabetes or borderline diabetes','Yes, but female told only during pregnancy','No']
age_category_sorted = ['18-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79', '80+']   

In [None]:
data['General_Health'] = OrdinalEncoder(categories=[general_health_sorted]).fit_transform(data[['General_Health']])
data['Checkup'] = OrdinalEncoder(categories=[checkup_sorted]).fit_transform(data[['Checkup']])
data['Diabetes'] = OrdinalEncoder(categories=[diabetes_sorted]).fit_transform(data[['Diabetes']])
data['Age_Category'] = OrdinalEncoder(categories=[age_category_sorted]).fit_transform(data[['Age_Category']])

In [None]:
ohe = OneHotEncoder(drop='if_binary')

not_ordinal_features = [e for e in data.select_dtypes(include=['object']) if e not in ordinal_features]

ohe = OneHotEncoder(drop='if_binary', sparse=False)  # Añade 'sparse=False' para obtener una matriz densa
transformed_features = ohe.fit_transform(data[not_ordinal_features])

# Obtén los nombres de las columnas transformadas
column_names = ohe.get_feature_names_out(input_features=not_ordinal_features)

# Crear un DataFrame con las características transformadas y los nombres de las columnas
transformed_df = pd.DataFrame(transformed_features, columns=column_names)

# Elimina las características originales del DataFrame X
data = data.drop(columns=not_ordinal_features)

# Concatena el DataFrame original con el DataFrame de características transformadas
data = pd.concat([data, transformed_df], axis=1)





Now we split the data into train, validation and test. We have a lot of data so we can select a test_size of 0.15.

In [None]:
data.dropna(inplace=True)
X = data.drop(columns=['Heart_Disease_Yes'],axis=1)
y = data[['Heart_Disease_Yes']]
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42)




y.shape

In [None]:
y

In [None]:
X

We repreat the split in order to get a validation set

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

The date has been minimally preprocesed so we can try a model now

In [None]:
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=2, min_samples_leaf=1)

In [None]:
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5)

In [None]:
scores


In [None]:
clf.fit(X_train,y_train)

In [None]:
y_val_pred = pd.DataFrame(pd.Series(clf.predict(X_val),name='Heart_Disease_Yes'))

In [None]:
y_val_pred

In [None]:
y_val.reset_index(inplace=True,drop=True)

In [None]:
type(y_val)

In [None]:
# Realiza una comparación elemento a elemento y obtén una serie booleana que indica las diferencias
diferencias = y_val_pred != y_val

# Cuenta el número de True en la serie de diferencias para obtener la cantidad de datos diferentes
cantidad_diferentes = diferencias.sum()

In [None]:
cantidad_diferentes