In [None]:
!pip install sweetviz
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
from scipy import stats
import itertools
from pandas.api.types import is_numeric_dtype




In [None]:
df = pd.read_csv("death_prediction_synthetic.csv")
df.head(10) #first 10 rows

# First part - EDA

## A - info




pandas

In [None]:
df.info() #basic information

In [None]:
df.describe()

In [None]:
df.median(numeric_only=True)


In [None]:
report = sv.analyze(df)
report.show_html("SweetViz_Report.html")

In [None]:
report.show_html("SweetViz_Report.html")

In [None]:
pd.isna(df).sum()

In [None]:
df.isna().mean()*100
# We can see that the columns have different percentages of missing values (NA).

In [None]:
# Split the columns by cardinality: more than 5 distinct values -> "numeric",
# otherwise "categorical". `unique()` keeps NaN, so NaN counts as a value here.
n_unique = {name: len(df[name].unique()) for name in df.columns}
numeric_data = [name for name, k in n_unique.items() if k > 5]
char_data = [name for name, k in n_unique.items() if k <= 5]

# Numeric-dtype columns, minus those whose values are all within {0, 1, 2, 3}.
only_small_codes = df.isin([0, 1, 2, 3]).all()
numeric_data_df = df.select_dtypes(include=['number']).loc[:, ~only_small_codes]  # dataframe with only numeric data

# Store the low-cardinality columns with the pandas `category` dtype.
for name in char_data:
    df[name] = df[name].astype('category')

df.info()

In [None]:
len(numeric_data)

In [None]:
len(char_data)

In [None]:
# Overview grid: one panel per column (histogram for numeric columns,
# count plot for the others). The grid is sized from the number of columns
# instead of a fixed 9x5, so it neither overflows nor leaves empty panels.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(df.columns) // n_grid_cols))  # ceiling division
fig, ax = plt.subplots(n_grid_rows, n_grid_cols,
                       figsize=(40, max(4, 20 * n_grid_rows / 9)),
                       squeeze=False)
ax = ax.flatten()
for i, var in enumerate(df.columns):

    if var in numeric_data :
        sns.histplot(data = df, x = var, ax = ax[i]).set(xlabel = None, ylabel = None, title = var)
    else:
        sns.countplot(x = df[var], ax = ax[i]).set(xlabel = None, ylabel = None, title = var)

# Hide the panels that have no column assigned.
for unused_ax in ax[len(df.columns):]:
    unused_ax.set_visible(False)

# Compute the layout only after the panels are drawn so titles are accounted for.
plt.tight_layout()
plt.subplots_adjust(hspace=0.5)

In [None]:
# One standalone figure per column; same content as the overview grid above.
for column in df.columns:
    plt.figure(figsize=(14, 5))

    if column in numeric_data:
        # Numeric column: histogram with a KDE overlay.
        axes = sns.histplot(df[column], kde=True, color='blue', bins=20)
        title, y_label = f'Distribution of {column}', 'Frequency'
    else:
        # Categorical column: bar per category.
        axes = sns.countplot(x=df[column], palette="viridis")
        title, y_label = f'Barplot of {column}', 'Count'

    axes.set_title(title)
    axes.set_xlabel(column)
    axes.set_ylabel(y_label)

    plt.show()



## B  -   Corr

Correlation for numeric data

In [None]:
# List the numeric pairs that are strongly correlated (|Spearman rho| > 0.7).
# The previous condition (`corr < 0.05`) compared the coefficient with a
# p-value threshold and therefore reported weak/negative correlations instead.
pairs_num = list(itertools.combinations(numeric_data, 2))
for col1, col2 in pairs_num:
    # spearmanr propagates NaN by default, so keep only rows where both
    # columns are observed.
    complete = df[[col1, col2]].dropna()
    x = complete[col1]
    y = complete[col2]
    corr, p = stats.spearmanr(x, y)
    if abs(corr) > 0.7:
        print(f"{col1} and {col2}: Corr = {corr:.2f}, p-value = {p:.4f}")

In [None]:
def plot_scatter(col1, col2, corr, p_value):
    """Scatter plot of two columns of the global `df`, titled with their correlation.

    Parameters
    ----------
    col1, col2 : str
        Column names plotted on the x and y axis respectively.
    corr : float
        Correlation coefficient shown in the title.
    p_value : float
        p-value shown in the title.
    """
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x=col1, y=col2)
    # Use the `p_value` argument; the title previously read the global `p`,
    # which only worked when the caller happened to define it.
    plt.title(f"{col1} vs {col2}\nCorr = {corr:.2f}, p-value = {p_value:.4f}")
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.show()

# Scatter plot for every numeric pair with a significant correlation (p-value < 0.05).
for col1, col2 in pairs_num:
    # spearmanr propagates NaN by default, which would silently skip every
    # pair with a missing value; test on the rows where both are observed.
    complete = df[[col1, col2]].dropna()
    x = complete[col1]
    y = complete[col2]
    corr, p = stats.spearmanr(x, y)
    if p < 0.05:
      plot_scatter(col1, col2, corr, p)

In [None]:
corr_mat = df.corr(numeric_only=True)

sns.heatmap(corr_mat, cmap = 'viridis')

----------------------------------------------

## C  -  Difference

Between categorical variables

If the p-value < 0.05, the differences are considered statistically significant and the null hypothesis can be rejected.

In [None]:
# Chi-square test of independence for every pair of categorical variables;
# pairs with p < 0.05 are printed.
from scipy.stats import chi2_contingency

pairs_char = list(itertools.combinations(char_data, 2))
for col1, col2 in pairs_char:
    contingency_table = pd.crosstab(df[col1], df[col2])
    _, p, *_ = chi2_contingency(contingency_table)
    if not p < 0.05:
        continue
    print(f"{col1} and {col2}: p-value = {p:.4f}")

In [None]:
# Same chi-square scan as the previous cell, but prints only the pairs whose
# p-value compares equal to exactly 0 (presumably values too small to be
# represented as a float - TODO confirm this is the intent).
from scipy.stats import chi2_contingency
pairs_char = list(itertools.combinations(char_data, 2))
for col1, col2 in pairs_char:
    contingency_table = pd.crosstab(df[col1], df[col2])
    stat, p, dof, expected = chi2_contingency(contingency_table)
    # Exact float comparison: a tiny but non-zero p-value is not printed.
    if p == 0:
        print(f"{col1} and {col2}: p-value = {p}")

In [None]:
def barplot_side_by_side(df, col1, col2, p):
    """Grouped bar chart of the `col1` x `col2` contingency table.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    col1 : str
        Column shown on the x axis (one group of bars per value).
    col2 : str
        Column shown as the bar colour / legend entries.
    p : float
        p-value displayed in the title.
    """
    crosstab = pd.crosstab(df[col1], df[col2])
    # Draw on an explicit axes: calling plt.figure() and then DataFrame.plot()
    # without `ax` opened a second figure and left the first one empty.
    fig, ax = plt.subplots(figsize=(8, 6))
    crosstab.plot(kind='bar', ax=ax, colormap='Set2', width=0.8, position=1)
    ax.set_ylabel('Frequency')
    ax.set_xlabel(col1)
    ax.set_title(f"{col1} vs {col2}\np-value = {p:.4f}")
    ax.legend(title=col2, bbox_to_anchor=(1, 1), loc='upper left')
    fig.tight_layout()
    plt.show()


# Plot every categorical pair whose chi-square test is significant.
for col1, col2 in pairs_char:
    contingency_table = pd.crosstab(df[col1], df[col2])
    _, p, *_ = chi2_contingency(contingency_table)
    if not p < 0.05:
        continue
    barplot_side_by_side(df, col1, col2, p)



Between numerical and categorical variables

In [None]:
# One-way ANOVA of every numeric variable across the levels of every
# categorical variable; significant results (p < 0.05) are numbered and printed.
counter = 1
for col1 in numeric_data:
    for col2 in char_data:
        # One sample per observed category. Missing categories are skipped
        # (comparing with NaN gives an empty group that used to cancel the
        # whole test) and missing measurements are dropped (f_oneway would
        # otherwise return NaN).
        groups = [
            df.loc[df[col2] == code, col1].dropna()
            for code in df[col2].dropna().unique()
        ]

        if len(groups) > 1 and all(len(group) > 1 for group in groups):
            f_stat, p_value = stats.f_oneway(*groups)
            if p_value < 0.05:
                print(f"{counter}. ANOVA for {col1} and {col2}: F-statistic={f_stat:.2f}, p-value={p_value:.4f}")
                counter += 1




In [None]:
def plot_boxplot(col1, col2):
    """Box plot of numeric column `col1` of the global `df`, grouped by `col2`.

    Parameters
    ----------
    col1 : str
        Numeric column shown on the y axis.
    col2 : str
        Grouping column shown on the x axis.
    """
    # Pass an explicit axes: with `by=` pandas creates its own figure, so the
    # figure opened by plt.figure() used to be displayed empty.
    fig, ax = plt.subplots(figsize=(8, 6))
    df.boxplot(column=col1, by=col2, patch_artist=True, ax=ax)
    # Remove the automatic "Boxplot grouped by ..." super-title that would
    # otherwise sit on top of the title set below.
    fig.suptitle('')
    ax.set_title(f'Boxplot of {col1} by {col2}')
    ax.set_xlabel(col2)
    ax.set_ylabel(col1)
    plt.show()




# Box plot for every numeric/categorical pair with a significant one-way ANOVA.
for col1 in numeric_data:
    for col2 in char_data:
        # One sample per observed category; NaN categories and NaN
        # measurements are excluded so they cannot cancel or poison the test.
        groups = [
            df.loc[df[col2] == code, col1].dropna()
            for code in df[col2].dropna().unique()
        ]

        if len(groups) > 1 and all(len(group) > 1 for group in groups):
            f_stat, p_value = stats.f_oneway(*groups)
            if p_value < 0.05:
              plot_boxplot(col1, col2)


In [None]:
def plot_distribution(col1, col2):
    """Histogram (with KDE) of `col1` from the global `df`, coloured by `col2`.

    Parameters
    ----------
    col1 : str
        Numeric column shown on the x axis.
    col2 : str
        Column used as the hue.
    """
    # sns.displot is figure-level and builds its own figure, so no
    # plt.figure() call is made here (it only produced an empty figure).
    grid = sns.displot(data=df, x=col1, hue=col2, kde=True, kind = "hist" , height=6, aspect=1.5, palette="Set2")

    grid.ax.set_title(f'Distribution of {col1} by {col2}')
    # The histogram's default statistic is a count, not a density.
    grid.set_axis_labels(col1, 'Count')
    plt.show()


# Distribution plot for every numeric/categorical pair with a significant one-way ANOVA.
for col1 in numeric_data:
    for col2 in char_data:
        # One sample per observed category; NaN categories and NaN
        # measurements are excluded so they cannot cancel or poison the test.
        groups = [
            df.loc[df[col2] == code, col1].dropna()
            for code in df[col2].dropna().unique()
        ]

        if len(groups) > 1 and all(len(group) > 1 for group in groups):
            f_stat, p_value = stats.f_oneway(*groups)
            if p_value < 0.05:
                    plot_distribution(col1, col2)

## D - Outliers

In [None]:
# Numeric-dtype columns, minus those whose values are all within {0, 1, 2, 3}.
numeric_data_df = df.select_dtypes(include=['number']).loc[:, ~df.isin([0, 1,2,3]).all()] #dataFR for only numeric data

# Tukey fences: values beyond 1.5 * IQR from the quartiles are outliers.
q1 = numeric_data_df.quantile(0.25)
q3 = numeric_data_df.quantile(0.75)
IQR = q3 - q1
L1 = q1 - 1.5*IQR   # lower fence per column
L2 = q3 + 1.5*IQR   # upper fence per column
# Strict comparison: with `<=` / `>=`, a column whose IQR is 0 has both
# fences equal to its quartile value and every such value was flagged.
outliers = (numeric_data_df < L1) | (numeric_data_df > L2)

In [None]:
sns.heatmap(outliers)
plt.title('Outliers', fontsize=16)
plt.show()


In [None]:
numeric_data_df[outliers].describe().T[['count', 'mean', 'min', 'max']]

In [None]:
boundaries = pd.concat([L1, L2], axis=1)
boundaries.columns = ['Lower_Bound', 'Upper_Bound']
boundaries

In [None]:
# Manual override: the values flagged in `leuko_u` and `age` are judged not
# to be real outliers, so both columns are cleared in the outlier mask.
outliers['leuko_u'] = False
outliers['age'] = False

In [None]:
sns.heatmap(outliers)

In [None]:
# Marker style for the outlier points (fliers) of the box plots.
red = {'markerfacecolor': 'red', 'marker': 'o'}
def plot_boxplot(col1):
    """Draw a box plot of column `col1` of the global `df`, fliers in red."""
    plt.figure(figsize=(8, 6))
    df.boxplot(column=col1, patch_artist=True, flierprops=red)
    current_axes = plt.gca()
    current_axes.set_title(f'Boxplot of {col1}')
    current_axes.set_ylabel(col1)
    plt.show()


# Box plot for every numeric column except the two cleared in the outlier mask.
for col1 in df[numeric_data].columns:
    if col1 in ('leuko_u', 'age'):
        continue
    plot_boxplot(col1)

## E - heatmap

In [None]:
plt.figure(figsize=(10, 6))
sns.heatmap(df.isna())
plt.title('Missing Values Heatmap', fontsize=16)

plt.show()



---




---







---



---





---



---



---



---



# Second part - Data cleaning


## a. Outliers

In [None]:

from scipy.stats import norm

def cocor(data1, target1, data2, target2):
    """Compare two Spearman correlations with a Fisher z-test.

    The correlation of (`data1`, `target1`) is compared with the correlation
    of (`data2`, `target2`); both are Fisher z-transformed and their
    difference is divided by sqrt(1/(n1-3) + 1/(n2-3)).

    Parameters
    ----------
    data1, target1 : pandas.Series
        First pair of variables, sharing the same index.
    data2, target2 : pandas.Series
        Second pair of variables, sharing the same index.

    Returns
    -------
    z : float
        Absolute value of the z-statistic.
    p : float
        Two-sided p-value.
    """
    def _complete_cases(data, target):
        # Keep rows where BOTH values are observed, so the sample size matches
        # the rows the correlation is actually computed on (previously rows
        # with a missing target were still counted in n).
        both_observed = data.notna() & target.notna()
        return data[both_observed], target[both_observed]

    data1, target1 = _complete_cases(data1, target1)
    data2, target2 = _complete_cases(data2, target2)

    # Calculating Spearman's correlation
    correlation1 = data1.corr(target1, method='spearman')
    correlation2 = data2.corr(target2, method='spearman')

    # Calculating sample size
    n1 = len(data1)
    n2 = len(data2)

    # Fisher z-transform: arctanh(r) == 0.5 * log((1 + r) / (1 - r))
    correlation1_z = np.arctanh(correlation1)
    correlation2_z = np.arctanh(correlation2)

    se_diff_r = np.sqrt(1 / (n1 - 3) + 1 / (n2 - 3))
    diff = correlation1_z - correlation2_z
    z = abs(diff / se_diff_r)

    # Two-sided p-value; the survival function avoids the precision loss of 1 - cdf.
    p = 2 * norm.sf(z)

    return z, p

def is_lower(p):
    """Return True when `p` is below the 0.05 significance level."""
    return 0.05 > p

In [None]:
# Build `decision_table`: for every numeric column, test whether removing its
# flagged outliers changes (a) its correlation with the target `dead_5y` and
# (b) its distribution.
from scipy import stats

# column name -> [p-value]; filled once for the correlation test and then
# overwritten, key by key, by the distribution test further down.
p_values_corr = {}

# Part (a): correlation with the target, with vs. without outliers (cocor)
for j, col in enumerate(numeric_data):

    s = df[col]
    y = df['dead_5y']

    # Same column and target restricted to the rows not flagged as outliers
    s_withot_outliers = df.loc[~outliers[col], col]
    y_withot_outliers = df.loc[~outliers[col], 'dead_5y']

    _,p = cocor(s , y , s_withot_outliers, y_withot_outliers)
    p_values_corr[col] = [p]

# One row per numeric column, single column holding the correlation p-value
decision_table = pd.DataFrame(p_values_corr).T
decision_table.columns = ['p_value_correlation']

# True where the p-value is below 0.05 (see is_lower)
correlation_changed = decision_table['p_value_correlation'].apply(is_lower)
decision_table['correlation_changed'] = correlation_changed

# Part (b): distribution with vs. without outliers (two-sample kstest)

for j, col in enumerate(numeric_data):
    # NaN values are removed from both samples before the test
    s = df[col]
    s = s[s.notna()]
    s_withot_outliers = df.loc[~outliers[col], col]
    s_withot_outliers = s_withot_outliers[s_withot_outliers.notna()]

    _,p = stats.kstest(s,s_withot_outliers)

    # Reuses the dict from part (a); every key is overwritten here
    p_values_corr[col] = [p]

decision_table['p_value_distrtibutions'] = pd.DataFrame(p_values_corr).T

distribution_changed = decision_table['p_value_distrtibutions'].apply(is_lower)
decision_table['distribution_changed'] = distribution_changed

decision_table

In [None]:
# Create the decision column. `^` is an element-wise XOR, so `drop` is True
# only when exactly one of the two flags is True.
# NOTE(review): a column where BOTH flags are True gets drop == False -
# confirm XOR (rather than OR / AND) is the intended rule.
decision_table['drop'] = decision_table['correlation_changed'] ^ decision_table['distribution_changed']

decision_table

In [None]:
col_to_drop = decision_table[decision_table['drop']].index
col_to_drop

In [None]:
count_before_nan = df.count()


In [None]:
# Replace the flagged outlier cells of the selected columns with NaN (in place).
df[outliers[col_to_drop]]=np.nan

In [None]:
count_after_nan = df.count()


In [None]:
count_before_nan - count_after_nan

In [None]:
sns.heatmap(outliers[col_to_drop])

## b. Missing values

In [None]:
# Share of missing values per column, largest first.
missing_count = df.isna().sum()
n_rows = len(df)
missing_ratio = missing_count.div(n_rows).sort_values(ascending=False)

print("Missing values ratio by column:")
print(missing_ratio)

In [None]:
# 1. Remove the columns whose missing-value ratio is 70% or more.
columns_to_drop = missing_ratio.index[missing_ratio >= 0.7]

print("Columns to be dropped due to high missing value ratio:")
print(columns_to_drop)

df.drop(columns=columns_to_drop, inplace=True)


In [None]:
#2.
# Convert variables with 40%-70% missing values to categorical variables in
# which "missing" is its own category 0.
cols_to_categorize = missing_ratio[(missing_ratio >= 0.4) & (missing_ratio < 0.7)].index
print("Columns with missing ratio between 40% and 70%:")
print(cols_to_categorize)

for col in cols_to_categorize:
    if col in numeric_data:
        # Quartile bins labelled 1..k. Labels are assigned after binning
        # because `duplicates='drop'` may yield fewer than 4 bins, which a
        # fixed 4-label list cannot accommodate.
        binned = pd.qcut(df[col], 4, duplicates='drop')
        n_bins = len(binned.cat.categories)
        binned = binned.cat.rename_categories(list(range(1, n_bins + 1)))

        # Category 0 stands for "missing". Assign the result back instead of
        # calling fillna(inplace=True) on a column selection.
        df[col] = binned.cat.add_categories([0]).fillna(0)

        # Updating the lists of variables
        numeric_data.remove(col)
        char_data.append(col)

    # `elif`: a column converted above has just been appended to `char_data`
    # and must not be processed a second time (adding category 0 twice raises).
    elif col in char_data:

        # Free code 0 for "missing" by shifting existing codes up by one.
        if 0 in df[col].cat.categories:
            df[col] = df[col].cat.rename_categories(lambda code: code + 1)

        # Adding category 0 for missing values
        df[col] = df[col].cat.add_categories([0]).fillna(0)

We see that there are no variables with 40%–70% missing values.


In [None]:
#3.
# Check whether the missing values are MCAR (Missing Completely at Random)
# or MAR (Missing at Random): for each column with missing values, test
# whether its missingness is related to every other column.

cols_to_check = missing_ratio[(missing_ratio > 0) & (missing_ratio < 0.4)].index

# missing_col -> {other column -> p-value}
p_values = {}

for missing_col in cols_to_check:
    p_values[missing_col] = {}
    observed_rows = df[missing_col].notnull()

    for col in df.columns:
        if missing_col == col:
            continue

        if col in numeric_data:
            # Numeric column: compare its full distribution with its
            # distribution on the rows where `missing_col` is observed.
            # NaN values are dropped first, as in the outlier
            # distribution test earlier in the notebook.
            s1 = df[col].dropna()
            s2 = df.loc[observed_rows, col].dropna()
            if len(s1) == 0 or len(s2) == 0:
                p = np.nan  # nothing to compare
            else:
                _, p = stats.kstest(s1, s2)

        else:
            # Categorical column: chi-square test between the missingness
            # indicator and the column's categories.
            s1 = df[missing_col].isnull().astype(int)
            s2 = df[col]
            _, p, _, _ = stats.chi2_contingency(pd.crosstab(s1, s2))

        p_values[missing_col][col] = p

p_values_df = pd.DataFrame(p_values).T

Significant values (p < 0.05) indicate that the missingness of `missing_col` depends on another variable (`col`).

In [None]:
# Boolean table: True marks a significant relationship between the missing
# values of the row's column and the column being tested.
# Note: the threshold here is inclusive (<= 0.05), and a NaN p-value
# compares as False.
p_values_df = pd.DataFrame(p_values).T
p_values_df = p_values_df <= 0.05
p_values_df

In [None]:
# Count, per row, how many columns are significantly related to its
# missingness; a row with none is labelled MCAR.
# NOTE(review): re-running this cell would include the two columns added
# below in the row sum - run it once after the previous cell.
p_values_df['num_of_affecting_columns'] = (p_values_df).sum(axis=1)
p_values_df['MCAR'] = p_values_df['num_of_affecting_columns'] == 0
p_values_df

So none of the columns has values that are Missing Completely At Random (MCAR).

In [None]:
# Divide the columns with missing values into two groups: MCAR and not MCAR.
cols_to_impute = list(p_values_df.query('MCAR == True').index)
cols_to_categorize = list(p_values_df.query('MCAR == False').index)

# Imputation of the MCAR columns
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
for col in cols_to_impute:
    # The imputer is fitted on this single column only (reshaped to n x 1).
    # NOTE(review): with no other feature to measure distance on, this
    # presumably falls back to a column-level statistic rather than a true
    # nearest-neighbour estimate - confirm against the KNNImputer docs.
    df[col] = imputer.fit_transform(np.array(df[col]).reshape(-1, 1))

In [None]:
# Treat leuko_u as a categorical variable and move it between the two lists.
df['leuko_u'] = df['leuko_u'].astype('category')
numeric_data.remove('leuko_u')
char_data.append('leuko_u')

In [None]:
# Treat Charlson as a categorical variable and move it between the two lists.
df['Charlson'] = df['Charlson'].astype('category')
numeric_data.remove('Charlson')
char_data.append('Charlson')

In [None]:
# Categorize the NOT-MCAR columns: "missing" becomes its own category 0.
# NOTE: not idempotent - run this cell once per kernel session.
for col in cols_to_categorize:
    if col in numeric_data:
        # Quartile bins labelled 1-4; category 0 is added for missing values.
        binned = pd.qcut(df[col], 4, labels=[1, 2, 3, 4])
        df[col] = binned.cat.add_categories([0]).fillna(0)
        numeric_data.remove(col)
        char_data.append(col)
    else:
        # Shift existing codes up by one BEFORE filling, so that code 0 is
        # free to mean "missing". Filling first merged the missing values
        # with a genuine category 0 and then shifted both to 1.
        if 0 in df[col].cat.categories:
            df[col] = df[col].cat.rename_categories(lambda code: code + 1)

        # Assign the result back instead of fillna(inplace=True) on a selection.
        df[col] = df[col].cat.add_categories([0]).fillna(0)



In [None]:
df.isnull().sum()

In [None]:
df.head(10)

## c. For each pair of variables with a correlation greater than 0.7, we remove one of the two columns

In [None]:
df = df.drop(columns=['weigh', 'LDL', 'HbA1c'])

In [None]:
df = df.drop(columns=['residence', 'smoking_status']) # same values

In [None]:

# Rebuild the variable lists after cleaning. A column is numeric only if it
# has a numeric dtype AND more than 5 distinct values; counting distinct
# values alone put categorical columns with many levels (e.g. the ones
# converted with astype('category') above) back into `numeric_data`.
numeric_data = [
    var for var in df.columns
    if is_numeric_dtype(df[var]) and df[var].nunique(dropna=False) > 5
]
char_data = [var for var in df.columns if var not in numeric_data]
numeric_data_df = df.select_dtypes(include=['number']).loc[:, ~df.isin([0, 1,2,3]).all()] #dataFR for only numeric data
pairs_num = list(itertools.combinations(numeric_data, 2))
pairs_char = list(itertools.combinations(char_data, 2))

In [None]:
# Count fully duplicated rows.
duplicates = df.duplicated()
num_duplicates = duplicates.sum()
print("number of duplicates: " + str(num_duplicates))


## download file

In [None]:
# Save the cleaned data. The row index is written as the first (unnamed) column.
df.to_csv("death_prediction_synthetic_after_outliers_and_missingvalues_removal.csv")

# Third part - EDA circular

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Overview grid after cleaning: one panel per column (histogram for numeric
# columns, count plot for the others). The grid is sized from the number of
# columns instead of a fixed 9x5, so it neither overflows nor leaves empty panels.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(df.columns) // n_grid_cols))  # ceiling division
fig, ax = plt.subplots(n_grid_rows, n_grid_cols,
                       figsize=(40, max(4, 20 * n_grid_rows / 9)),
                       squeeze=False)
ax = ax.flatten()
for i, var in enumerate(df.columns):

    if var in numeric_data :
        sns.histplot(data = df, x = var, ax = ax[i]).set(xlabel = None, ylabel = None, title = var)
    else:
        sns.countplot(x = df[var], ax = ax[i]).set(xlabel = None, ylabel = None, title = var)

# Hide the panels that have no column assigned.
for unused_ax in ax[len(df.columns):]:
    unused_ax.set_visible(False)

# Compute the layout only after the panels are drawn so titles are accounted for.
plt.tight_layout()
plt.subplots_adjust(hspace=0.5)

In [None]:
# Scatter plot for every numeric pair with a significant Spearman correlation.
for col1, col2 in pairs_num:
    x, y = df[col1], df[col2]
    corr, p = stats.spearmanr(x, y)
    if not p < 0.05:
        continue
    plot_scatter(col1, col2, corr, p)

In [None]:
corr_mat = df.corr(numeric_only=True)

sns.heatmap(corr_mat, cmap = 'viridis')

In [None]:
# Grouped bar chart for every categorical pair with a significant chi-square test.
for col1, col2 in pairs_char:
    contingency_table = pd.crosstab(df[col1], df[col2])
    _, p, *_ = chi2_contingency(contingency_table)
    if not p < 0.05:
        continue
    barplot_side_by_side(df, col1, col2, p)

In [None]:
def plot_boxplot(col1, col2):
    """Box plot of numeric column `col1` of the global `df`, grouped by `col2`.

    Parameters
    ----------
    col1 : str
        Numeric column shown on the y axis.
    col2 : str
        Grouping column shown on the x axis.
    """
    # Pass an explicit axes: with `by=` pandas creates its own figure, so the
    # figure opened by plt.figure() used to be displayed empty.
    fig, ax = plt.subplots(figsize=(8, 6))
    df.boxplot(column=col1, by=col2, patch_artist=True, ax=ax)
    # Remove the automatic "Boxplot grouped by ..." super-title that would
    # otherwise sit on top of the title set below.
    fig.suptitle('')
    ax.set_title(f'Boxplot of {col1} by {col2}')
    ax.set_xlabel(col2)
    ax.set_ylabel(col1)
    plt.show()


# Box plot for every numeric/categorical pair with a significant one-way ANOVA.
for col1, col2 in itertools.product(numeric_data, char_data):
    # One sample of `col1` per distinct value of `col2`.
    groups = [df.loc[df[col2] == code, col1] for code in df[col2].unique()]

    # The test needs at least two groups with at least two observations each.
    if len(groups) <= 1 or any(len(group) <= 1 for group in groups):
        continue

    f_stat, p_value = stats.f_oneway(*groups)
    if p_value < 0.05:
        plot_boxplot(col1, col2)

# Fourth part - Adding data

In [None]:
# Add a column encoding which medications a person takes, using the first
# letter of each variable: A(ntidiabetics), C(ardiovascular_meds), S(tatines).
takes_a = df['antidiabetics'] == 1
takes_c = df['cardiovascular_meds'] == 1
takes_s = df['statines'] == 1

# Ordered from most to least specific: np.select keeps the first match.
conditions = [
    takes_a & takes_c & takes_s,
    takes_a & takes_c,
    takes_a & takes_s,
    takes_c & takes_s,
    takes_a,
    takes_c,
    takes_s,
]
values = ['ACS', 'AC', 'AS', 'CS', 'A', 'C', 'S']

df['medication_count'] = np.select(conditions, values, default='None')


In [None]:
# Metabolic syndrome risk: 1 when at least 3 of the 5 criteria below are met
# (obesity, high blood pressure, high triglycerides, low HDL, high glucose).
# NOTE(review): the thresholds assume these columns still hold raw
# measurements and that `sex` is coded 0/1 - confirm after the cleaning steps.

BMI = df['BMI'].to_numpy()
bp_sys = df['bp_sys'].to_numpy()
bp_dias = df['bp_dias'].to_numpy()
triglycerides = df['triglycerides'].to_numpy()
HDL = df['HDL'].to_numpy()
glucose = df['glucose'].to_numpy()
sex = df['sex'].to_numpy()


obesity = BMI >= 30
high_bp = (bp_sys >= 130) | (bp_dias >= 85)
high_triglycerides = triglycerides >= 150
low_hdl = ((sex == 1) & (HDL < 40)) | ((sex == 0) & (HDL < 50))
high_glucose = glucose >= 100


# Count the satisfied criteria per row. `+` on NumPy boolean arrays is a
# logical OR (the result stays boolean), so the previous sum never reached 3
# and the flag was always 0; sum as integers instead.
risk_factors = np.sum(
    [obesity, high_bp, high_triglycerides, low_hdl, high_glucose],
    axis=0,
    dtype=int,
)

# Assign 1 if >= 3 criteria, otherwise 0
metabolic_risk = np.where(risk_factors >= 3, 1, 0)


df['metabolic_risk'] = metabolic_risk


In [None]:
# Hypertension indicator from systolic / diastolic pressure (bp_sys, bp_dias).
bp_sys = df['bp_sys'].to_numpy()
bp_dias = df['bp_dias'].to_numpy()

# 1 = at risk (either reading at or above its threshold), 0 = no risk
elevated = (bp_sys >= 130) | (bp_dias >= 85)
risk = elevated.astype(int)

df['blood_pressure_risk'] = risk


In [None]:
# Tukey fences on the numeric frame: values beyond 1.5 * IQR from the
# quartiles are outliers.
q1 = numeric_data_df.quantile(0.25)
q3 = numeric_data_df.quantile(0.75)
IQR = q3 - q1
L1 = q1 - 1.5*IQR   # lower fence per column
L2 = q3 + 1.5*IQR   # upper fence per column
# Strict comparison: with `<=` / `>=`, a column whose IQR is 0 has both
# fences equal to its quartile value and every such value was flagged.
outliers = (numeric_data_df < L1) | (numeric_data_df > L2)

In [None]:
outliers['leuko_u'] = False
outliers['age'] = False

In [None]:
sns.heatmap(outliers)

In [None]:
df.isnull().sum()

In [None]:
df.head()

## download file

In [None]:
# Save the data with the added features.
# NOTE(review): unlike the other exports in this notebook, this file name has
# no ".csv" extension - confirm whether downstream code expects this exact name.
df.to_csv("death_prediction_synthetic_after_adding_data")

# Fifth part - Feature selection

In [None]:
# Feature selection - filter method: test every feature against the target
# `dead_5y` and keep those with p <= 0.05.
columns = list(df.columns)
columns.remove('dead_5y')

# Rows split by outcome, used by the numeric (Mann-Whitney U) branch.
died = df.query('dead_5y == 1')
survived = df.query('dead_5y == 0')

p_values = []
for var in columns:
    if var in numeric_data:
        # Numeric feature: compare its values between the two outcome groups.
        _, p = stats.mannwhitneyu(died[var], survived[var])
    else:
        # Any other feature: chi-square test on its contingency table with the target.
        _, p, _, _ = stats.chi2_contingency(pd.crosstab(df[var], df.dead_5y))
    p_values.append(p)

selection = pd.DataFrame({'p_value': p_values}, index=columns)
selection['keep'] = selection['p_value'].map(lambda p: 'yes' if p <= 0.05 else 'no')
selection

In [None]:
# Remove the features that did not pass the filter (keep == 'no').
cols_to_drop = selection.index[selection['keep'] == 'no']
df.drop(columns=cols_to_drop, inplace=True)
df.head()

In [None]:
df.info()

In [None]:
from scipy.stats import chi2_contingency
# Target variable
target = 'dead_5y'

# Separate numeric and categorical columns. The target itself is not a
# feature, so it is excluded from the per-feature plots and tests below
# (it used to be plotted and tested against itself).
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
numeric_features = numeric_columns.drop(target, errors='ignore')
categorical_features = df.select_dtypes(include=['object', 'category']).columns.drop(target, errors='ignore')

# 1. Correlation for numeric variables (includes the target if it is numeric)
correlation_matrix = df[numeric_columns].corr()
print("\nCorrelation matrix:\n", correlation_matrix)

# Visualize the correlation
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation matrix")
plt.show()

# 2. Visualize the distribution of numeric variables by target value
for feature in numeric_features:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=target, y=feature, data=df)
    plt.title(f'Distribution of {feature} by target variable')
    plt.show()

# 3. Analysis of categorical variables
for feature in categorical_features:
    plt.figure(figsize=(8, 6))
    sns.countplot(x=feature, hue=target, data=df)
    plt.title(f'Distribution of {feature} by target variable')
    plt.show()

    # Statistical test for categorical variables (Chi-square)
    crosstab = pd.crosstab(df[feature], df[target])
    chi2, p, _, _ = chi2_contingency(crosstab)
    print(f"\nChi-square test for {feature}: p-value = {p}")
    if p < 0.05:
        print(f"Variable {feature} is statistically significantly associated with the target variable.")
    else:
        print(f"Variable {feature} is not statistically significant for the target variable.")

# 4. Grouping by categories and calculating statistics
# The mean of the target per category is the death rate; `count()` only
# returned the group sizes. The target is cast to float so the mean also
# works when it is stored as a category.
# NOTE(review): assumes the target is coded 0/1 - confirm.
target_as_float = df[target].astype(float)
for feature in categorical_features:
    grouped = target_as_float.groupby(df[feature], observed=True).mean()
    print(f"\nAverage death probability for different categories of {feature}:\n", grouped)





In [None]:
# Save the final, feature-selected data. The row index is written as the first (unnamed) column.
df.to_csv("death_prediction_synthetic_final.csv")