In [245]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo
from sklearn.feature_selection import f_regression
from scipy import stats

In [246]:
# Fetch the dataset 
myocardial_infarction_complications = fetch_ucirepo(id=579)

#print(myocardial_infarction_complications)


In [247]:
# Extract features and target data
mi_features = myocardial_infarction_complications.data.features  # Features (X)
mi_targets = myocardial_infarction_complications.data.targets  # Target variable (y)


# print(mi_features)

# print("***************************************")
# print(mi_targets)


In [248]:
# Metadata about the dataset
#mi_metadata = myocardial_infarction_complications.metadata
#print("Metadata:\n", metadata)



In [249]:
# Variable information (e.g., types of variables, feature descriptions)
mi_variables = myocardial_infarction_complications.variables
print("Variables:\n", variables)

#features count for understanding

variable_features_count = mi_variables[mi_variables['role'] == 'Feature']

print(len(variable_features_count))



Variables:
           name     role         type demographic  \
0           ID       ID      Integer        None   
1          AGE  Feature      Integer         Age   
2          SEX  Feature       Binary         Sex   
3     INF_ANAM  Feature  Categorical        None   
4    STENOK_AN  Feature  Categorical        None   
..         ...      ...          ...         ...   
119   DRESSLER   Target       Binary        None   
120        ZSN   Target       Binary        None   
121     REC_IM   Target       Binary        None   
122  P_IM_STEN   Target       Binary        None   
123     LET_IS   Target  Categorical        None   

                                           description units missing_values  
0    Record ID (ID): Unique identifier. Cannot be r...  None             no  
1                                      Age of patient.  None             no  
2                                   0: female, 1: male  None             no  
3    Quantity of myocardial infarctions in the anam

In [250]:
# Check keys in dataset for additional insights
print(f"Keys of myocardial_infarction_complications: {myocardial_infarction_complications.keys()}")
print(f"Keys of myocardial_infarction_complications.data: {myocardial_infarction_complications.data.keys()}")
print(f"Keys of myocardial_infarction_complications.metadata: {myocardial_infarction_complications.metadata.keys()}")
print(f"Keys of myocardial_infarction_complications.variables: {myocardial_infarction_complications.variables.keys()}")

Keys of myocardial_infarction_complications: dict_keys(['data', 'metadata', 'variables'])
Keys of myocardial_infarction_complications.data: dict_keys(['ids', 'features', 'targets', 'original', 'headers'])
Keys of myocardial_infarction_complications.metadata: dict_keys(['uci_id', 'name', 'repository_url', 'data_url', 'abstract', 'area', 'tasks', 'characteristics', 'num_instances', 'num_features', 'feature_types', 'demographics', 'target_col', 'index_col', 'has_missing_values', 'missing_values_symbol', 'year_of_dataset_creation', 'last_updated', 'dataset_doi', 'creators', 'intro_paper', 'additional_info'])
Keys of myocardial_infarction_complications.variables: Index(['name', 'role', 'type', 'demographic', 'description', 'units',
       'missing_values'],
      dtype='object')


In [251]:
# Combine features and target into a single DataFrame
combined_data = pd.concat([mi_features, mi_targets], axis=1)
print(combined_data.head())  # Display the first few rows

    AGE  SEX  INF_ANAM  STENOK_AN  FK_STENOK  IBS_POST  IBS_NASL   GB  \
0  77.0    1       2.0        1.0        1.0       2.0       NaN  3.0   
1  55.0    1       1.0        0.0        0.0       0.0       0.0  0.0   
2  52.0    1       0.0        0.0        0.0       2.0       NaN  2.0   
3  68.0    0       0.0        0.0        0.0       2.0       NaN  2.0   
4  60.0    1       0.0        0.0        0.0       2.0       NaN  3.0   

   SIM_GIPERT  DLIT_AG  ...  JELUD_TAH  FIBR_JELUD  A_V_BLOK  OTEK_LANC  \
0         0.0      7.0  ...          0           0         0          0   
1         0.0      0.0  ...          0           0         0          0   
2         0.0      2.0  ...          0           0         0          0   
3         0.0      3.0  ...          0           0         0          0   
4         0.0      7.0  ...          0           0         0          0   

   RAZRIV  DRESSLER  ZSN  REC_IM  P_IM_STEN  LET_IS  
0       0         0    0       0          0       0  
1 

In [252]:
# Display data variables (this includes the data types and feature information)
# print("Data Variables:")
# print(myocardial_infarction_complications.variables)

In [253]:
combined_data

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,DLIT_AG,...,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,RAZRIV,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS
0,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,3.0,...,0,0,0,0,0,0,1,0,0,0
4,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,77.0,0,0.0,4.0,2.0,1.0,,2.0,0.0,7.0,...,0,0,1,0,1,0,0,0,0,3
1696,70.0,0,0.0,6.0,2.0,1.0,,2.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,1
1697,55.0,1,3.0,6.0,2.0,2.0,,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,6
1698,79.0,0,2.0,2.0,2.0,1.0,,2.0,0.0,7.0,...,0,0,0,1,0,0,0,0,0,1


In [254]:

print(combined_data.dtypes)

AGE          float64
SEX            int64
INF_ANAM     float64
STENOK_AN    float64
FK_STENOK    float64
              ...   
DRESSLER       int64
ZSN            int64
REC_IM         int64
P_IM_STEN      int64
LET_IS         int64
Length: 123, dtype: object


In [255]:
# Displaying the first few rows of the variable details
print(mi_variables.head())

# List all feature names
print(mi_variables['name'].tolist())

        name     role         type demographic  \
0         ID       ID      Integer        None   
1        AGE  Feature      Integer         Age   
2        SEX  Feature       Binary         Sex   
3   INF_ANAM  Feature  Categorical        None   
4  STENOK_AN  Feature  Categorical        None   

                                         description units missing_values  
0  Record ID (ID): Unique identifier. Cannot be r...  None             no  
1                                    Age of patient.  None             no  
2                                 0: female, 1: male  None             no  
3  Quantity of myocardial infarctions in the anam...  None            yes  
4  Exertional angina pectoris in the anamnesis. \...  None            yes  
['ID', 'AGE', 'SEX', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'IBS_NASL', 'GB', 'SIM_GIPERT', 'DLIT_AG', 'ZSN_A', 'nr_11', 'nr_01', 'nr_02', 'nr_03', 'nr_04', 'nr_07', 'nr_08', 'np_01', 'np_04', 'np_05', 'np_07', 'np_08', 'np_09', 'np

In [256]:
# Displaying the entire combined data with both features and target variables
print(combined_data.info())

# Check for missing data
missing_data = combined_data.isnull().sum()
print(missing_data)

# If you want to display first few rows
print(combined_data.head())

# statistical summary
print(combined_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Columns: 123 entries, AGE to LET_IS
dtypes: float64(110), int64(13)
memory usage: 1.6 MB
None
AGE            8
SEX            0
INF_ANAM       4
STENOK_AN    106
FK_STENOK     73
            ... 
DRESSLER       0
ZSN            0
REC_IM         0
P_IM_STEN      0
LET_IS         0
Length: 123, dtype: int64
    AGE  SEX  INF_ANAM  STENOK_AN  FK_STENOK  IBS_POST  IBS_NASL   GB  \
0  77.0    1       2.0        1.0        1.0       2.0       NaN  3.0   
1  55.0    1       1.0        0.0        0.0       0.0       0.0  0.0   
2  52.0    1       0.0        0.0        0.0       2.0       NaN  2.0   
3  68.0    0       0.0        0.0        0.0       2.0       NaN  2.0   
4  60.0    1       0.0        0.0        0.0       2.0       NaN  3.0   

   SIM_GIPERT  DLIT_AG  ...  JELUD_TAH  FIBR_JELUD  A_V_BLOK  OTEK_LANC  \
0         0.0      7.0  ...          0           0         0          0   
1         0.0      0.0  ...   

# DESCRIPTIVE STATISTICS

In [257]:
# Checking the general information about the dataset
print(combined_data.info())  # Data types, non-null count, etc.

# # Counting missing values in each column
# print("\nMissing Values per Column:")
print(combined_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Columns: 123 entries, AGE to LET_IS
dtypes: float64(110), int64(13)
memory usage: 1.6 MB
None
AGE            8
SEX            0
INF_ANAM       4
STENOK_AN    106
FK_STENOK     73
            ... 
DRESSLER       0
ZSN            0
REC_IM         0
P_IM_STEN      0
LET_IS         0
Length: 123, dtype: int64


In [258]:
# Descriptive statistics for numerical features
#print(combined_data.describe())
print(combined_data.describe(include=[np.number]))


Descriptive Statistics for Numerical Columns:
               AGE          SEX     INF_ANAM    STENOK_AN    FK_STENOK  \
count  1692.000000  1700.000000  1696.000000  1594.000000  1627.000000   
mean     61.856974     0.626471     0.554835     2.316186     1.205286   
std      11.259936     0.483883     0.836801     2.440586     1.040814   
min      26.000000     0.000000     0.000000     0.000000     0.000000   
25%      54.000000     0.000000     0.000000     0.000000     0.000000   
50%      63.000000     1.000000     0.000000     1.000000     2.000000   
75%      70.000000     1.000000     1.000000     5.000000     2.000000   
max      92.000000     1.000000     3.000000     6.000000     4.000000   

          IBS_POST  IBS_NASL           GB   SIM_GIPERT      DLIT_AG  ...  \
count  1649.000000  72.00000  1691.000000  1692.000000  1452.000000  ...   
mean      1.160703   0.37500     1.393258     0.033688     3.340220  ...   
std       0.801400   0.48752     1.088803     0.180478    

In [259]:
# Descriptive statistics for categorical features
#print(combined_data.dtypes)
combined_data_objects = combined_data.apply(lambda col: col.astype('object') if (col.dtypes != 'number')  else col)
print(combined_data_objects.describe(include=[object]))


Descriptive Statistics for Categorical Columns:
           AGE   SEX  INF_ANAM  STENOK_AN  FK_STENOK  IBS_POST  IBS_NASL  \
count   1692.0  1700    1696.0     1594.0     1627.0    1649.0      72.0   
unique    62.0     2       4.0        7.0        5.0       3.0       2.0   
top       63.0     1       0.0        0.0        2.0       2.0       0.0   
freq      90.0  1065    1060.0      661.0      854.0     683.0      45.0   

            GB  SIM_GIPERT  DLIT_AG  ...  JELUD_TAH  FIBR_JELUD  A_V_BLOK  \
count   1691.0      1692.0   1452.0  ...       1700        1700      1700   
unique     4.0         2.0      8.0  ...          2           2         2   
top        2.0         0.0      0.0  ...          0           0         0   
freq     880.0      1635.0    551.0  ...       1658        1629      1643   

        OTEK_LANC  RAZRIV  DRESSLER   ZSN  REC_IM  P_IM_STEN  LET_IS  
count        1700    1700      1700  1700    1700       1700    1700  
unique          2       2         2     2 

In [285]:
#Which clinical and demographic features are most predictive of mortality in MI patients?

# Selecting columns for descriptive stats
features = ['AGE', 'SEX', 'SIM_GIPERT', 'FK_STENOK', 'IBS_POST', 'K_BLOOD']
data = combined_data[features]

# Mode Calculation
mode = data.mode()
print(mode, "\n")

# Mean Calculation
mean = data.mean()
print(mean, "\n")

# Median Calculation
median = data.median()
print(median, "\n")

# Range Calculation
range_vals = data.max() - data.min()
print(range_vals, "\n")

# Standard Deviation Calculation
std_dev = data.std()
print(std_dev, "\n")

# Variance Calculation
variance = data.var()
print(variance, "\n")

# Confidence Intervals Calculation

def confidence_interval(data, confidence=0.95):
    n = len(data)
    mean_val = data.mean()
    std_error = data.std() / np.sqrt(n)
    margin_of_error = std_error * stats.t.ppf((1 + confidence) / 2., n - 1)
    return mean_val - margin_of_error, mean_val + margin_of_error

# Calculating confidence intervals for continuous features
continuous_features = ['AGE', 'K_BLOOD']  # Add more continuous features as needed
conf_intervals = {}

for feature in continuous_features:
    conf_intervals[feature] = confidence_interval(data[feature])

# Print Confidence Intervals
print("Confidence Intervals for Features:")
for feature, ci in conf_intervals.items():
    print(f"{feature}: ({ci[0]}, {ci[1]})")


print("\nFrequency Counts for Categorical Features:")
print(data['SEX'].value_counts())
print(data['SIM_GIPERT'].value_counts())


    AGE  SEX  SIM_GIPERT  FK_STENOK  IBS_POST  K_BLOOD
0  63.0    1         0.0        2.0       2.0      4.0 

AGE           61.856974
SEX            0.626471
SIM_GIPERT     0.033688
FK_STENOK      1.205286
IBS_POST       1.160703
K_BLOOD        4.191422
dtype: float64 

AGE           63.0
SEX            1.0
SIM_GIPERT     0.0
FK_STENOK      2.0
IBS_POST       1.0
K_BLOOD        4.1
dtype: float64 

AGE           66.0
SEX            1.0
SIM_GIPERT     1.0
FK_STENOK      4.0
IBS_POST       2.0
K_BLOOD        5.9
dtype: float64 

AGE           11.259936
SEX            0.483883
SIM_GIPERT     0.180478
FK_STENOK      1.040814
IBS_POST       0.801400
K_BLOOD        0.754076
dtype: float64 

AGE           126.786155
SEX             0.234143
SIM_GIPERT      0.032572
FK_STENOK       1.083293
IBS_POST        0.642241
K_BLOOD         0.568631
dtype: float64 

Confidence Intervals for Features:
AGE: (61.32133887767785, 62.39260911286588)
K_BLOOD: (4.155550710573832, 4.227293533218492)

Frequency

In [286]:
#Which continuous variables are most predictive of the survival time or length of hospital stay after an MI?

# Selecting continuous features 
features = ['AGE', 'K_BLOOD', 'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'KFK_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S']

data = combined_data[features]

#  Define function to calculate descriptive stats for continuous features
def descriptive_stats(df):
    stats_dict = {}
    for col in df.columns:
        stats_dict[col] = {
            'Mean': df[col].mean(),
            'Median': df[col].median(),
            'Standard Deviation': df[col].std(),
            'Variance': df[col].var(),
            'Range': df[col].max() - df[col].min(),
            'Skewness': df[col].skew(),
            'Kurtosis': df[col].kurt(),
            '95% CI Lower': np.percentile(df[col], 2.5),
            '95% CI Upper': np.percentile(df[col], 97.5)
        }
    return pd.DataFrame(stats_dict)

# print result
descriptive_stats_df = descriptive_stats(data)
print(descriptive_stats_df)


correlation_with_survival = data.corr()['TIME_B_S'].sort_values(ascending=False)
print("\nCorrelation with Time between Events (Survival/Recovery Time):")
print(correlation_with_survival)

                           AGE   K_BLOOD    NA_BLOOD  ALT_BLOOD  AST_BLOOD  \
Mean                 61.856974  4.191422  136.550943   0.481455   0.263717   
Median               63.000000  4.100000  136.000000   0.380000   0.220000   
Standard Deviation   11.259936  0.754076    6.512120   0.387261   0.201802   
Variance            126.786155  0.568631   42.407712   0.149971   0.040724   
Range                66.000000  5.900000   52.000000   2.970000   2.110000   
Skewness             -0.219756  0.955688    0.122988   2.269564   2.559231   
Kurtosis             -0.183840  2.433998    1.203355   6.966396  11.417875   
95% CI Lower               NaN       NaN         NaN        NaN        NaN   
95% CI Upper               NaN       NaN         NaN        NaN        NaN   

                    KFK_BLOOD    L_BLOOD         ROE  TIME_B_S  
Mean                 2.000000   8.782914   13.444890  4.684244  
Median               1.600000   8.000000   10.000000  4.000000  
Standard Deviation   1.0