In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
%pip install matplotlib nbformat pandas plotly scikit-learn tabulate # --break-system-packages

In [4]:
import pandas as pd
from tabulate import tabulate as tab
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn as sl
import copy
# Load the raw dataset; the path is resolved relative to the working directory.
raw = pd.read_csv(filepath_or_buffer="../data/raw.csv")

In [5]:
def discrete_share_eval(df, var):
    """Summarise how often each distinct value of a column occurs.

    Args:
        df: DataFrame that contains the column.
        var: Name of the column to summarise.

    Returns:
        A list of ``(value, count, share)`` tuples, one per distinct value,
        ordered from the most to the least frequent value. ``count`` is an
        ``int`` and ``share`` is the percentage of rows holding that value,
        formatted with two decimals (e.g. ``"58.55%"``). Missing values are
        reported once, as a single NaN entry with their real count. An empty
        frame yields an empty list.
    """
    total = len(df)
    # A single pass over the column replaces filtering the whole frame twice
    # per distinct value. dropna=False keeps missing values as their own
    # category; an equality filter can never match NaN and would count 0.
    counts = df[var].value_counts(dropna=False)
    return [
        (val, int(count), f"{100*int(count)/total:.2f}%")
        for val, count in counts.items()
    ]

In [6]:
# Preview the raw dataset (the rendered table is truncated to the first and last rows)
raw

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [7]:
# Analysis of the categorical variables: print the value counts and shares
# of each discrete column of the raw data as a plain-text table.
categorical_vars = (
    "gender",
    "smoking_history",
    "diabetes",
    "hypertension",
    "heart_disease",
)
for var in categorical_vars:
    print(f"\n{var} distribution:")
    shares = discrete_share_eval(raw, var)
    print(tab(shares))


gender distribution:
------  -----  ------
Other      18  0.02%
Female  58552  58.55%
Male    41430  41.43%
------  -----  ------

smoking_history distribution:
-----------  -----  ------
No Info      35816  35.82%
not current   6447  6.45%
ever          4004  4.00%
current       9286  9.29%
never        35095  35.09%
former        9352  9.35%
-----------  -----  ------

diabetes distribution:
-  -----  ------
0  91500  91.50%
1   8500  8.50%
-  -----  ------

hypertension distribution:
-  -----  ------
0  92515  92.52%
1   7485  7.49%
-  -----  ------

heart_disease distribution:
-  -----  ------
0  96058  96.06%
1   3942  3.94%
-  -----  ------


In [8]:
# Comorbidity evaluation
t = [
    [
        a, b, c,
        len(raw[(raw["diabetes"] == a) & (raw["hypertension"] == b) & (raw["heart_disease"] == c)]),
        100*len(raw[(raw["diabetes"] == a) & (raw["hypertension"] == b) & (raw["heart_disease"] == c)])/len(raw),
        
    ]
    for [a, b, c] in [[a, b, c] for a in [0, 1] for b in [0, 1] for c in [0, 1]]
] 
t.insert(0, ["diabetes", "hypertension", "heart_disease", "qtd", "%"] )
print(tab(t))

--------  ------------  -------------  -----  ------
diabetes  hypertension  heart_disease  qtd    %
0         0             0              83986  83.986
0         0             1              2117   2.117
0         1             0              4839   4.839
0         1             1              558    0.558
1         0             0              5503   5.503
1         0             1              909    0.909
1         1             0              1730   1.73
1         1             1              358    0.358
--------  ------------  -------------  -----  ------


Preliminary analysis results:

**Gender:**

- `gender == "Other"` does not have enough samples for prediction models.
- `gender == "Female"` has a substantially larger sample than `gender == "Male"`; hence, separate models could yield better results.


**Smoking:**

- `smoking_history` distribution can be further discretized as follows:
    - `smoking_ever` for patients that have smoked at any time in the past.
    - `smoking_today` for patients that are smoking today.


**Other diseases:**

- `diabetes`, `hypertension` and `heart_disease` have uneven distributions.
- There is a significant overlap among the conditions.

In [9]:
# Build the preprocessed frame `pp` from an independent copy of `raw`, so the
# raw data stays untouched and this cell can be re-run on its own.
pp = raw.copy(deep=True)

# Gender: drop the "Other" rows and encode the rest as a 0/1 `Male` flag.
is_known_gender = pp["gender"] != "Other"
pp = pp[is_known_gender]
pp["Male"] = (pp["gender"] == "Male").astype("int64")
pp = pp.drop(columns="gender")

# Smoking: drop rows without smoking information, then derive two 0/1 flags.
has_smoking_info = pp["smoking_history"] != "No Info"
pp = pp[has_smoking_info]
# NOTE(review): "ever" is counted as smoking today, and "not current" is not
# counted in `smoking_ever` -- confirm this matches the dataset's definition
# of those categories.
ever_labels = ["former", "ever", "current"]
today_labels = ["ever", "current"]
pp["smoking_ever"] = pp["smoking_history"].isin(ever_labels).astype("int64")
pp["smoking_today"] = pp["smoking_history"].isin(today_labels).astype("int64")
pp = pp.drop(columns="smoking_history")

In [10]:
# Correlation heatmap of the preprocessed features. The colour range is pinned
# to [-1, 1] with a diverging scale so that zero correlation maps to the
# neutral colour and the figure is comparable across re-runs; the title lets
# the figure stand on its own when the notebook is skimmed.
px.imshow(
    pp.corr(),
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu_r",
    title="Correlation matrix of the preprocessed features",
)

In [11]:
# TODO: k-nearest-neighbours classifier (not wired up yet; kept as a stub).
# The recorded "module 'sklearn' has no attribute 'neighbors'" error presumably
# comes from reaching the submodule through the top-level package alias;
# importing the submodule explicitly, as below, avoids it.
# from sklearn.neighbors import KNeighborsClassifier

# KNeighborsClassifier(n_neighbors=5)

AttributeError: module 'sklearn' has no attribute 'neighbors'

In [None]:
# Correlation matrix of the numeric columns of the raw data. `raw` still holds
# the string-valued `gender` and `smoking_history` columns, which
# DataFrame.corr() rejects in pandas >= 2.0, so restrict to numeric columns
# first (this form also works on older pandas versions).
raw.select_dtypes(include="number").corr()