# Diabetes prediction model

### Preparations

In [None]:
# On externally managed Python environments (PEP 668) pip may refuse to install;
# in that case append `--break-system-packages` to the command below.
%pip install matplotlib nbformat pandas plotly scikit-learn tabulate

In [None]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots
import sklearn as sl
from tabulate import tabulate as tab
plotly.io.templates.default = "plotly_dark"

### Preliminary data analysis

In [None]:
def discrete_share_eval(df, var):
    """Summarize how the rows of ``df`` are distributed over the values of ``var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarize.
    var : str
        Name of the (discrete) column.

    Returns
    -------
    list of tuple
        One ``(value, count, percentage)`` tuple per distinct value, ordered
        as returned by ``Series.value_counts`` (most frequent first).
        ``percentage`` is ``100 * count / len(df)``. Missing values are
        reported as a single entry carrying their real count. An empty frame
        yields an empty list.
    """
    total = len(df)
    # A single pass over the column replaces one boolean filter per distinct
    # value; dropna=False keeps missing values visible in the summary.
    counts = df[var].value_counts(dropna=False)
    return [
        (val, int(count), 100 * int(count) / total)
        for val, count in counts.items()
    ]

In [None]:
# Basic visualizations
raw = pd.read_csv("../data/raw.csv")
raw

In [None]:
# Analysis of the categorical variables: for each one, print a table listing
# every distinct value, its row count and its percentage share of `raw`.
categorical_vars = ["gender", "smoking_history", "diabetes", "hypertension", "heart_disease"]
share_headers = ["Value", "Quantity", "Distribution"]

for var in categorical_vars:
    print(f"\n* * * {var} distribution * * *")
    share_rows = discrete_share_eval(raw, var)
    print(tab(share_rows, headers=share_headers))

In [None]:
# Comorbidity evaluation: for each of the 8 combinations of the three binary
# condition flags, report how many rows of `raw` match and their share (%).
print("\n* * * Comorbidity distribution * * *")

# Evaluate each three-way filter once and reuse the count for the percentage.
comorbidity_counts = [
    (
        a, b, c,
        int((
            (raw["diabetes"] == a)
            & (raw["hypertension"] == b)
            & (raw["heart_disease"] == c)
        ).sum()),
    )
    for a in [0, 1] for b in [0, 1] for c in [0, 1]
]

print(tab(
    [[a, b, c, qty, 100 * qty / len(raw)] for a, b, c, qty in comorbidity_counts],
    headers=["diabetes", "hypertension", "heart_disease", "qtd", "%"]
))

Preliminary analysis results:

**Gender:**

- `gender == "Other"` does not have enough samples for prediction models.
- `gender == "Female"` has a substantially larger sample than `gender == "Male"`; hence, separate models could yield better results.


**Smoking:**

- `smoking_history` distribution can be further discretized as follows:
    - `smoking_ever` for patients that have smoked at any time in the past.
    - `smoking_today` for patients that are smoking today.


**Other diseases:**

- `diabetes`, `hypertension` and `heart_disease` have uneven distributions.
- There is a significant overlap among the conditions.

In [None]:
# Cast the glucose readings to float on `raw` itself (not only on the
# preprocessed copy): the histogram cell further down inspects the column
# dtypes of `raw`.
raw["blood_glucose_level"] = raw["blood_glucose_level"].astype(float)

# `smoking_history` labels that switch each derived binary flag on.
# NOTE(review): these spellings are assumed to match the values present in the
# data; confirm no other label (e.g. another "past smoker" category) belongs here.
smoking_ever_labels = ["former", "ever", "current"]
smoking_today_labels = ["ever", "current"]

# Build the preprocessed frame in one chain. Filtering and then adding columns
# through `.assign` works on a fresh frame, so no column is ever written onto
# a slice of another frame (which is what raises SettingWithCopyWarning).
pp = (
    raw.loc[(raw["gender"] != "Other") & (raw["smoking_history"] != "No Info")]
    .assign(
        # 1 for "Male", 0 for every other remaining gender value.
        Male=lambda d: (d["gender"] == "Male").astype("int64"),
        smoking_ever=lambda d: d["smoking_history"].isin(smoking_ever_labels).astype("int64"),
        smoking_today=lambda d: d["smoking_history"].isin(smoking_today_labels).astype("int64"),
    )
    # The source columns are fully encoded by the flags above.
    .drop(columns=["gender", "smoking_history"])
)

In [None]:
# Heat map of the pairwise correlations between the preprocessed columns.
corr_matrix = pp.corr()
px.imshow(corr_matrix, color_continuous_scale="Turbo")

In [None]:
# Histogram of every column of `raw`, laid out on a near-square subplot grid.

# With `grid_side` subplots per row, the i-th column lands on
# row i // grid_side and column i % grid_side (1-based, as plotly expects).
grid_side = int(np.ceil(len(raw.columns) ** 0.5))
pos = [
    [1 + grid_row, 1 + grid_col]
    for grid_row, grid_col in (divmod(i, grid_side) for i in range(len(raw.columns)))
]

fig = plotly.subplots.make_subplots(
    rows=max(p[0] for p in pos),
    cols=max(p[1] for p in pos),
)

for p, var in zip(pos, raw.columns):
    column = raw[var]
    # Float columns get about sqrt(number of distinct non-missing values) bins;
    # for every other dtype the bin count is left to plotly (nbinsx=None).
    if str(column.dtype) == "float64":
        n_bins = int(np.ceil(column.nunique() ** 0.5))
    else:
        n_bins = None

    fig.add_trace(
        go.Histogram(
            x=sorted(column.values),
            histnorm="percent",
            histfunc="avg",
            name=var,
            nbinsx=n_bins,
        ),
        row=p[0], col=p[1],
    )

fig.update_layout(
    title="Distribution of each variable in the raw data (% of samples)",
    margin=dict(l=5, r=5, t=50, b=5, pad=5),
)
fig.show()

# Secondary analysis
- 