In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import wandb
import torch
from torch import nn
from torch.utils.data import DataLoader

# Report the installed PyTorch version and whether the MPS backend is available.
print(torch.__version__, torch.backends.mps.is_available(), sep='\n')

In [None]:
# Read the diabetes CSV into `df` and print its column summary.
df = pd.read_csv(filepath_or_buffer='data/pima-indians-diabetes-data.csv')
df.info()

In [None]:
# Count each class in a fixed order (0 -> Healthy, 1 -> Diabetic).
# `value_counts()` sorts by frequency, so pairing its raw output with the
# hard-coded labels would mislabel the bars whenever class 1 is the majority.
outcome_counts = df.Outcome.value_counts().reindex([0, 1], fill_value=0)

fig = go.Figure()
fig.add_bar(y=outcome_counts, x=['Healthy', 'Diabetic'],
            marker_color=['lightskyblue', 'indigo'])
fig.update_layout(title='Distribution of outcome')

In [None]:
# Box plot of every column of the frame as loaded.
px.box(data_frame=df)

In [None]:
# Rows in which at least one of the listed columns is exactly 0.
df[
    (df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
         'DiabetesPedigreeFunction', 'Age']] == 0).any(axis=1)
]

In [None]:
# Number of zero entries in each of the listed columns.
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age']].eq(0).sum()

In [None]:
# Keep a copy of the frame that still contains the zero placeholders; the
# "before imputation" distribution plots further down read from it.
df_zeroes = df.copy()

# In the listed columns, turn every 0 into NaN so it is handled as a missing value.
# NOTE(review): this cell mutates `df` in place; re-running it after the imputation
# cells makes `df_zeroes` a copy of the already-imputed frame.
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']] = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']].replace(0, np.nan)
df

## Imputation of missing values

In [None]:
def find_median(data, var):
    """Return the median of column ``var`` for each ``Outcome`` class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``var`` and ``'Outcome'``.
    var : str
        Name of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per ``Outcome`` value with the columns ``'Outcome'`` and ``var``.
    """
    # No explicit not-null filter is needed: the group-wise median already skips
    # NaN. The previous filter result was assigned and then immediately overwritten.
    return data[[var, 'Outcome']].groupby('Outcome')[[var]].median().reset_index()

def create_distplot(data, var):
    """Build a histogram + KDE figure of ``var``, one trace group per ``Outcome`` class.

    Rows with ``Outcome == 0`` are labelled 'Healthy' and rows with
    ``Outcome == 1`` are labelled 'Diabetes'. The figure title is ``var``.
    Returns the plotly figure without showing it.
    """
    class_labels = ['Healthy', 'Diabetes']
    class_colors = ['lightskyblue', 'indigo']
    per_class = [data.loc[data.Outcome == outcome, var] for outcome in (0, 1)]

    fig = ff.create_distplot(per_class, class_labels, colors=class_colors,
                             show_hist=True, bin_size=0, curve_type='kde')
    fig.update_layout(title=var)
    return fig

### Glucose

In [None]:
# Class-wise Glucose distribution in the copy that still contains the zeros.
create_distplot(data=df_zeroes, var='Glucose').show()

# Per-Outcome median of Glucose in the frame where zeros were replaced by NaN.
find_median(data=df, var='Glucose')

In [None]:
# Fill missing Glucose values with the median of the row's Outcome class. The
# medians are computed from the data instead of being copied by hand from the
# previous cell's output, so they cannot go stale if the input file changes.
# NOTE(review): the fill values depend on the target and are computed before the
# train/test split, which leaks label information into the features.
df['Glucose'] = df['Glucose'].fillna(df.groupby('Outcome')['Glucose'].transform('median'))

create_distplot(df, 'Glucose')

### BloodPressure

In [None]:
# Class-wise BloodPressure distribution in the copy that still contains the zeros.
create_distplot(data=df_zeroes, var='BloodPressure').show()

# Per-Outcome median of BloodPressure in the frame where zeros were replaced by NaN.
find_median(data=df, var='BloodPressure')

In [None]:
# Fill missing BloodPressure values with the median of the row's Outcome class,
# computed from the data rather than hard-coded from the previous cell's output.
# NOTE(review): the fill values depend on the target and are computed before the
# train/test split, which leaks label information into the features.
df['BloodPressure'] = df['BloodPressure'].fillna(
    df.groupby('Outcome')['BloodPressure'].transform('median'))

create_distplot(df, 'BloodPressure')

### SkinThickness

In [None]:
# Class-wise SkinThickness distribution in the copy that still contains the zeros.
create_distplot(data=df_zeroes, var='SkinThickness').show()

# Per-Outcome median of SkinThickness in the frame where zeros were replaced by NaN.
find_median(data=df, var='SkinThickness')

In [None]:
# Fill missing SkinThickness values with the median of the row's Outcome class,
# computed from the data rather than hard-coded from the previous cell's output.
# NOTE(review): the fill values depend on the target and are computed before the
# train/test split, which leaks label information into the features.
df['SkinThickness'] = df['SkinThickness'].fillna(
    df.groupby('Outcome')['SkinThickness'].transform('median'))

create_distplot(df, 'SkinThickness')

### Insulin

In [None]:
# Class-wise Insulin distribution in the copy that still contains the zeros.
create_distplot(data=df_zeroes, var='Insulin').show()

# Per-Outcome median of Insulin in the frame where zeros were replaced by NaN.
find_median(data=df, var='Insulin')

In [None]:
# Fill missing Insulin values with the median of the row's Outcome class,
# computed from the data rather than hard-coded from the previous cell's output.
# NOTE(review): the fill values depend on the target and are computed before the
# train/test split, which leaks label information into the features.
df['Insulin'] = df['Insulin'].fillna(df.groupby('Outcome')['Insulin'].transform('median'))

create_distplot(df, 'Insulin')

### BMI

In [None]:
# Class-wise BMI distribution in the copy that still contains the zeros.
create_distplot(data=df_zeroes, var='BMI').show()

# Per-Outcome median of BMI in the frame where zeros were replaced by NaN.
find_median(data=df, var='BMI')

In [None]:
# Fill missing BMI values with the median of the row's Outcome class,
# computed from the data rather than hard-coded from the previous cell's output.
# NOTE(review): the fill values depend on the target and are computed before the
# train/test split, which leaks label information into the features.
df['BMI'] = df['BMI'].fillna(df.groupby('Outcome')['BMI'].transform('median'))

create_distplot(df, 'BMI')

In [None]:
# Number of NaN entries left in each of the listed columns.
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age']].isna().sum()

In [None]:
# Box plot of every column of the frame in its current state.
px.box(data_frame=df)

In [None]:
# Pairwise column correlations, rounded to 4 decimals; the same matrix supplies
# both the heatmap colours and the numbers printed in each cell.
corr_rounded = df.corr().round(4)

fig = go.Figure()
fig.add_heatmap(
    z=corr_rounded,
    x=df.columns,
    y=df.columns,
    text=corr_rounded,
    texttemplate="%{text}",
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=1,
)
fig.update_layout(width=1000, height=1000)

In [None]:
def create_scatter(data, var1, var2):
    """Scatter ``var1`` (x) against ``var2`` (y) with one marker trace per Outcome class.

    Rows with ``Outcome == 0`` form the 'Healthy' trace and rows with
    ``Outcome == 1`` the 'Diabetic' trace. Returns the plotly figure.
    """
    fig = go.Figure()

    # Traces are added in the order Healthy, Diabetic.
    for outcome, colour, label in ((0, 'lightskyblue', 'Healthy'),
                                   (1, 'indigo', 'Diabetic')):
        rows = data.loc[data.Outcome == outcome]
        fig.add_scatter(x=rows[var1], y=rows[var2], mode='markers',
                        marker_color=colour, name=label)

    fig.update_layout(
        width=1000,
        height=600,
        title=f'Scatter of {var1} and {var2}',
        xaxis_title=var1,
        yaxis_title=var2,
    )
    return fig

In [None]:
# Glucose on the x-axis against Insulin on the y-axis, coloured by Outcome.
create_scatter(data=df, var1='Glucose', var2='Insulin')

In [None]:
# Pregnancies on the x-axis against Age on the y-axis, coloured by Outcome.
create_scatter(data=df, var1='Pregnancies', var2='Age')

In [None]:
# BMI on the x-axis against SkinThickness on the y-axis, coloured by Outcome.
create_scatter(data=df, var1='BMI', var2='SkinThickness')

In [None]:
# Glucose on the x-axis against the Outcome label itself on the y-axis.
create_scatter(data=df, var1='Glucose', var2='Outcome')

In [None]:
def to_tensor(X_train, X_test, y_train, y_test):
    """Convert the four split parts to float32 tensors, keeping their order.

    Each argument must expose ``.values`` as a NumPy array (as pandas objects do).
    Returns the tuple ``(X_train, X_test, y_train, y_test)`` of tensors.
    """
    def _as_float_tensor(part):
        # from_numpy wraps the array; .float() casts the result to float32
        return torch.from_numpy(part.values).float()

    return tuple(_as_float_tensor(part) for part in (X_train, X_test, y_train, y_test))


# Fixed seed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42

# `drop` already returns a new frame, so no extra copy is needed.
X, y = df.drop('Outcome', axis=1), df.Outcome

# Stratify on the label so both parts keep the same class ratio.
X_train_df, X_test_df, y_train_sr, y_test_sr = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED, stratify=y)

# Scale every feature to [0, 1] using min/max learned from the training rows only,
# then re-wrap as DataFrames because to_tensor() reads `.values`.
scaler = MinMaxScaler().fit(X_train_df)
X_train_df = pd.DataFrame(scaler.transform(X_train_df),
                          columns=X.columns, index=X_train_df.index)
X_test_df = pd.DataFrame(scaler.transform(X_test_df),
                         columns=X.columns, index=X_test_df.index)

X_train, X_test, y_train, y_test = to_tensor(X_train_df, X_test_df, y_train_sr, y_test_sr)

# Show shapes rather than dumping the tensors themselves.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier: 8 inputs -> 16 -> 8 -> 4 -> 1 sigmoid output."""

    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the output of the stack for ``x``; the last dimension has size 1."""
        # The stack ends in a Sigmoid, so these values are probabilities, not logits.
        return self.linear_relu_stack(x)


# Seed the weight initialisation so a fresh run starts from the same parameters.
torch.manual_seed(42)

# Use the MPS backend when it is available and fall back to the CPU otherwise,
# instead of failing on machines without MPS.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
model = NeuralNetwork().to(device)
print(model)

_bce = nn.BCELoss()


def loss_fn(pred, target):
    """Binary cross-entropy between sigmoid outputs and 0/1 targets.

    CrossEntropyLoss is a multi-class loss and does not fit a single sigmoid
    output. Both tensors are flattened so that the model's ``(N, 1)`` output can
    be compared with an ``(N,)`` target vector.
    """
    return _bce(pred.reshape(-1), target.reshape(-1).to(pred.dtype))


optimizer = torch.optim.RMSprop(model.parameters())

In [None]:
def train(X, y, model, loss_fn, optimizer, i):
    """Run one optimisation step on the batch ``(X, y)`` and return the loss.

    Parameters
    ----------
    X, y : torch.Tensor
        Inputs and targets; both are moved to the device the model lives on.
    model : torch.nn.Module
        Model to update; it is switched to training mode.
    loss_fn : callable
        Called as ``loss_fn(model(X), y)``; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer holding the model's parameters.
    i : int
        Step index; the loss is printed when ``i`` is a multiple of 100.

    Returns
    -------
    float
        The loss value of this step.
    """
    model.train()

    # Follow the model's device instead of hard-coding 'mps', so the same code
    # runs on CPU-only machines. A parameter-free model keeps X's own device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else X.device
    X, y = X.to(device), y.to(device)

    # Compute prediction error
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    if i % 100 == 0:
        print(f"loss: {loss_value:>7f}")

    return loss_value


def test(X, y, model, loss_fn, i):
    model.eval()

    test_loss, correct = 0, 0

    X, y = X.to('mps'), y.to('mps')
    pred = model(X)
    test_loss += loss_fn(pred, y).item()
    correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    correct /= X.shape[0]

    if i % 100:
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


epochs = 1000
for t in range(epochs):
    # Print the header on every 100th epoch, the same steps on which train()
    # prints its loss. The bare `t % 100` test was inverted and printed on
    # every epoch except those.
    if t % 100 == 0:
        print(f"Epoch {t+1}\n-------------------------------")

    train(X_train, y_train, model, loss_fn, optimizer, t)
    test(X_test, y_test, model, loss_fn, t)

print("Done!")