## 1.2 Install and load libraries

In [None]:
!pip install pandas-profiling==3.1.0

In [None]:
!pip install wandb

In [None]:
import wandb
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import tempfile
import os
%matplotlib inline

## 1.3 Exploratory Data Analysis (EDA)

### 1.3.1 Download raw_data artifact from Wandb

In [None]:
# Log in to Weights & Biases through its command-line client.
# NOTE(review): --relogin presumably forces a fresh credential prompt even when a
# login is already cached — confirm against `wandb login --help`.
!wandb login --relogin

In [None]:
# Start a W&B run in the "diabetes_decision_tree" project; `run` is used by the
# cells below to fetch the data artifact and to log figures.
# NOTE(review): save_code=True presumably makes W&B store this notebook's code
# with the run — confirm against the wandb.init documentation.
run = wandb.init(project="diabetes_decision_tree", save_code=True)

In [None]:
# Reference the latest version of the raw_data.csv artifact through the current run.
artifact = run.use_artifact("diabetes_decision_tree/raw_data.csv:latest")

# Whatever artifact.file() returns is handed to read_csv as the CSV location.
raw_data_path = artifact.file()
df = pd.read_csv(raw_data_path)

In [None]:
# Preview the first rows of the loaded dataframe.
df.head()

In [None]:
# Print a concise summary of the dataframe (columns, dtypes, non-null counts).
df.info()

In [None]:
# Summary statistics, transposed so that each dataframe column becomes a row.
df.describe().T

### 1.3.2 Pandas Profiling

In [None]:
# Build the profiling report and leave it as the cell's last expression so the
# notebook displays it.
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
profile

### 1.3.3 Manual EDA

In [None]:
# Are there duplicated rows? Count the rows flagged by duplicated().
df.duplicated().sum()

0

In [None]:
# Delete duplicated rows by rebinding `df` rather than mutating it in place
# (avoids inplace=True), then confirm that no duplicates remain.
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
# Outcome vs. Pregnancies: cross-tabulation with margins, normalized
# (normalize=True) rather than shown as raw counts.
pd.crosstab(
    index=df["Outcome"],
    columns=df["Pregnancies"],
    margins=True,
    normalize=True,
)

In [None]:
# Is the data balanced? Pie chart of the share of each Outcome class.
outcome_counts = df.Outcome.value_counts()  # computed once, looked up by class label
labels = ['Healthy', 'Diabetic']
# Label 0 is paired with 'Healthy' and label 1 with 'Diabetic'.
values = [outcome_counts[0], outcome_counts[1]]

plt.figure(figsize=(6, 6))
plt.pie(values, labels=labels, autopct='%0.0f%%')
plt.title('Outcome class balance')  # lets the figure stand on its own
plt.show()

In [None]:
# Pairwise correlation between the dataframe's columns, drawn as an
# unannotated heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=False, cmap='Blues')

In [None]:
# Correlation matrix restricted to the k columns whose correlation with
# Outcome is largest.
k = 9  # number of variables shown in the heatmap
full_corr = df.corr()
cols = full_corr.nlargest(k, 'Outcome')['Outcome'].index
cm = df[cols].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(cm, cmap='viridis', annot=True)

In [None]:
# How the data is distributed: one histogram per column.
# pandas builds the subplot grid itself here so that figsize=(15, 15) is applied.
# Passing a single pre-made Axes to DataFrame.hist (as the previous version did)
# makes pandas clear that Axes' figure with a UserWarning and ignore figsize.
axes = df.hist(figsize=(15, 15))
fig = axes.flat[0].get_figure()  # all histogram axes share this figure
fig.savefig('featureHist.png', dpi=120)

# Log the saved image to the W&B run under the "FeaturesHist" key.
run.log({"FeaturesHist": wandb.Image("featureHist.png")})

In [None]:
# 3D scatter of BMI x Glucose x BloodPressure, one point series per Outcome value.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for outcome_value in df.Outcome.unique():
    # Select the rows of this Outcome value once and reuse the mask for each axis.
    is_outcome = df.Outcome == outcome_value
    ax.scatter(
        df.BMI[is_outcome],
        df.Glucose[is_outcome],
        df.BloodPressure[is_outcome],
        label=outcome_value,
    )

ax.set_xlabel('BMI')
ax.set_ylabel('Glucose')
ax.set_zlabel('Blood Pressure')
ax.legend()

# Save the figure to disk, then log the saved image to the W&B run.
ax.figure.savefig('BMIxGlucosexBloodPressure.png')
run.log({"BMIxGlucosexBloodPressure": wandb.Image("BMIxGlucosexBloodPressure.png")})

In [None]:
# One categorical plot per feature column, with Outcome on the x axis.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# made them keyword-only in 0.12, where the positional form raises a TypeError.
for col in df.columns:
    if col != "Outcome":
        sns.catplot(x="Outcome", y=col, data=df)

In [None]:
# Number of missing (null) values in each column.
df.isnull().sum()

In [None]:
# Close the W&B run that was opened with wandb.init above.
run.finish()