## 1.2 Install and load libraries

In [None]:
# Install the Weights & Biases client, which is imported below as `wandb`
!pip install wandb

In [None]:
import wandb
import pandas as pd
import numpy as np
import tempfile
import logging
import os

## 1.3 Preprocessing

### 1.3.1 Download raw_data artifact from Wandb

In [None]:
# Log in to Weights & Biases from the notebook shell.
# NOTE(review): `--relogin` is expected to force a fresh credential prompt even
# when a login is already cached — confirm against the wandb CLI docs.
!wandb login --relogin

In [None]:
# Reference of the raw-data artifact handed to run.use_artifact()
input_artifact = "diabetes_decision_tree/raw_data.csv:latest"

# Settings of the artifact produced by this preprocessing step
artifact_name = "preprocessed_data.csv"
artifact_type = "clean_data"
artifact_description = "Data after preprocessing"

### 1.3.2 Set up your wandb project and clean the dataset

In [None]:
# Start a W&B run in the project; job_type tags it as the data-processing step
run = wandb.init(
    project="diabetes_decision_tree",
    job_type="process_data",
)

In [None]:
# Fetch the raw-data artifact referenced by `input_artifact` through this run
artifact = run.use_artifact(input_artifact)

# artifact.file() supplies the location that pandas reads the CSV from
raw_data_path = artifact.file()
df = pd.read_csv(raw_data_path)

In [None]:
# Remove fully duplicated rows, modifying the dataframe in place
df.drop_duplicates(inplace=True)

# Write the de-duplicated data to a local CSV named after the output artifact,
# leaving the dataframe index out of the file
df.to_csv(path_or_buf=artifact_name, index=False)

In [None]:
# Earlier pd.cut-based variant (plus a SkinThickness flag), left disabled:
#df['New_Glucose_Class'] = pd.cut(x=df['Glucose'], bins=[0,139,200],labels = ["Normal","Prediabetes"])
#df['New_BMI_Range'] = pd.cut(x=df['BMI'], bins=[0,18.5,24.9,29.9,100],labels = ["Underweight","Healty","Overweight","Obese"])
#df['New_BloodPressure'] = pd.cut(x=df['BloodPressure'], bins=[0,79,89,123],labels = ["Normal","HS1","HS2"])
#df['New_SkinThickness'] = df['SkinThickness'].apply(lambda x: 1 if x <= 18.0 else 0)

# BMI bucket: the first threshold a value falls under wins; values under none
# of them (including NaN, which fails every comparison) get the default label.
bmi = df['BMI']
df['New_BMI_Range'] = np.select(
    [bmi < 18.5, bmi < 24.9, bmi < 29.9],
    ["Underweight", "Healty", "Overweight"],
    default="Obese",
)

# Glucose class: a single cut-off at 139
glucose_below_cutoff = df['Glucose'] < 139
df['New_Glucose_Class'] = np.where(glucose_below_cutoff, "Normal", "Prediabetes")

# Blood-pressure bucket: same first-match logic as the BMI bucket
pressure = df['BloodPressure']
df['New_BloodPressure'] = np.select(
    [pressure < 79, pressure < 89],
    ["Normal", "HS1"],
    default="HS2",
)

# Preview the dataframe with the new columns
df.head()

In [None]:
# Show the dtype of every column, including the derived New_* columns
df.dtypes

In [None]:
def one_hot_encoder(dataframe, categorical_columns, nan_as_category=False):
    """One-hot encode the given columns and report which columns were created.

    Args:
        dataframe: DataFrame to encode; it is not modified.
        categorical_columns: Column names forwarded to ``pd.get_dummies``
            as ``columns``.
        nan_as_category: Forwarded as ``dummy_na``; when true, an extra
            indicator column for missing values is generated.

    Returns:
        A ``(encoded, created)`` tuple: the encoded DataFrame and the list of
        column names present in it that were not in the input, in the order
        they appear in the encoded frame.
    """
    columns_before = list(dataframe.columns)

    # drop_first=True omits the indicator of the first level of each column
    encoded = pd.get_dummies(
        dataframe,
        columns=categorical_columns,
        dummy_na=nan_as_category,
        drop_first=True,
    )

    created = []
    for name in encoded.columns:
        if name not in columns_before:
            created.append(name)
    return encoded, created

In [None]:
# Preview the first rows of the dataframe.
# NOTE(review): one_hot_encoder is defined but not applied to df in the cells
# shown here — confirm whether encoding was intended before the upload.
df.head()

In [None]:
# Configure logging so that progress messages carry a timestamp
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(message)s",
                    datefmt='%d-%m-%Y %H:%M:%S')

# Root logger, used for the progress message below
logger = logging.getLogger()

# Write the processed dataframe into a temporary directory and upload it as a
# W&B artifact; the directory is removed when the `with` block exits.
with tempfile.TemporaryDirectory() as tmp_dir:
    temp_path = os.path.join(tmp_dir, artifact_name)
    df.to_csv(temp_path, index=False)

    artifact = wandb.Artifact(
        name=artifact_name,
        type=artifact_type,
        # Use the description configured next to artifact_name/artifact_type
        # rather than a separate hard-coded string.
        description=artifact_description,
    )
    artifact.add_file(temp_path)

    logger.info("Logging artifact")
    run.log_artifact(artifact)

    # This waits for the artifact to be uploaded to W&B. If you
    # do not add this, the temp directory might be removed before
    # W&B had a chance to upload the datasets, and the upload
    # might fail
    artifact.wait()

In [None]:
# Upload the artifact to Wandb
# NOTE(review): the previous cell already calls run.log_artifact(artifact) and
# artifact.wait(); this second call looks redundant — confirm whether it is needed.
run.log_artifact(artifact)

In [None]:
# Close the run.
# Wait a while after running the previous cell before executing this one.
run.finish()