## --- 0.0. Importing libraries ---

In [1]:
# ignore warnings
# NOTE(review): this silences every warning, including deprecation notices
# from pandas / IPython; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# set notebook width to 100%
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# dataframe manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# train test split
from sklearn.model_selection import train_test_split

# Path
import os 
from pathlib import Path

# Switch the working directory to the parent of the directory the kernel was
# started in, so the relative `data/...` paths used below resolve from there.
# The target is computed only once per kernel session: re-running this cell
# must not climb one directory higher on every execution.
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = Path(os.getcwd()).parent
os.chdir(_PROJECT_ROOT)

## --- 0.1. Parameters ---

### ------ 0.1.1. Independent parameters ------

In [2]:
# Stem of the raw CSV under data/raw_data/ and name of the use-case folder
# under data/prepared_data/ that receives the prepared splits.
file_name, uc_name = "diabetes_prediction_dataset", "diabetes"

# Label column that is separated from the features before splitting.
target_column = "diabetes"

# Random state and hold-out fraction passed to train_test_split.
seed, test_size = 100, 0.3

# 1. Data Loading

In [3]:
# Read the raw dataset from data/raw_data/<file_name>.csv.
# low_memory=False makes pandas parse the file as a whole instead of in
# internal chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(f'data/raw_data/{file_name}.csv',
                 low_memory=False)

# Recode the 0/1 indicator columns as 'No'/'Yes' labels.
# The result is assigned back to the frame on purpose: calling
# `df[col].replace(..., inplace=True)` operates on a selected column
# (chained assignment), which recent pandas versions warn about and which
# leaves `df` unchanged once Copy-on-Write is enabled.
for flag_column in ('hypertension', 'heart_disease'):
    df[flag_column] = df[flag_column].replace(to_replace={0: 'No', 1: 'Yes'})

display(df.head())
print(f"Shape of the data: {df.shape}.")

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Male,43.0,No,No,never,40.43,4.8,145,0
1,Female,30.0,Yes,No,current,31.0,5.7,200,0
2,Male,45.0,No,No,ever,27.47,5.8,126,0
3,Male,58.0,No,Yes,No Info,29.05,6.5,159,0
4,Male,8.0,No,No,never,18.53,6.6,159,0


Shape of the data: (10000, 9).


# 2. Data Transformation

In [4]:
# Keep only the rows whose gender is 'Male' or 'Female' and renumber the
# index from 0 so it is contiguous again after the filter.
is_male_or_female = df['gender'].isin(['Male', 'Female'])
df = df.loc[is_male_or_female].reset_index(drop=True)

# Collapse the 'former' and 'not current' smoking categories into 'ever'
# ('ever' itself is left as is) and normalise 'No Info' to lower case.
# Every other value passes through unchanged.
df['smoking_history'] = df['smoking_history'].replace(
    to_replace={
        'former': 'ever',
        'not current': 'ever',
        'No Info': 'no info',
    }
)

display(df.head())
print(f"Shape of the data: {df.shape}.")

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Male,43.0,No,No,never,40.43,4.8,145,0
1,Female,30.0,Yes,No,current,31.0,5.7,200,0
2,Male,45.0,No,No,ever,27.47,5.8,126,0
3,Male,58.0,No,Yes,no info,29.05,6.5,159,0
4,Male,8.0,No,No,never,18.53,6.6,159,0


Shape of the data: (9999, 9).


# 3. Train/Test Split

In [5]:
# Separate the label column from the feature columns.
y = df[target_column].copy()
X = df.drop(columns=target_column)

# Split into train and test parts, stratified on the label so both parts keep
# its class proportions; `seed` fixes the shuffle. Each returned part gets a
# fresh 0-based index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=seed,
        stratify=y,
    )
)

print(f"Shape -> X_train: {X_train.shape}, y_train: {y_train.shape}.")
print(f"Shape -> X_test: {X_test.shape}, y_test: {y_test.shape}.")

Shape -> X_train: (6999, 8), y_train: (6999,).
Shape -> X_test: (3000, 8), y_test: (3000,).


# 4. Save Prepared Data

In [6]:
# Write the four splits as CSV files (without the index column) into
# data/prepared_data/<uc_name>/.
prepared_dir = Path(f'data/prepared_data/{uc_name}')

# `to_csv` does not create missing directories, so make sure the target
# folder exists before writing; this is a no-op when it is already there.
prepared_dir.mkdir(parents=True, exist_ok=True)

# The dict preserves insertion order, so files are written (and reported) in
# the order X_train, y_train, X_test, y_test.
splits_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}

for split_name, split_data in splits_to_save.items():
    split_data.to_csv(prepared_dir / f'{split_name}.csv',
                      index=False)
    print(f"{split_name}.csv is successfully saved!")

X_train.csv is successfully saved!
y_train.csv is successfully saved!
X_test.csv is successfully saved!
y_test.csv is successfully saved!
