## Load libraries

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Handling class imbalance in the first dataset

In [2]:
# Load the first dataset, drop every row that contains at least one missing
# value, and cast all columns to float.
# `how="any"` is deliberate: `how="all"` would only remove rows that are
# entirely empty, so partially empty rows would keep their NaNs and pass them
# on to the cells below.
data = (
    pd.read_csv('../data/db1.csv')
    .dropna(how="any")
    .astype(float)
)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [None]:
# Target is the diabetes label; every remaining column is used as a feature.
y = data['Diabetes_binary']
X = data.drop(columns='Diabetes_binary')

# Hold out 20% of the rows for testing. The split is stratified on y because
# the dataset is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


### Apply SMOTE

In [None]:
# Resample the training split only; X_test / y_test are not touched here.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_sampling, y_train_sampling = smote.fit_resample(X_train, y_train)

# Show the relative class frequencies before and after resampling.
print(
    "Before SMOTE:\n",
    y_train.value_counts(normalize=True),
)
print(
    "After SMOTE:\n",
    y_train_sampling.value_counts(normalize=True),
)

### Scaling after using SMOTE

In [None]:
# Fit the scaler on the resampled training features only, then apply that same
# fitted transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train_sampling)
X_train_scaled = scaler.transform(X_train_sampling)
X_test_scaled = scaler.transform(X_test)


### Fit the full pipeline and save it with joblib

In [None]:
# Bundle resampling, scaling and the classifier into a single pipeline so the
# saved artifact contains every fitted step. It is fitted on the raw training
# split because the pipeline runs its own SMOTE and scaling steps.
pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])
pipe.fit(X_train, y_train)

# Make sure the target directory exists before saving: joblib.dump does not
# create missing parent directories, so the cell would otherwise fail when
# ../models is absent.
MODEL_PATH = "../models/pipeline_first_dataset.pkl"
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(pipe, MODEL_PATH)