<a href="https://colab.research.google.com/github/maulairfani/DataScience-Learning-Notebooks/blob/main/functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve


def plot_roc_curve(model, y_test, y_pred, X=None):
    """Plot the ROC curve of a fitted binary classifier and show its AUC.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the positive-class score.
        y_test: True binary labels of the evaluation samples.
        y_pred: Kept only for backward compatibility. It is no longer used:
            the AUC in the legend is computed from the same scores as the
            plotted curve, so the two cannot disagree (they did whenever
            hard class labels were passed here).
        X: Feature matrix to score. When omitted, the module-level
            ``X_test`` is used, which is what the original code always did.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Fall back to the global only when no features were passed explicitly.
    features = X_test if X is None else X

    # Score once and reuse the result for both the curve and the AUC.
    scores = model.predict_proba(features)[:, 1]

    # False positive rate and true positive rate at every threshold.
    fpr, tpr, _ = roc_curve(y_test, scores)

    # Area under the curve that is actually being plotted.
    roc_auc = roc_auc_score(y_test, scores)

    plt.figure(figsize=(5, 4))
    plt.plot(fpr, tpr, color='darkorange', label=f'ROC Curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line: the expected curve of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

def read_data(data_dir="/content"):
    """Download the competition archive and return features joined with labels.

    The original used the IPython ``!wget`` shell escape, which is not valid
    Python outside a notebook and saved the file under the name
    ``dac-find-it-2023.zip?raw=true`` in the current directory. The download
    now goes through ``urllib`` to an explicit path inside ``data_dir``.

    Args:
        data_dir: Directory that receives the archive and its extracted
            contents. Defaults to ``/content``, the path the original
            hard-coded.

    Returns:
        DataFrame read from ``training_set_features.csv`` with the
        ``h1n1_vaccine`` and ``seasonal_vaccine`` columns copied over from
        ``training_set_labels.csv``.
    """
    import urllib.request
    import zipfile
    from pathlib import Path

    import pandas as pd

    url = "https://github.com/maulairfani/DataScience-Learning-Notebooks/blob/main/dac-find-it-2023.zip?raw=true"

    target_dir = Path(data_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    archive_path = target_dir / "dac-find-it-2023.zip"

    # Download dataset
    urllib.request.urlretrieve(url, archive_path)
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

    df = pd.read_csv(target_dir / "training_set_features.csv")
    df_label = pd.read_csv(target_dir / "training_set_labels.csv")

    # Column assignment aligns on the row index, so this relies on both
    # files listing respondents in the same order.
    df["h1n1_vaccine"] = df_label["h1n1_vaccine"]
    df["seasonal_vaccine"] = df_label["seasonal_vaccine"]

    return df

# Version 1

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

def preprocessing_df_train(df_train, y):
    """Build scaled, class-balanced train/test splits for one vaccine target.

    Steps, in order: drop the other target column, drop four feature
    columns, fill missing values with the per-target-class mode,
    ordinal-encode ``education`` and ``age_group``, one-hot encode the
    remaining categorical columns, normalise column names, split 80/20,
    min-max scale (fitted on the training part only) and oversample the
    training part with SMOTE.

    NOTE(review): imputation groups rows by the target and runs before the
    split, so held-out rows are filled using label information.

    Args:
        df_train: Labelled frame containing both ``h1n1_vaccine`` and
            ``seasonal_vaccine`` plus ``respondent_id``, ``education`` and
            ``age_group``. It is not modified; every step works on a copy.
        y: Name of the target column. ``"h1n1_vaccine"`` selects that
            target; any other value is treated as the seasonal target.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, scaler)`` where the
        feature matrices are the scaler's transformed output, ``X_train`` and
        ``y_train`` have been resampled by SMOTE, and ``scaler`` is the
        fitted ``MinMaxScaler``.
    """
    # Keep only the requested target; the other label must not leak in as a feature.
    other_target = "seasonal_vaccine" if y == "h1n1_vaccine" else "h1n1_vaccine"
    df_train = df_train.drop(other_target, axis=1)

    # Drop selected columns (ignored when already absent).
    df_train = df_train.drop(
        ["health_insurance", "income_poverty", "employment_occupation", "employment_industry"],
        axis=1,
        errors='ignore',
    )

    # Fill missing values with the mode of the column within the same target class.
    cols_to_impute = [col for col in df_train.columns if col != y]
    grouped_data = df_train.groupby(y)
    df_train[cols_to_impute] = grouped_data[cols_to_impute].transform(
        lambda x: x.fillna(x.mode()[0])
    )

    # Label encoding for ordinal variables.
    education_map = {
        "< 12 Years": 0,
        "12 Years": 1,
        "Some College": 2,
        "College Graduate": 3,
    }
    df_train["education"] = df_train["education"].map(education_map)

    age_map = {
        '18 - 34 Years': 0,
        '35 - 44 Years': 1,
        '45 - 54 Years': 2,
        '55 - 64 Years': 3,
        '65+ Years': 4,
    }
    df_train["age_group"] = df_train["age_group"].map(age_map)

    # One-hot encoding for nominal variables.
    df_train = pd.get_dummies(df_train)

    # Normalise the generated column names: lower-case, spaces and hyphens
    # become underscores, commas are removed. regex=False keeps the
    # replacements literal on every pandas version.
    df_train.columns = (
        df_train.columns.str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('-', '_', regex=False)
        .str.replace(',', '', regex=False)
    )

    X = df_train.drop(['respondent_id', y], axis=1)
    target = df_train[y]

    X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

    # Fit the scaler on the training part only, then apply it to both parts.
    scaler = MinMaxScaler()
    scaler = scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Oversample the minority class in the training part only.
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test, scaler