In [1]:
import os
import sys

class Location:
    """Specify the locations of inputs and outputs.

    Every path is derived from ``sys.path[0]`` at the moment the class body
    is executed and is assembled with ``os.path.join`` so that it uses the
    path separator of the platform the code runs on.

    Attributes
    ----------
    script_dir : str
        Parent directory of the absolute form of ``sys.path[0]``.
    src_level_dir : str
        Parent directory of ``script_dir``.
    top_level_dir : str
        Parent directory of ``src_level_dir``.
    data_dir : str
        ``AutomatingAnalysisModelsAndMisprediction/data`` below ``top_level_dir``.
    data_raw : str
        Path of the raw CSV input.
    data_process : str
        Path of the pickled, processed output.
    """

    # Parent of the first sys.path entry.
    # NOTE(review): presumably sys.path[0] is the directory of the launching
    # script/notebook, which makes this one level above it - confirm.
    script_dir = os.path.dirname(os.path.abspath(sys.path[0]))

    # Navigate two more levels up to reach the top-level directory
    src_level_dir = os.path.dirname(script_dir)
    top_level_dir = os.path.dirname(src_level_dir)

    # Data directory, built component by component so no separator is hard-coded
    data_dir = os.path.join(
        top_level_dir, "AutomatingAnalysisModelsAndMisprediction", "data"
    )

    data_raw: str = os.path.join(data_dir, "raw", "creditcard.csv")
    data_process: str = os.path.join(data_dir, "processed", "creditcard.pkl")
    
class ProcessConfig:
    """Settings consumed by the `process` flow."""

    # Name of the target column in the dataset
    label: str = "Class"

    # Test-set size handed to the train/test split
    test_size: float = 0.2

In [2]:
"""utils function"""
 
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd


def getProcessedData(file_path: str):
    """Load the pickled train/test split stored at ``file_path``.

    The file must contain a mapping with the keys ``"X_train"``,
    ``"X_test"``, ``"y_train"`` and ``"y_test"``; their values are returned
    as a tuple in exactly that order.
    """
    # NOTE: unpickling can execute arbitrary code - only open trusted files.
    with open(file_path, 'rb') as handle:
        stored = pickle.load(handle)

    wanted_keys = ("X_train", "X_test", "y_train", "y_test")
    return tuple(stored[key] for key in wanted_keys)


def getUnprocessedData(url: str):
    """Read the CSV found at ``url`` and return it as a DataFrame."""
    return pd.read_csv(url)


def save_processed_data(data: dict, save_location: str):
    """Pickle ``data`` into the file at ``save_location``.

    Parameters
    ----------
    data : dict
        Object to serialize
    save_location : str
        Path of the file to write; it is opened in binary write mode, so an
        existing file is overwritten
    """
    with open(save_location, "wb") as sink:
        pickle.Pickler(sink).dump(data)


def get_X_y(data: pd.DataFrame, label: str):
    """Separate a frame into its feature columns and its label column.

    Returns ``(X, y)``: a copy of ``data`` without the ``label`` column,
    followed by the ``label`` column itself.
    """
    features = data.drop(columns=label)
    target = data[label]
    return features, target


def split_train_test(X: pd.DataFrame, y: pd.Series, test_size: float, random_state: int = 42):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    X : pd.DataFrame
        Features
    y : pd.Series
        Target
    test_size : float
        Size of the test set, forwarded unchanged to ``train_test_split``
        (there a float is a proportion of the dataset and an int an absolute
        number of samples)
    random_state : int, default 42
        Seed forwarded to ``train_test_split``; the default reproduces the
        split that was previously hard-coded

    Returns
    -------
    dict
        The four partitions under the keys ``"X_train"``, ``"X_test"``,
        ``"y_train"`` and ``"y_test"``
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }


def get_raw_data(data_location: str):
    """Load the raw CSV stored at ``data_location`` into a DataFrame."""
    raw_frame = pd.read_csv(data_location)
    return raw_frame

In [4]:
"""Python script to process the data"""
from imblearn.over_sampling import SMOTE

def process():
    """Flow to process the data.

    Steps
    -----
    1. Read the raw CSV from ``Location.data_raw``.
    2. Coerce the label column to numeric and run ``pd.get_dummies`` over the
       frame.
    3. Split features and label into train and test partitions.
    4. Oversample the *training* partition only with SMOTE.
    5. Pickle the resulting split to ``Location.data_process``.

    The split happens before oversampling on purpose: resampling the whole
    dataset first would let synthetic samples built from test rows end up in
    the training set, and would put synthetic rows in the test set.
    """
    data = get_raw_data(Location.data_raw)

    # Cleaning notes carried over from the original analysis: the dataset is
    # reported to contain no null values and every feature is kept, so no
    # rows or columns are dropped here.

    # Convert the label column to numeric: 0 - non-fraudulent, 1 - fraudulent.
    # errors='coerce' turns unparseable values into NaN instead of raising.
    data[ProcessConfig.label] = pd.to_numeric(
        data[ProcessConfig.label], errors='coerce'
    )

    processed = pd.get_dummies(data)

    X, y = get_X_y(processed, ProcessConfig.label)

    # Hold out the test partition before any resampling takes place.
    split_data = split_train_test(X, y, ProcessConfig.test_size)

    # Seeded so that repeated runs write the same training data.
    over_sample = SMOTE(random_state=42)
    X_train_res, y_train_res = over_sample.fit_resample(
        split_data["X_train"], split_data["y_train"]
    )
    split_data["X_train"] = X_train_res
    split_data["y_train"] = y_train_res

    save_processed_data(split_data, Location.data_process)
    
# Run the processing flow when this cell is executed.
process()

NameError: name 'X' is not defined