In [6]:
import os
import sys

class Location:
    """Specify the locations of inputs and outputs.

    Every attribute is computed once, when the class body executes, starting
    from ``sys.path[0]``. Path segments are combined with ``os.path.join`` so
    the platform's own separator is used; on Windows the resulting strings are
    the same as with hard-coded backslashes, and on other platforms the paths
    stay valid instead of turning into one file name containing ``\\``.

    Attributes
    ----------
    script_dir : str
        Parent of the absolute form of ``sys.path[0]``.
    src_level_dir : str
        Parent of ``script_dir``.
    top_level_dir : str
        Parent of ``src_level_dir``.
    data_dir : str
        ``<top_level_dir>/AutomatingAnalysisModelsAndMisprediction/data``.
    data_raw : str
        Raw CSV file inside ``data_dir``.
    data_process : str
        Pickle file for the processed split inside ``data_dir``.
    """

    # Parent of the (absolute) directory named by sys.path[0].
    # NOTE(review): presumably sys.path[0] is the notebook/script directory;
    # confirm for the environment this runs in.
    script_dir = os.path.dirname(os.path.abspath(sys.path[0]))

    # Navigate up two more levels to the top-level directory
    src_level_dir = os.path.dirname(script_dir)
    top_level_dir = os.path.dirname(src_level_dir)

    # Data directory, built segment by segment so it is portable
    data_dir = os.path.join(
        top_level_dir, "AutomatingAnalysisModelsAndMisprediction", "data"
    )

    data_raw: str = os.path.join(data_dir, "raw", "company_bankruptcy.csv")
    data_process: str = os.path.join(data_dir, "processed", "company_bankruptcy.pkl")
    

In [4]:
"""utils function"""
 
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd


def getProcessedData(file_path: str):
    """Load a pickled train/test split and hand back its four parts.

    Parameters
    ----------
    file_path : str
        Path of the pickle file holding a mapping with the keys
        ``"X_train"``, ``"X_test"``, ``"y_train"`` and ``"y_test"``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, in that order.

    Raises
    ------
    KeyError
        If one of the four keys is missing from the loaded mapping.
    """
    # Caution: unpickling can execute arbitrary code, so only load files
    # that come from a trusted source.
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)

    wanted_keys = ("X_train", "X_test", "y_train", "y_test")
    return tuple(payload[key] for key in wanted_keys)


def getUnprocessedData(url: str):
    """Read the CSV found at ``url`` and return it as a pandas DataFrame.

    Parameters
    ----------
    url : str
        Location of the CSV, passed straight to ``pd.read_csv``.
    """
    return pd.read_csv(url)


def save_processed_data(data: dict, save_location: str):
    """Pickle ``data`` to ``save_location``.

    The destination's parent directory is created when it does not exist yet,
    so a missing output folder no longer aborts the save with
    ``FileNotFoundError``.

    Parameters
    ----------
    data : dict
        Data to save
    save_location : str
        Where to save the data
    """
    import os  # local import: the surrounding cell does not import os

    parent_dir = os.path.dirname(save_location)
    # A bare file name has no parent component; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_location, "wb") as f:
        pickle.dump(data, f)


def split_train_test(X: pd.DataFrame, y: pd.DataFrame, test_size: float, random_state: int = 42):
    """Split features and target into train and test portions.

    Parameters
    ----------
    X : pd.DataFrame
        Features
    y : pd.DataFrame
        Target
    test_size : float
        Size of the test set, forwarded to ``train_test_split`` (scikit-learn
        accepts either a fraction of the data or an absolute row count).
    random_state : int, optional
        Seed forwarded to ``train_test_split`` so the split is reproducible.
        Defaults to 42, the value that was previously hard-coded.

    Returns
    -------
    dict
        Mapping with the keys ``"X_train"``, ``"X_test"``, ``"y_train"`` and
        ``"y_test"``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }


def get_raw_data(data_location: str):
    """Load the raw dataset from a CSV file.

    Parameters
    ----------
    data_location : str
        Location of the CSV, passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed file contents.
    """
    raw_frame = pd.read_csv(data_location)
    return raw_frame

In [7]:
"""Python script to process the data"""
from imblearn.over_sampling import SMOTE

def process():
    """Flow to process the data.

    Reads the raw CSV at ``Location.data_raw``, separates the ``"Bankrupt?"``
    label column from the features, splits them into train and test portions,
    oversamples the training portion with SMOTE, and pickles the resulting
    split to ``Location.data_process``.

    The split is done *before* oversampling: SMOTE creates synthetic rows by
    interpolating between existing ones, so resampling the full dataset first
    would leak information from test rows into the training set and put
    synthetic rows into the test set. The saved test portion therefore keeps
    the original class distribution.
    """
    # Load the company bankruptcy dataset into a dataframe.
    bankrupt_df = get_raw_data(Location.data_raw)

    # Extract the features and labels
    X = bankrupt_df.drop(["Bankrupt?"], axis=1)
    Y = bankrupt_df["Bankrupt?"]

    # Hold out the test rows first so they never influence the oversampler.
    # NOTE(review): ProcessConfig is not defined in the cells visible here; it
    # has to exist in the kernel before this function is called.
    split_data = split_train_test(X, Y, ProcessConfig.test_size)

    # The labels are strongly unbalanced (per the original author's
    # exploration), so oversample the minority class, training rows only.
    # The sampler is seeded so repeated runs save the same data.
    over_sample = SMOTE(random_state=42)
    split_data["X_train"], split_data["y_train"] = over_sample.fit_resample(
        split_data["X_train"], split_data["y_train"]
    )

    save_processed_data(split_data, Location.data_process)
    
# Run the preprocessing flow when this cell executes; process() ends by
# saving the split to Location.data_process.
process()