In [1]:
import os
import sys
import pandas as pd

import pickle
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

In [2]:

class Location:
    """Specify the locations of inputs and outputs.

    Every path is derived from ``sys.path[0]`` at the moment this class body
    is executed. Paths are assembled with ``os.path.join`` so that they use
    the separator of the operating system the notebook runs on (hard-coded
    backslashes only form valid paths on Windows).
    """

    # Parent directory of the first ``sys.path`` entry (made absolute first).
    # NOTE(review): this is one level ABOVE sys.path[0], not sys.path[0]
    # itself - confirm that matches the intended project layout.
    script_dir = os.path.dirname(os.path.abspath(sys.path[0]))

    # Walk two more levels up to reach the directory treated as the top level.
    src_level_dir = os.path.dirname(script_dir)

    top_level_dir = os.path.dirname(src_level_dir)

    # Data directory inside the project folder.
    data_dir = os.path.join(
        top_level_dir, "AutomatingAnalysisModelsAndMisprediction", "data"
    )

    # Raw input CSV, processed CSV and pickled train/test split.
    data_raw: str = os.path.join(data_dir, "raw", "customer_churn.csv")
    data_csv_process: str = os.path.join(data_dir, "processed", "customer_churn.csv")
    data_process: str = os.path.join(data_dir, "processed", "customer_churn.pkl")
    

In [3]:
"""Utility functions"""

def getProcessedData(file_path: str):
    """Load a pickled train/test split and return its four parts.

    The file at ``file_path`` is unpickled and the values stored under the
    keys ``"X_train"``, ``"X_test"``, ``"y_train"`` and ``"y_test"`` are
    returned as a tuple in exactly that order.

    Note: unpickling can execute arbitrary code, so only open files that were
    produced by a trusted source.
    """
    with open(file_path, 'rb') as source:
        stored_split = pickle.load(source)

    wanted_keys = ("X_train", "X_test", "y_train", "y_test")
    return tuple(stored_split[key] for key in wanted_keys)

def save_csv_data(df: pd.DataFrame, save_location: str) -> None:
    """Write a DataFrame to a CSV file, leaving out the index column.

    Parameters
    ----------
    df : pd.DataFrame
        Table to write
    save_location : str
        Path of the CSV file to write
    """
    df.to_csv(save_location, index=False)

def save_processed_data(data: dict, save_location: str):
    """Pickle the processed data to disk

    Parameters
    ----------
    data : dict
        Object to serialise
    save_location : str
        Path of the file to write; it is opened in binary write mode
    """
    with open(save_location, "wb") as sink:
        pickle.Pickler(sink).dump(data)


def get_X_y(data: pd.DataFrame, label: str):
    """Separate a table into features and label.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without the ``label`` column and
        ``y`` is ``data[label]``. ``data`` itself is not modified.
    """
    features = data.drop(labels=label, axis="columns")
    target = data[label]
    return features, target


def split_train_test(
    X: pd.DataFrame, y: pd.Series, test_size: float, random_state: int = 0
):
    """Split features and target into train and test parts.

    Parameters
    ----------
    X : pd.DataFrame
        Features
    y : pd.Series
        Target
    test_size : float
        Size of the test set, forwarded unchanged to ``train_test_split``
        (a fraction such as ``0.2``; an absolute row count is accepted by
        scikit-learn as well)
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to ``0``, the value
        that used to be hard-coded, so existing callers get the same split.

    Returns
    -------
    dict
        The four parts under the keys ``"X_train"``, ``"X_test"``,
        ``"y_train"`` and ``"y_test"``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }


def get_raw_data(data_location: str):
    """Load the CSV file at ``data_location`` into a DataFrame.

    The file is read with the default options of ``pd.read_csv``.
    """
    raw_frame = pd.read_csv(data_location)
    return raw_frame

In [4]:
class ProcessConfig:
    """Tunable parameters read by the `process` flow"""

    # Passed to the train/test split as its ``test_size``.
    test_size: float = 0.2
    # Name of the target column that is separated from the features.
    label: str = "Churn"

In [5]:
"""Python script to process the data"""


def _encode_churn_frame(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, encoded copy of the raw churn table (input untouched)."""
    # Two-valued text columns and the integer code assigned to each value.
    # Values missing from a mapping become NaN (``Series.map`` behaviour).
    binary_maps = {"gender": {"Male": 1, "Female": 0}}
    for column in ("Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"):
        binary_maps[column] = {"Yes": 1, "No": 0}

    churn_df = (
        raw_df
        # "TotalCharges" must be numeric; entries that cannot be parsed are
        # coerced to NaN so that the next step can drop those rows.
        .assign(
            TotalCharges=lambda df: pd.to_numeric(df["TotalCharges"], errors="coerce")
        )
        .dropna(subset=["TotalCharges"])
        # The customer ID is not used as a feature.
        .drop(columns="customerID")
    )
    churn_df = churn_df.assign(
        **{column: churn_df[column].map(codes) for column, codes in binary_maps.items()}
    )

    # One-hot encode the remaining categorical columns.
    return pd.get_dummies(churn_df)


def process():
    """Build the processed churn dataset and persist it.

    Steps:

    1. Read the raw CSV from ``Location.data_raw``.
    2. Clean and encode it, then write the result to
       ``Location.data_csv_process``.
    3. Separate features and label (``ProcessConfig.label``) and split them
       into train and test parts (``ProcessConfig.test_size``).
    4. Oversample the training part only with SMOTE.
    5. Pickle the resulting dict (keys ``X_train``, ``X_test``, ``y_train``,
       ``y_test``) to ``Location.data_process``.
    """
    churn_df = _encode_churn_frame(get_raw_data(Location.data_raw))
    save_csv_data(churn_df, Location.data_csv_process)

    X, y = get_X_y(churn_df, ProcessConfig.label)

    # Split BEFORE oversampling. Resampling the full dataset first would let
    # synthetic rows built from future test rows leak into training and would
    # put synthetic rows into the test set, inflating evaluation scores.
    split_data = split_train_test(X, y, ProcessConfig.test_size)

    # Fixed seed (same value the split uses) so that re-running the notebook
    # reproduces the same synthetic rows.
    over_sample = SMOTE(random_state=0)
    split_data["X_train"], split_data["y_train"] = over_sample.fit_resample(
        split_data["X_train"], split_data["y_train"]
    )

    save_processed_data(split_data, Location.data_process)


process()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte