In [1]:
import pandas as pd

def load_data(fname):
    """
    Load a comma-separated file into a DataFrame and report its dimensions.

    Parameters:
    fname (str): Path of the CSV file to read.

    Returns:
    pandas.DataFrame: The parsed contents of the file.
    """
    frame = pd.read_csv(fname, sep=",")

    # Echo (rows, columns) so the cell output doubles as a quick sanity check.
    dims = frame.shape
    print(f'Data shape               : {dims}')
    return frame

# Relative path to the raw credit-risk CSV file.
FNAME = './data/raw/credit_risk_dataset.csv'
# `data` is the full raw table; later cells split it into features/target.
data = load_data(fname=FNAME)

# Last expression of the cell: shows a preview of the first rows.
data.head()


Data shape               : (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [6]:
def split_input_output(data, target_col):
    """
    Separate a DataFrame into its feature columns and its target column.

    Parameters:
    data (pandas.DataFrame): Table holding both the features and the target.
    target_col (str): Name of the column that holds the target variable.

    Returns:
    tuple: A pair ``(X, y)`` where
        - X (pandas.DataFrame): ``data`` with the target column removed.
        - y (pandas.Series): The target column on its own.
    """
    # `drop` returns a new frame, so the caller's `data` is left untouched.
    features = data.drop(columns=target_col)
    target = data[target_col]

    # Report all three shapes so a mismatch is visible right in the cell output.
    print(f'Original data shape: {data.shape}')
    print(f'X data shape: {features.shape}')
    print(f'y data shape: {target.shape}')

    return features, target

# Column used as the target; split_input_output puts every other column in X.
TARGET_COL = 'loan_status'
X, y = split_input_output(data = data, target_col = TARGET_COL)

Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)


In [7]:
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size, random_state = None):
    """
    Splits the input features and target variable into training and testing sets.

    The split is stratified on `y` (``stratify = y`` is passed to
    ``train_test_split``), and the shapes of the four resulting objects are
    printed.

    Parameters:
    X (pandas.DataFrame): The DataFrame containing the input features.
    y (pandas.Series): The Series containing the target variable.
    test_size (float): The proportion of the dataset to include in the test split.
    random_state (int, optional): Seed forwarded to ``train_test_split`` for
        reproducibility. Defaults to None.

    Returns:
    tuple: A tuple containing four elements:
        - X_train (pandas.DataFrame): The DataFrame containing the training input features.
        - X_test (pandas.DataFrame): The DataFrame containing the testing input features.
        - y_train (pandas.Series): The Series containing the training target variable.
        - y_test (pandas.Series): The Series containing the testing target variable.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size = test_size,
                                                        stratify = y,
                                                        random_state = random_state)

    # No literal "f" before the placeholders: the f-string prefix already
    # handles interpolation, and a stray "f" would be printed as "f(rows, cols)".
    print(f'X train shape: {X_train.shape}')
    print(f'X test shape : {X_test.shape}')
    print(f'y train shape: {y_train.shape}')
    print(f'y test shape : {y_test.shape}\n')

    return X_train, X_test, y_train, y_test

# Seed passed to both splits below.
RANDOMSTATE = 42
# First split: hold out 20% of the rows (test_size = 0.2); the rest is the
# training set.
X_train, X_not_train, y_train, y_not_train = split_train_test(X = X,
                                                              y = y,
                                                              test_size = 0.2,
                                                              random_state = RANDOMSTATE)

# Second split: divide the held-out rows in half (test_size = 0.5).
# split_train_test returns its "train" part first, so that part becomes the
# validation set and its "test" part becomes the final test set. The shapes
# printed for this call are therefore labelled "train"/"test", not "valid"/"test".
X_valid, X_test, y_valid, y_test = split_train_test(X = X_not_train,
                                                    y = y_not_train,
                                                    test_size = 0.5,
                                                    random_state = RANDOMSTATE)

X train shape: f(26064, 11)
X test shape : f(6517, 11)
y train shape: f(26064,)
y test shape : f(6517,)

X train shape: f(3258, 11)
X test shape : f(3259, 11)
y train shape: f(3258,)
y test shape : f(3259,)



In [8]:
import joblib

def serialize_data(data, path):
    """
    Write an object to disk with joblib.

    Parameters:
    data (any): The object to persist.
    path (str): Destination file path for the serialized object.

    Returns:
    None
    """
    # joblib.dump's return value is intentionally discarded.
    joblib.dump(value=data, filename=path)

# Write each of the six split objects (train / test / valid, features and
# target) to its own file under ./data/interim/.
# NOTE(review): presumably ./data/interim/ must already exist — confirm.
serialize_data(X_train, "./data/interim/X_train.pkl")
serialize_data(y_train, "./data/interim/y_train.pkl")
serialize_data(X_test, "./data/interim/X_test.pkl")
serialize_data(y_test, "./data/interim/y_test.pkl")
serialize_data(X_valid, "./data/interim/X_valid.pkl")
serialize_data(y_valid, "./data/interim/y_valid.pkl")


In [5]:
def deserialize_data(path):
    """
    Load a previously serialized object from disk with joblib.

    Note: per the joblib documentation, its persistence format is
    pickle-based, so only load files that come from a trusted source.

    Parameters:
    path (str): File path of the serialized object.

    Returns:
    any: The object reconstructed from the file.
    """
    return joblib.load(path)