In [1]:
def extract_columns(df, columns):
    """
    Return an independent copy of the requested columns of a DataFrame.

    Parameters:
        df (DataFrame): Source DataFrame; it is not modified.
        columns (list): Names of the columns to keep, in the desired order.

    Returns:
        DataFrame: A copy holding only `columns`, so later edits to the result
        do not write through to `df`.
    """
    selected = df[columns]
    # Copy explicitly so the caller never ends up holding a view of `df`.
    return selected.copy()


In [2]:
def filter_and_add_values(original_df, extra_df, main_column, filter_values):
    """
    Append to `original_df` the rows of `extra_df` whose `main_column` value is
    one of `filter_values`.

    Parameters:
        original_df (DataFrame): Rows that are always kept, unfiltered.
        extra_df (DataFrame): Candidate rows; only the matching ones are appended.
        main_column (str): Column of `extra_df` that is tested against `filter_values`.
        filter_values (list): Values of `main_column` that qualify a row for inclusion.

    Returns:
        DataFrame: A new DataFrame with the rows of `original_df` followed by the
        matching rows of `extra_df`, re-indexed from 0. Neither input is modified.
    """
    keep_mask = extra_df[main_column].isin(filter_values)
    matching_rows = extra_df.loc[keep_mask].copy()
    combined = pd.concat([original_df, matching_rows], ignore_index=True)
    return combined



In [3]:
def _allocate_balanced_quota(available_counts, budget):
    """
    Split a sampling budget across groups as evenly as the group sizes allow.

    Parameters:
        available_counts (dict): Maps each group key to the number of rows available.
        budget (int): Total number of rows to hand out.

    Returns:
        dict: Maps each group key to the number of rows to draw from it. No quota
        exceeds the group's size, and the quotas sum to `budget` whenever the
        groups hold at least `budget` rows in total.
    """
    quotas = {}
    remaining = int(budget)
    # Visit the smallest groups first: whatever part of the fair share they
    # cannot fill rolls over to the larger groups that come later.
    ordered = sorted(available_counts.items(), key=lambda item: item[1])
    for position, (key, available) in enumerate(ordered):
        groups_left = len(ordered) - position
        take = min(int(available), remaining // groups_left)
        quotas[key] = take
        remaining -= take
    return quotas


def sample_balanced_data(df, product_column, subproduct_column, sample_size, random_state=42):
    """
    Samples data points from a DataFrame while balancing subproducts within each product category.

    Products with at most `sample_size` rows are kept in full. Larger products are
    cut down to exactly `sample_size` rows (when enough rows carry a sub-product
    value), with the budget shared as evenly as possible between the product's
    sub-products. Rows are drawn without replacement, so the result never
    contains a duplicated source row.

    Parameters:
        df (DataFrame): The DataFrame from which data points will be sampled.
        product_column (str): The column name representing the product category.
        subproduct_column (str): The column name representing the subproduct category.
        sample_size (int): The maximum number of data points to keep for each product.
        random_state (int or None): Seed forwarded to `DataFrame.sample`. Defaults to 42.

    Returns:
        DataFrame: The sampled rows, keeping their original index labels. An empty
        input yields an empty DataFrame with the same columns.

    Notes:
        In a down-sampled product, rows with a missing sub-product are not drawn,
        because `value_counts` skips missing values by default.
    """
    sampled_dfs = []
    for product in df[product_column].unique():
        product_df = df[df[product_column] == product]
        if len(product_df) <= sample_size:
            # Small enough already: keep every row of this product.
            sampled_dfs.append(product_df)
            continue

        subproduct_counts = product_df[subproduct_column].value_counts()
        quotas = _allocate_balanced_quota(dict(subproduct_counts.items()), sample_size)
        # Walk sub-products in value_counts order (largest first) so the output
        # order is deterministic.
        for subproduct in subproduct_counts.index:
            quota = quotas[subproduct]
            if quota == 0:
                continue
            subproduct_df = product_df[product_df[subproduct_column] == subproduct]
            # quota never exceeds the group size, so sampling without replacement
            # is always possible and avoids bootstrapped duplicates.
            sampled_dfs.append(
                subproduct_df.sample(n=quota, replace=False, random_state=random_state)
            )

    if not sampled_dfs:
        # pd.concat([]) raises ValueError; return an empty frame with the same columns.
        return df.iloc[0:0].copy()
    return pd.concat(sampled_dfs)



In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

def create_dataframes_and_split(data, folder_path, test_size=0.1, random_state=None,
                                product_column='Product', subproduct_column='Sub-product'):
    """
    Creates dataframes for each product category, splits them into train and validation sets, and saves them to CSV files.

    For every distinct value of `product_column`, three CSV files are written to
    `folder_path`: `<slug>_data.csv` (all rows), `<slug>_train_data.csv` and
    `<slug>_val_data.csv`. The slug is the product name with '/' and ' '
    replaced by '_' and lower-cased.

    Parameters:
        data (DataFrame): The original DataFrame containing all data.
        folder_path (str): The path to the folder where CSV files will be saved.
        test_size (float or int): The proportion of the dataset to include in the validation split, or the absolute number of samples.
        random_state (int or None): Controls the randomness of the training and validation data splitting.
        product_column (str): Column to group by. Defaults to 'Product'.
        subproduct_column (str): Column used to stratify each split. Defaults to 'Sub-product'.

    Returns:
        dict: Maps each product to {'train': DataFrame, 'validation': DataFrame}.

    Notes:
        The split is stratified on `subproduct_column`. NOTE(review): scikit-learn
        is expected to raise ValueError when a sub-product has too few rows to
        stratify - confirm against the `train_test_split` documentation.
    """
    # exist_ok avoids the check-then-create race; an empty path means the
    # current directory, which needs no creation.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    train_validation_splits = {}
    for product, group_df in data.groupby(product_column):
        # Build the file-name slug once; str() keeps non-string product keys working.
        file_stem = str(product).replace('/', '_').replace(' ', '_').lower()

        # Save DataFrame for this product category
        group_df.to_csv(os.path.join(folder_path, f"{file_stem}_data.csv"), index=False)

        # Split the DataFrame into train and validation sets, keeping the
        # sub-product proportions the same on both sides.
        train_df, val_df = train_test_split(
            group_df,
            test_size=test_size,
            random_state=random_state,
            stratify=group_df[subproduct_column],
        )

        # Save train and validation DataFrames
        train_df.to_csv(os.path.join(folder_path, f"{file_stem}_train_data.csv"), index=False)
        val_df.to_csv(os.path.join(folder_path, f"{file_stem}_val_data.csv"), index=False)

        # Store train and validation DataFrames in the dictionary
        train_validation_splits[product] = {'train': train_df, 'validation': val_df}

    return train_validation_splits






In [5]:
import os

def save_dataframe_to_csv(df, file_name, directory='./'):
    """
    Save a DataFrame to a CSV file, creating the target directory if needed.

    Parameters:
        df (DataFrame): The DataFrame to be saved.
        file_name (str): The name of the CSV file.
        directory (str): The directory where the CSV file will be saved. Defaults to the
            current directory; an empty string also means the current directory.

    Returns:
        None. The file is written without the index column.
    """
    # exist_ok avoids the check-then-create race. os.makedirs('') raises, so an
    # empty directory is treated as "current directory" and skipped.
    if directory:
        os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, file_name)

    # index=False: the row index is not part of the saved data.
    df.to_csv(file_path, index=False)



In [6]:

def create_balanced_data(df_2023, df_2022, product_column, products_to_include, sample_size,
                         directory_to_save='./data_splits/', subproduct_column='Sub-product',
                         narrative_column='Consumer complaint narrative',
                         file_name='train-data-balanced.csv'):
    """
    Build a sampled training set from the 2023 data topped up with selected 2022 rows.

    Steps:
        1. Keep only the narrative, product and sub-product columns of both frames.
        2. Keep every 2023 row and append the 2022 rows whose product is in
           `products_to_include`.
        3. Sample the combined frame with `sample_balanced_data`.
        4. Write the result to `directory_to_save`/`file_name` as CSV.

    Parameters:
        df_2023 (DataFrame): Base data; all of its rows enter the sampling step.
        df_2022 (DataFrame): Extra data; only rows of the listed products are added.
        product_column (str): The column name representing the product category.
        products_to_include (list): Product values for which 2022 rows are added.
        sample_size (int): Sample size forwarded to `sample_balanced_data`.
        directory_to_save (str): Output directory. Defaults to './data_splits/'.
        subproduct_column (str): The column name representing the subproduct category.
            Defaults to 'Sub-product'.
        narrative_column (str): The column holding the complaint text.
            Defaults to 'Consumer complaint narrative'.
        file_name (str): Name of the CSV file to write. Defaults to 'train-data-balanced.csv'.

    Returns:
        DataFrame: The sampled DataFrame that was written to disk.
    """
    # Extract selected columns
    selected_columns = [narrative_column, product_column, subproduct_column]
    product_training_2023 = extract_columns(df_2023, selected_columns)
    product_training_2022 = extract_columns(df_2022, selected_columns)

    # Only the 2022 frame is filtered by product; the 2023 frame is kept whole.
    balanced_df = filter_and_add_values(
        product_training_2023, product_training_2022, product_column, products_to_include
    )

    # Sample balanced data
    sampled_df = sample_balanced_data(balanced_df, product_column, subproduct_column, sample_size)

    # Save sampled data to CSV
    save_dataframe_to_csv(sampled_df, file_name, directory_to_save)
    return sampled_df


In [8]:
# Load the per-year training splits (paths are relative to the working directory).
training_2023 = pd.read_csv('data_splits/train-data-split_2023.csv')
training_2022 = pd.read_csv('data_splits/train-data-split_2022.csv')

# Output directory handed to create_balanced_data further down.
directory_to_save = './data_splits/'

In [11]:
# Products for which 2022 rows are added on top of the 2023 data.
products_to_include = [
    'Loans / Mortgage',
    'Credit/Prepaid Card',
    'Checking or savings account',
]
sampled_df = create_balanced_data(
    df_2023=training_2023,
    df_2022=training_2022,
    product_column='Product',
    products_to_include=products_to_include,
    sample_size=15000,
    directory_to_save=directory_to_save,
)

In [12]:
# Per-product CSVs and their train/validation splits (10% validation) are written here.
folder_path = 'product_data_splits'
train_validation_splits = create_dataframes_and_split(
    data=sampled_df,
    folder_path=folder_path,
    test_size=0.1,
    random_state=42,
)