### General Preprocessing

#### Importing Libraries

In [1]:
import warnings
# Silence every warning category for the rest of the session. This also hides
# warnings emitted by the pandas / scikit-learn calls in the cells below, so
# comment this line out while debugging.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [3]:
def load_and_clean_data(file_path):
    """Load the complaints CSV and keep only complete rows of the columns of interest.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It must contain every column named in
        ``wanted_columns`` below.

    Returns
    -------
    pandas.DataFrame
        The selected columns only, with 'Date received' parsed to datetimes
        and every row holding a missing value in any selected column removed.
        The original row labels are kept (the index is not reset here).
    """
    wanted_columns = [
        'Product',
        'Sub-product',
        'Issue',
        'Sub-issue',
        'Consumer complaint narrative',
        'Company public response',
        'Company',
        'State',
        'ZIP code',
        'Date received',
    ]

    raw = pd.read_csv(file_path)
    # Parse the date column so later steps can use the ``.dt`` accessor on it.
    raw['Date received'] = pd.to_datetime(raw['Date received'])

    return raw[wanted_columns].dropna()

In [4]:
def filter_by_years(df, years):
    """Return the rows of ``df`` received in one of the given calendar years.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a datetime-typed 'Date received' column.
    years : list-like of int
        Calendar years to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index.
    """
    received_year = df['Date received'].dt.year
    in_requested_years = received_year.isin(years)
    return df.loc[in_requested_years].reset_index(drop=True)

In [5]:
def map_product_column(df):
    """Collapse the raw 'Product' labels into a small set of consolidated categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Product' column holding the raw product labels.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except that 'Product' holds the
        consolidated category. A label that is not a key of ``product_map``
        becomes NaN. The input frame is left unmodified.
    """
    product_map = {'Credit reporting or other personal consumer reports': 'Credit Reporting',
                   'Credit reporting, credit repair services, or other personal consumer reports': 'Credit Reporting',
                   'Payday loan, title loan, personal loan, or advance loan': 'Loans / Mortgage',
                   'Payday loan, title loan, or personal loan': 'Loans / Mortgage',
                   'Student loan': 'Loans / Mortgage',
                   'Vehicle loan or lease': 'Loans / Mortgage',
                   'Debt collection': 'Debt collection',
                   'Credit card or prepaid card': 'Credit/Prepaid Card',
                   'Credit card': 'Credit/Prepaid Card',
                   'Prepaid card': 'Credit/Prepaid Card',
                   'Mortgage': 'Loans / Mortgage',
                   'Checking or savings account': 'Checking or savings account'
                  }

    # assign() builds a new frame (keeping the column position) instead of
    # writing into the caller's DataFrame, so re-running a cell that still
    # holds the unmapped frame gives the same result every time.
    return df.assign(Product=df['Product'].map(product_map))

In [6]:
def preprocess_data(df):
    """Run narrative cleaning followed by the sub-issue / sub-product frequency filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Consumer complaint narrative', 'Sub-issue' and
        'Sub-product' columns.

    Returns
    -------
    pandas.DataFrame
        The rows that survive both stages, including the 'complaint length'
        column added during narrative cleaning.
    """
    # Both stages already exist as standalone functions in this notebook
    # (clean_narrative and filter_by_frequency); call them rather than keeping
    # a second copy of the same code here. The two progress lines are printed
    # by those helpers, in the same order as before.
    return filter_by_frequency(clean_narrative(df))

In [7]:
def clean_narrative(df, min_length=20):
    """Drop complaints whose narrative is too short or is a known placeholder text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Consumer complaint narrative' column of strings.
    min_length : int, default 20
        A narrative is kept only when it has strictly more than this many
        characters.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows plus a 'complaint length' column
        holding the character count of each narrative. The input frame is left
        unmodified. The shape of the result is printed as a progress line.
    """
    complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',
    'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',
    'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS',
    'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']

    narratives = df['Consumer complaint narrative']
    # .str.len() yields NaN for a missing narrative instead of raising, and a
    # NaN length fails the comparison below, so such a row is simply dropped.
    narrative_length = narratives.str.len()

    # assign() returns a new frame, so the helper column is not added to the
    # DataFrame the caller passed in.
    df = df.assign(**{'complaint length': narrative_length})

    keep = (narrative_length > min_length) & ~narratives.isin(complaints_to_exclude)
    df = df[keep]
    print("Removing complaint narrative", df.shape)
    return df

In [8]:
def filter_by_frequency(df, min_sub_issue_count=500, min_sub_product_count=100):
    """Keep only rows whose sub-issue and sub-product occur often enough.

    The sub-issue filter runs first; sub-product frequencies are then counted
    on the rows that survived it, not on the full input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Sub-issue' and 'Sub-product' columns.
    min_sub_issue_count : int, default 500
        A sub-issue is kept only when it occurs strictly more than this many
        times in ``df``.
    min_sub_product_count : int, default 100
        A sub-product is kept only when it occurs strictly more than this many
        times among the rows left after the sub-issue filter.

    Returns
    -------
    pandas.DataFrame
        The rows passing both filters, with their original index labels. The
        shape after the sub-issue filter is printed as a progress line.
    """
    # Count each column once and reuse the result for the threshold test.
    sub_issue_counts = df['Sub-issue'].value_counts()
    sub_issues_to_consider = sub_issue_counts[sub_issue_counts > min_sub_issue_count].index

    reduced_subissues = df[df['Sub-issue'].isin(sub_issues_to_consider)]
    print("Removing subissue", reduced_subissues.shape)

    sub_product_counts = reduced_subissues['Sub-product'].value_counts()
    sub_products_to_consider = sub_product_counts[sub_product_counts > min_sub_product_count].index

    final_df = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]

    return final_df

In [9]:
def split_and_save_data(df,year, test_size=0.25, random_state=42, directory_to_save='./data_splits/'):
    """Create a Product-stratified train/test split and write both parts to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Consumer complaint narrative', 'Product', 'Sub-product',
        'Issue' and 'Sub-issue'.
    year : int or str
        Tag inserted into the output file names.
    test_size : float, default 0.25
        Passed through to ``train_test_split``.
    random_state : int, default 42
        Passed through to ``train_test_split`` so the split is reproducible.
    directory_to_save : str, default './data_splits/'
        Output directory; created if it does not exist.

    Writes ``train-data-split_{year}.csv`` and ``test-data-split_{year}.csv``
    into ``directory_to_save``. Returns None.
    """
    # Narrative first, then the four label columns: this is the column order
    # of the saved files.
    columns_to_save = ['Consumer complaint narrative', 'Product', 'Sub-product', 'Issue', 'Sub-issue']

    # Splitting the selected columns in one call picks the same rows as
    # splitting features and labels separately and concatenating them again.
    train_df, test_df = train_test_split(
        df[columns_to_save],
        stratify=df['Product'],
        test_size=test_size,
        random_state=random_state,
    )
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test.
    os.makedirs(directory_to_save, exist_ok=True)

    # Save train and test data as CSV files with the year included in the file names
    train_df.to_csv(os.path.join(directory_to_save, f'train-data-split_{year}.csv'), index=False)
    test_df.to_csv(os.path.join(directory_to_save, f'test-data-split_{year}.csv'), index=False)

In [10]:
def main(file_path, year,year_name):
    """Run the whole pipeline: load, restrict to years, map products, preprocess, split, save.

    Parameters
    ----------
    file_path : str
        Path of the complaints CSV handed to ``load_and_clean_data``.
    year : list-like of int
        Calendar years to keep; forwarded to ``filter_by_years``. Despite the
        singular name, this is a collection of years.
    year_name : int or str
        Tag forwarded to ``split_and_save_data`` for the output file names.
    """
    complaints = load_and_clean_data(file_path)
    complaints = filter_by_years(complaints, year)

    # The remaining transformations each take the frame as their only argument.
    for step in (map_product_column, preprocess_data):
        complaints = step(complaints)

    split_and_save_data(complaints, year_name)


In [12]:
# Build the train/test split for complaints received in 2023.
file_path = '../complaints.csv'
year_name = 2023
years_to_include = [2023]
main(file_path=file_path, year=years_to_include, year_name=year_name)

Removing complaint narrative (264968, 11)
Removing subissue (248065, 11)


In [14]:
# Build the train/test split for complaints received in 2021 and 2022.
# The output files are tagged with 2022 only (year_name), although they hold
# rows from both years.
file_path = '../complaints.csv'
year_name = 2022
years_to_include = [2022, 2021]
main(file_path=file_path, year=years_to_include, year_name=year_name)

Removing complaint narrative (254701, 11)
Removing subissue (240079, 11)
