In [12]:
import numpy as np
import pandas as pd


from sklearn.preprocessing import KBinsDiscretizer
import shap
from sklearn.ensemble import RandomForestClassifier

#------------------------------------Plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import textwrap

from collections import Counter
import re
from scipy.stats import chi2_contingency, ks_2samp
import requests

#------------------------------------Plots Functions
import eda_utils
import importlib
importlib.reload(eda_utils)
from eda_utils import calculate_psi, plot_distribution_plotly, calculate_woe_iv
import utils_plot
from optbinning import BinningProcess
import joblib 
from sklearn.metrics import mutual_info_score
from joblib import Parallel, delayed

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
pd.set_option('display.max_columns', 500)


sns.set_style("whitegrid")
sns.set(rc = {'figure.figsize':(15,10)})



# General variables and Functions

In [13]:
#Training Columns
training_columns = ['home_ownership', 'addr_state', 'dti', 'fico_range_high',
       'all_util', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op',
       'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc',
       'mths_since_recent_inq', 'num_actv_rev_tl', 'emp_length_numeric',
       'emp_title_final_grouped', 'title_grouped', 'region_median_income',
       'loan_to_income_ratio', 'revolving_balance_to_income_ratio',
       'credit_utilization_manual', 'open_acc_ratio']


# Load data

In [None]:
df = spark.sql("select * from data_credit_test_ds_chanllenges").toPandas()

df['d_vintage'] = pd.to_datetime(df['issue_d'], format='%b-%Y')

print('Dimension of the data:', df.shape)

####Target

df['loan_status'] = df['loan_status'].fillna('NA')
df['loan_status'] = np.where(df['loan_status']=='Does not meet the credit policy. Status:Fully Paid', 'Fully Paid', df['loan_status'])
df['loan_status'] = np.where(df['loan_status']=='Does not meet the credit policy. Status:Charged Off', 'Charged Off', df['loan_status'])


#----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#General definitions

df['target'] = np.where(df['loan_status']=='Fully Paid', 0.0, 
                        np.where(df['loan_status']=='Charged Off', 1.0, np.nan))

Dimension of the data: (2260698, 151)


# Preprocessing

## Emp Title & Term

In [15]:
emp_length_mapping = {
    '10+ years': 10,
    '2 years': 2,
    '3 years': 3,
    '< 1 year': 0,
    '1 year': 1,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
}
df['emp_length_numeric'] = df['emp_length'].map(emp_length_mapping)

#----------------------------------------------------------------------------------------------


df['term_numeric'] = df['term'].str.extract('(\d+)').astype('Int64')


In [16]:
print("Creating mapping function.")

def map_emp_title_v5_final(title):
    """
    Job Title Categorization. This version identifies niche roles, 
    common misspellings, and specific employers
    """
    
    if not isinstance(title, str) or title.strip() == '':
        return 'Other/Unknown'
    
    title_lower = title.lower().strip()


    # 1. Specific Info
    if 'kaiser permanente' in title_lower: return 'Healthcare (Kaiser)'
    if any(keyword in title_lower for keyword in ['bank of america', 'wells fargo', 'jp morgan', 'chase']): return 'Major Bank Employee'
    if 'walmart' in title_lower: return 'Major Retailer (Walmart)'
    if 'at&t' in title_lower: return 'Major Telecom (AT&T)'
    if 'department of defense' in title_lower: return 'Public Service (DoD)'
    if 'ibm' in title_lower: return 'IT & Engineering (IBM)'

    # 2. Healthcare
    if any(keyword in title_lower for keyword in ['nurse', 'rn', 'lpn', 'cna', 'physician', 'therapist', 'pharmacist', 'medical', 'dental', 'paramedic', 'healthcare', 'veterinarian', 'surgical', 'sonographer', 'caregiver', 'dentist', 'psychologist', 'emt', 'phlebotomist', 'lvn', 'radiologic technologist']):
        return 'Healthcare'

    # 3. Education
    if any(keyword in title_lower for keyword in ['teacher', 'professor', 'instructor', 'educator', 'principal', 'school', 'registrar', 'librarian', 'faculty', 'paraprofessional', 'superintendent']):
        return 'Education'
        
    # 4. IT & Engineering 
    if any(keyword in title_lower for keyword in ['engineer', 'developer', 'analyst', 'technician', ' it ', 'i.t.', 'software', 'network', 'systems', 'web', 'programmer', 'architect', 'tech', 'cpa']): # CPA es técnico-financiero, pero lo ponemos aquí
        return 'IT & Engineering'

    # 5. Finance & Accounting 
    if any(keyword in title_lower for keyword in ['accountant', 'accounting', 'financial', 'controller', 'bookkeeper', 'auditor', 'banker', 'underwriter', 'cfo', 'loan officer', 'finance', 'accounts', 'claims', 'adjuster', 'collector', 'trader', 'teller', 'broker', 'processor', 'billing']):
        return 'Finance & Accounting'

    # 6. Sales & Customer Service 
    if any(keyword in title_lower for keyword in ['sales', 'account executive', 'customer service', 'csr', 'agent', 'representative', 'concierge', 'service advisor', 'dealer', 'business development', 'merchandiser', 'associate']):
        if 'manager' not in title_lower:
            return 'Sales & Customer Service'

    # 7. Public Service 
    if any(keyword in title_lower for keyword in ['police', 'officer', 'firefighter', 'sheriff', 'sergeant', 'captain', 'army', 'military', 'deputy', 'government', 'navy', 'soldier', 'trooper', 'usaf', 'investigator', 'detective', 'security']):
        return 'Public Service'

    # 8. Postal Service
    if any(keyword in title_lower for keyword in ['postal', 'mail', 'usps', 'carrier']):
        return 'Postal Service'

    # 9. Logistics & Supply Chain
    if any(keyword in title_lower for keyword in ['logistics', 'supply chain', 'courier', 'ups', 'material handler', 'loader', 'warehouse']):
        return 'Logistics & Supply Chain'

    # 10. Transportation
    if any(keyword in title_lower for keyword in ['driver', 'pilot', 'dispatcher', 'truck', 'flight attendant', 'conductor']):
        return 'Transportation'

    # 11. Management / Executive
    if any(keyword in title_lower for keyword in ['manager', 'director', 'supervisor', 'president', 'owner', 'ceo', 'vice president', 'vp', 'partner', 'leader', 'chief', 'manger', 'management', 'gm', 'coo', 'cto', 'executive']):
        return 'Management/Executive'

    # 12. Skilled Trades & Labor
    if any(keyword in title_lower for keyword in ['mechanic', 'machinist', 'welder', 'foreman', 'forman', 'operator', 'maintenance', 'laborer', 'labor', 'pipefitter', 'painter', 'production', 'assembler', 'assembly', 'plumber', 'lineman']):
        return 'Skilled Trades & Labor'

    # 13. Administrative 
    if any(keyword in title_lower for keyword in ['assistant', 'clerk', 'secretary', 'receptionist', 'coordinator', 'administrator', 'admin', 'clerical', 'payroll', 'specialist']): # Specialist es genérico, lo ponemos aquí
        return 'Administrative'
        
    # 14. Food, Retail & Service
    if any(keyword in title_lower for keyword in ['server', 'bartender', 'cook', 'chef', 'cashier', 'hospitality', 'stocker', 'porter', 'baker', 'janitor', 'barista', 'hairstylist', 'stylist', 'waitress', 'waiter', 'housekeeper', 'housekeeping']):
        return 'Food, Retail & Service'

    # 15. Quality Assurance
    if any(keyword in title_lower for keyword in ['quality assurance', 'quality control']):
        return 'Quality Assurance'

    # 16. Procurement
    if any(keyword in title_lower for keyword in ['buyer', 'purchasing']):
        return 'Procurement'

    # 17. Human Resources 
    if any(keyword in title_lower for keyword in ['human resources', 'hr', 'recruiter', 'trainer']):
        return 'Human Resources'

    # 18. Art, Design & Media 
    if any(keyword in title_lower for keyword in ['artist', 'designer', 'graphic', 'writer', 'editor', 'producer']):
        return 'Art, Design & Media'
        
    # 19. Science & Research 
    if any(keyword in title_lower for keyword in ['scientist', 'research', 'chemist']):
        return 'Science & Research'

    
    if 'consultant' in title_lower: return 'Consulting'
    if any(keyword in title_lower for keyword in ['property', 'realtor']): return 'Real Estate'
    if any(keyword in title_lower for keyword in ['social worker', 'counselor', 'minister', 'pastor']): return 'Community & Social Services'
    if any(keyword in title_lower for keyword in ['construction', 'carpenter', 'electrician', 'installer', 'inspector', 'estimator']): return 'Construction & Installation'


    return 'Other/Unknown'



print("Applying the function to the 'emp_title' column...")
df['emp_title_final_grouped'] = df['emp_title'].apply(map_emp_title_v5_final)


print("\n--- Definitive Grouping Results (v5) for 'emp_title' ---")
print(f"Number of unique categories: {df['emp_title_final_grouped'].nunique()}")
print("\nFrequency of new categories (in %):")
print(df['emp_title_final_grouped'].value_counts(normalize=True).map('{:.2%}'.format))

Creating mapping function.
Applying the function to the 'emp_title' column...

--- Definitive Grouping Results (v5) for 'emp_title' ---
Number of unique categories: 29

Frequency of new categories (in %):
emp_title_final_grouped
Management/Executive           24.85%
Other/Unknown                  22.86%
IT & Engineering               10.25%
Healthcare                      7.15%
Administrative                  5.30%
Sales & Customer Service        5.24%
Education                       4.49%
Finance & Accounting            4.18%
Skilled Trades & Labor          3.83%
Public Service                  3.19%
Transportation                  2.71%
Food, Retail & Service          1.28%
Logistics & Supply Chain        0.78%
Consulting                      0.64%
Construction & Installation     0.63%
Community & Social Services     0.52%
Art, Design & Media             0.51%
Postal Service                  0.43%
Human Resources                 0.42%
Science & Research              0.20%
Procurement

## Loan Purposes

In [17]:
def map_loan_title(title):
    """
    Categorize loan titles through a thorough keyword analysis. 
    This is our most complete version, encompassing niche categories.
    """
    if not isinstance(title, str):
        return 'Other'
    
    title_lower = title.lower().strip()



    # 1.  Credit Refinancing
    if 'credit' in title_lower and 'card' in title_lower:
        return 'Credit Card Refinancing'
    if 'refi' in title_lower or 'refinance' in title_lower or 'payoff' in title_lower or 'consolidation' in title_lower or 'consolidate' in title_lower or 'cosolidation ' in title_lower:
        return 'Credit Card Refinancing'

    # 2. Debt Consolidation
    if 'consolidation' in title_lower or 'payoff' in title_lower or 'pay off' in title_lower or 'bills' in title_lower or 'consolidate' in title_lower or 'consol' in title_lower or 'cons' in title_lower or 'debt' in title_lower:
        return 'Debt Consolidation'

    # 3. Home Improvement
    if 'home' in title_lower or 'house' in title_lower or 'kitchen' in title_lower or 'remodel' in title_lower or 'improvement' in title_lower or 'pool' in title_lower or 'roof' in title_lower or 'repairs' in title_lower or 'renovation' in title_lower:
        return 'Home Improvement'

    # 4. Home Buying
    if 'home' in title_lower and 'buy' in title_lower:
        return 'Home Buying'
    if 'down payment' in title_lower:
        return 'Home Buying'

    # 5. Car Financing
    if 'car' in title_lower or 'auto' in title_lower or 'vehicle' in title_lower or 'motorcycle' in title_lower or 'harley' in title_lower:
        return 'Car Financing'
        
    # 6. Major Purchase
    if 'major purchase' in title_lower:
        return 'Major Purchase'

    # 7. Medical Expenses
    if 'medical' in title_lower or 'health' in title_lower or 'dental' in title_lower:
        return 'Medical Expenses'

    # 8. Business Loan
    if 'business' in title_lower:
        return 'Business Loan'

    # 9. Moving and Relocation
    if 'moving' in title_lower or 'relocation' in title_lower:
        return 'Moving and Relocation'

    # 10. Vacation
    if 'vacation' in title_lower or 'trip' in title_lower:
        return 'Vacation'
        
    # 11. Wedding
    if 'wedding' in title_lower or 'engagement' in title_lower:
        return 'Wedding'
        
    # 12. Education
    if 'education' in title_lower or 'school' in title_lower or 'student' in title_lower:
        return 'Education'
        
    # 13. Taxes
    if 'tax' in title_lower or 'taxes' in title_lower:
        return 'Taxes'

    # 14. Green Loan 
    if 'green loan' in title_lower:
        return 'Green Loan'

    if 'personal'  in title_lower or 'my loan' in title_lower or 'mine' in title_lower or 'free' in title_lower :
        return 'Personal Loan'


    return 'Other'


print("Applying thefunction to the 'title' column.")
df['title_grouped'] = df['title'].apply(map_loan_title)


print("\n--- Final Grouping Results for 'title' ---")
print(f"Number of unique categories: {df['title_grouped'].nunique()}")
print("\nFrequency of new categories:")
print(df['title_grouped'].value_counts())

Applying thefunction to the 'title' column.

--- Final Grouping Results for 'title' ---
Number of unique categories: 16

Frequency of new categories:
title_grouped
Credit Card Refinancing    1714686
Other                       194649
Home Improvement            161750
Major Purchase               45439
Medical Expenses             26728
Car Financing                24680
Debt Consolidation           23960
Business Loan                23069
Vacation                     15132
Moving and Relocation        15026
Personal Loan                10788
Wedding                       2513
Green Loan                    1252
Education                      508
Taxes                          417
Home Buying                    101
Name: count, dtype: int64


## ZIP Code Enrich Information

In [18]:
print("Fetching Median Household Income data from the U.S. Census API...")
API_URL = "https://api.census.gov/data/2023/acs/acs5/profile?get=NAME,DP03_0062E&for=zip%20code%20tabulation%20area:*"
response = requests.get(API_URL)

if response.status_code == 200:
    print("Successfully connected to the Census API!")
    data = response.json()
    df_census = pd.DataFrame(data[1:], columns=data[0])
else:
    print(f"Error contacting the Census API. Status code: {response.status_code}")
    df_census = pd.DataFrame()

if not df_census.empty:

    print("\nCleaning and preparing census data...")
    df_census = df_census.rename(columns={
        "DP03_0062E": "median_household_income",
        "zip code tabulation area": "zip_code_5_digit"
    })
    df_census['median_household_income'] = pd.to_numeric(df_census['median_household_income'], errors='coerce')
    df_census.dropna(subset=['median_household_income'], inplace=True)
    df_census['zip_code_5_digit'] = df_census['zip_code_5_digit'].astype(str)

   
    print("\nAdding 3-digit level census data to match our data...")


    df_census['zip_code_3_digit'] = df_census['zip_code_5_digit'].str[:3]

    df_census_agg = df_census.groupby('zip_code_3_digit')['median_household_income'].median().reset_index()
    df_census_agg = df_census_agg.rename(columns={'median_household_income': 'region_median_income'})

    print(f"Aggregated incomes calculated for {len(df_census_agg)} 3-digit regions.")
    print("Example of aggregated census data:")
    print(df_census_agg.head())

    
    print("\nPreparing the main DataFrame for merging...")
    
  
    df['zip_code_3_digit'] = df['zip_code'].str[:3]
    
    print("Performing the merge with the aggregated data...")
    

    df = pd.merge(
        df,
        df_census_agg,
        on='zip_code_3_digit',
        how='left' 
    )
    

    print("\nMerge complete! The new 'region_median_income' feature has been added.")

    null_count = df['region_median_income'].isnull().sum()
    total_rows = len(df)
    print(f"Were found {null_count} loans ({null_count/total_rows:.2%}) without a corresponding income data.")
    
    if null_count > 0:
        median_overall = df['region_median_income'].median()
        df['region_median_income'].fillna(median_overall, inplace=True)
        print(f"Null values have been filled with the overall median regional income: ${median_overall:,.0f}")

    print("\nDescription of the new variable:")
    print(df['region_median_income'].describe().apply("{:,.0f}".format))
    
    print("\nPreview of the final DataFrame:")
 
    df.drop(columns=['zip_code_3_digit'], inplace=True, errors='ignore')
    print(df[['zip_code', 'region_median_income', 'target']].head())


Fetching Median Household Income data from the U.S. Census API...
Successfully connected to the Census API!

Cleaning and preparing census data...

Adding 3-digit level census data to match our data...
Aggregated incomes calculated for 894 3-digit regions.
Example of aggregated census data:
  zip_code_3_digit  region_median_income
0              006               20984.5
1              007               22578.0
2              009               29540.5
3              010               88223.0
4              011               51851.0

Preparing the main DataFrame for merging...
Performing the merge with the aggregated data...

Merge complete! The new 'region_median_income' feature has been added.
Were found 638 loans (0.03%) without a corresponding income data.
Null values have been filled with the overall median regional income: $77,000

Description of the new variable:
count       2,260,698
mean         -246,286
std        14,520,460
min      -666,666,666
25%            64,106
50%     


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





  zip_code  region_median_income  target
0    190xx              108393.5     0.0
1    577xx               58810.0     0.0
2    605xx              109230.0     0.0
3    076xx              134311.0     NaN
4    174xx               77134.0     0.0


## Financial ratios

In [21]:
def create_financial_ratios(df):
    """
    Create a set of interaction features and financial ratios based on pre-decision data.
    This function avoids data leakage by using 'loan_amnt' instead of 'funded_amnt'.
    It handles division by zero by replacing the resulting infinities with a safe value (e.g., 0 or NaN).
    """
    print("Creating new financial ratio features...")
    
    # A small epsilon to prevent division by zero in a robust way
    epsilon = 1e-6

    # --- Loan and Debt Burden Ratios (relative to income) ---
    
    # Loan-to-Annual Income Ratio (Using 'loan_amnt' to avoid data leakage)
    # This ratio indicates how large the requested loan is compared to the applicant's annual income.
    df['loan_to_income_ratio'] = df['funded_amnt'] / (df['annual_inc'] + epsilon)
    
    # Revolving Debt Balance to Annual Income Ratio
    # This shows the burden of existing revolving debt relative to income.
    df['revolving_balance_to_income_ratio'] = df['revol_bal'] / (df['annual_inc'] + epsilon)

    # --- Credit Behavior and Usage Ratios ---

    # Manual Revolving Credit Utilization Ratio
    # Measures how much of the available revolving credit is being used.
    df['credit_utilization_manual'] = df['revol_bal'] / (df['revol_bal'] + df['bc_open_to_buy'] + epsilon)
    
    # Open Accounts to Total Accounts Ratio
    # Represents the proportion of credit lines that are currently open.
    df['open_acc_ratio'] = df['open_acc'] / (df['total_acc'] + epsilon)
    
    # --- Credit History and Loan Term Ratios ---

 
    # --- Income Context Ratio ---
    
    # Applicant's Income relative to their Region's Median Income
    # This contextualizes income based on local cost of living.
    df['income_to_region_median_ratio'] = df['annual_inc'] / (df['region_median_income'] + epsilon)
    
    # --- Post-processing: Handle potential NaNs or infinities from division ---
    ratio_cols = [
        'loan_to_income_ratio', 'revolving_balance_to_income_ratio', 
        'credit_utilization_manual', 'open_acc_ratio', 
        'income_to_region_median_ratio'
    ]
    
    # Replace any infinite values that might have occurred with NaN
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)
    # Fill any resulting NaNs (e.g., from 0/0 division) with 0
    df[ratio_cols] = df[ratio_cols].fillna(0)
        
    print("New features created successfully!")
    return df

# Apply the function to our dataframe
df = create_financial_ratios(df)

# Display the new columns along with some of the original ones for verification
new_feature_columns = [
     'funded_amnt', 'annual_inc', 'loan_to_income_ratio',
     'revol_bal', 'revolving_balance_to_income_ratio',
     'bc_open_to_buy', 'credit_utilization_manual',
     'open_acc', 'total_acc', 'open_acc_ratio',
     'term_numeric',
     'region_median_income', 'income_to_region_median_ratio'
 ]

print("\nPreview of the new features:")
display(df[new_feature_columns].head())

Creating new financial ratio features...
New features created successfully!

Preview of the new features:


Unnamed: 0,funded_amnt,annual_inc,loan_to_income_ratio,revol_bal,revolving_balance_to_income_ratio,bc_open_to_buy,credit_utilization_manual,open_acc,total_acc,open_acc_ratio,term_numeric,region_median_income,income_to_region_median_ratio
0,3600.0,55000.0,0.065455,2765.0,0.050273,1506.0,0.647389,7.0,13.0,0.538461,36,108393.5,0.50741
1,24700.0,65000.0,0.38,21470.0,0.330308,57830.0,0.270744,22.0,38.0,0.578947,36,58810.0,1.105254
2,20000.0,63000.0,0.31746,7869.0,0.124905,2737.0,0.741939,6.0,18.0,0.333333,60,109230.0,0.576765
3,35000.0,110000.0,0.318182,7802.0,0.070927,54962.0,0.124307,13.0,17.0,0.764706,60,134311.0,0.818995
4,10400.0,104433.0,0.099585,21929.0,0.209982,4567.0,0.827634,12.0,35.0,0.342857,60,77134.0,1.353917


In [36]:
deleted_vars = ['income_to_region_median_ratio', 'percent_bc_gt_75', 'total_acc', 'grade', 'zip_code', 'inq_last_6mths', 'open_acc', 'revol_bal', 'mths_since_last_delinq',
'delinq_2yrs','pub_rec', 'initial_list_status', 'emp_title', 'emp_length', 'purpose', 'title', 'term' ]

df_model_final =  df.drop(deleted_vars, axis=1)

print('Final pre-dimension', df_model_final.shape)

Final pre-dimension (2260698, 145)


# WoE Transformation

In [44]:

df_model_final_aux = df_model_final.set_index(['id', 'd_vintage' , 'target'])

woe_bin = joblib.load('binning_process_final.joblib')

df_woe = woe_bin.transform(df_model_final_aux, metric="woe")

df_woe = df_woe.reset_index()



df_woe['Set'] = np.where((df_woe['d_vintage']>=pd.to_datetime('2016-07-01')) & (df_woe['target'].notna()), 'Test',
                 np.where( (df_woe['d_vintage']< pd.to_datetime('2016-07-01')) &   (df_woe['target'].notna()),  'Train', 'Val'))


print('Final dimension data with Woe', df_woe.shape)

df_woe.head(15)           

Final dimension data with Woe (2260698, 34)


Unnamed: 0,id,d_vintage,target,funded_amnt,int_rate,sub_grade,home_ownership,annual_inc,verification_status,addr_state,dti,fico_range_high,all_util,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_actv_rev_tl,emp_length_numeric,term_numeric,emp_title_final_grouped,title_grouped,region_median_income,loan_to_income_ratio,revolving_balance_to_income_ratio,credit_utilization_manual,open_acc_ratio,Set
0,68407277,2015-12-01,0.0,0.258841,-0.104389,-0.287373,0.160306,-0.086534,0.365801,-0.067006,0.403308,-0.217115,0.106942,-0.405376,0.02474,0.156173,-0.155903,-0.077618,-0.148191,-0.172215,-0.034501,-0.169441,-0.089716,0.084253,0.05701,0.310279,-0.020484,-0.007931,0.099562,0.457006,0.063355,0.023397,-0.0075,Train
1,68355089,2015-12-01,0.0,-0.15267,0.229718,0.088483,0.160306,-0.030447,0.365801,-0.101768,0.107425,0.280817,0.106942,-0.405376,0.02474,-0.056093,0.630465,0.061496,-0.179923,-0.205649,0.238651,-0.252891,-0.260049,0.020176,0.05701,0.310279,0.172054,-0.081017,-0.075997,-0.478301,-0.066297,0.283653,-0.079388,Train
2,68341763,2015-12-01,0.0,-0.15267,0.440679,0.355937,0.160306,-0.030447,0.365801,0.120678,0.28204,0.021937,-0.231529,-0.044784,-0.181297,0.378525,-0.139264,0.037777,0.025413,0.188066,0.238651,0.391569,0.047732,0.161126,0.05701,-0.721577,-0.313243,-0.081017,0.099562,-0.326559,0.111091,-0.045164,0.112083,Train
3,66310712,2015-12-01,,-0.240557,-0.247124,-0.371093,0.160306,0.262201,-0.07836,-0.05946,0.059621,1.098914,0.106942,-0.044784,0.300659,0.263099,0.630465,-0.136709,-0.179923,-0.205649,-0.034501,-0.252891,0.0,0.020176,0.05701,-0.721577,0.172054,-0.007931,0.140748,-0.326559,0.078482,0.290291,-0.226646,Val
4,68476807,2015-12-01,0.0,-0.052293,-1.109875,-1.16719,0.160306,0.262201,-0.07836,-0.067006,-0.164547,0.021937,-0.398292,-0.405376,-0.578527,0.378525,-0.075754,0.061496,-0.111674,-0.098426,0.350769,-0.169441,-0.260049,-0.044349,0.010116,-0.721577,-0.020484,-0.007931,-0.035045,0.387752,0.023428,-0.098688,0.112083,Train
5,68426831,2015-12-01,0.0,-0.052293,-0.104389,-0.155127,-0.168686,-0.212208,-0.07836,0.094121,0.28204,-0.044342,-0.398292,-0.044784,0.456646,-0.176827,-0.194252,-0.392487,0.400004,0.495469,-0.189779,0.15274,0.0,0.161126,0.010116,0.310279,-0.020484,-0.007931,0.054853,-0.478301,-0.032938,-0.127045,-0.226646,Train
6,68476668,2015-12-01,0.0,-0.15267,0.736853,0.669339,0.160306,0.366531,0.365801,-0.014735,0.193799,-0.182636,-0.398292,-0.044784,-0.181297,0.378525,-0.347748,0.191351,0.001416,0.131069,0.238651,-0.076567,0.047732,-0.044349,0.05701,0.310279,0.012941,-0.007931,0.054853,0.387752,-0.141886,-0.339684,0.037363,Train
7,67275481,2015-12-01,0.0,-0.15267,0.736853,0.769896,0.160306,0.072425,0.365801,0.255807,0.059621,0.123391,-0.231529,-0.044784,0.02474,0.109081,0.184215,-0.392487,0.400004,0.188066,0.169388,0.15274,0.047732,0.237851,0.05701,0.310279,-0.313243,-0.007931,-0.127956,-0.024098,0.043867,0.290291,-0.0075,Train
8,68466926,2015-12-01,0.0,0.107043,1.752382,1.8015,-0.168686,0.072425,0.365801,-0.067006,0.193799,-0.084299,0.106942,-0.044784,-0.264431,-0.176827,0.040141,-0.077618,-0.283219,-0.285198,-0.034501,-0.169441,-0.260049,-0.215452,0.010116,0.310279,0.012941,-0.007931,-0.035045,0.3601,0.111091,0.087131,-0.079388,Train
9,68616873,2015-12-01,0.0,0.206653,0.229718,0.239685,0.160306,-0.155503,0.365801,0.120678,-0.534175,0.123391,0.106942,-0.044784,-0.079886,0.378525,0.09958,0.137563,0.025413,0.131069,-0.034501,0.184122,0.047732,0.161126,0.05701,0.310279,-0.020484,-0.007931,-0.035045,0.168854,0.059959,0.18746,0.037363,Train


# Save Data

In [45]:
df_woe.to_csv('df_total_woe_validations.csv', index=False)