In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib

# Step 1: Load the dataset
df = pd.read_csv('../data/processed/financial_loan_with_late_status.csv')

# Step 2: Data inspection
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())

# Keep only 'Charged Off' and 'Fully Paid' loans. The explicit .copy() makes
# `df` an independent frame, so the column assignments below write to it
# directly instead of to a slice of the frame that was read from disk
# (which can raise SettingWithCopyWarning).
df = df[(df["loan_status"] == "Charged Off") | (df["loan_status"] == "Fully Paid")].copy()

# Handle missing data

# Replace missing values in 'emp_title' with 'Unknown'
df['emp_title'] = df['emp_title'].fillna('Unknown')

# Verify the replacement
print(df['emp_title'].isnull().sum(), "missing values after replacement.")

# List of columns to convert to datetime
date_columns = ['issue_date', 'last_payment_date', 'next_payment_date']

# Convert each column to datetime
for column in date_columns:
    df[column] = pd.to_datetime(df[column])

# Optionally, verify the conversion (info() prints on its own, see above)
df.info()

def encode_loan_status(df, column_name='loan_status'):
    """
    Encode a loan-status column as integers: 'Charged Off' -> 1, 'Fully Paid' -> 0.

    The input frame is never modified: the column is validated first and the
    encoding is then written to a copy, so the function is safe to call on a
    slice or on a frame that is still referenced elsewhere.

    :param df: pandas DataFrame containing ``column_name``.
    :param column_name: Name of the column to encode. Default is 'loan_status'.
    :return: New DataFrame with the encoded loan_status column.
    :raises ValueError: If the column holds anything (missing values included)
        other than 'Charged Off' or 'Fully Paid'.
    """
    encoding_map = {
        'Charged Off': 1,
        'Fully Paid': 0
    }

    encoded = df[column_name].map(encoding_map)

    # Series.map yields NaN for every value absent from the mapping, so any
    # null here means an unexpected (or missing) status in the input.
    if encoded.isnull().any():
        raise ValueError("There are loan statuses that are not 'Charged Off' or 'Fully Paid'. Please handle them accordingly.")

    result = df.copy()
    result[column_name] = encoded
    return result

# Encode the target. The whole frame is passed and the returned frame kept,
# rather than assigning a one-column DataFrame built from a temporary slice
# back into a single column.
df = encode_loan_status(df, column_name='loan_status')
print(df["loan_status"].unique())

# Work on an independent copy: plain assignment would only create a second
# name for the same object, so every change below would also alter `df`.
df1 = df.copy()

# Feature engineering on the date columns (already converted to datetime by
# the loop over `date_columns` earlier in this cell). Each date column yields
# <prefix>_year, _month, _day, _dayofweek, _quarter and _is_weekend, created
# in the same column order as before.
# NOTE(review): last/next payment dates may only be known after the loan
# outcome is decided - confirm these features do not leak the target.
date_features = [
    ('issue_date', 'issue'),
    ('last_payment_date', 'last_payment'),
    ('next_payment_date', 'next_payment'),
]
for date_column, prefix in date_features:
    dates = df1[date_column].dt
    df1[prefix + '_year'] = dates.year
    df1[prefix + '_month'] = dates.month
    df1[prefix + '_day'] = dates.day
    df1[prefix + '_dayofweek'] = dates.dayofweek  # Monday=0, Sunday=6
    df1[prefix + '_quarter'] = dates.quarter
    # Saturday (5) and Sunday (6) are flagged as 1; a vectorized comparison
    # replaces the per-row lambda.
    df1[prefix + '_is_weekend'] = (df1[prefix + '_dayofweek'] >= 5).astype(int)

# Drop the raw date columns and the columns not used as features in one call.
df1 = df1.drop(columns=[
    'issue_date', 'next_payment_date', 'last_payment_date',
    'id', 'delinquent', 'member_id', 'emp_title',
])
print(df1.head())

# Encode categorical variables; the fitted encoders are kept per column so
# the same mapping can be reapplied later.
label_encoders = {}
categorical_columns = df1.select_dtypes(include=['object']).columns

for column in categorical_columns:
    le = LabelEncoder()
    df1[column] = le.fit_transform(df1[column])
    label_encoders[column] = le

# Split the data into features and target
X = df1.drop(columns=['loan_status'])  # Features
y = df1['loan_status']  # Target

# Step 3: Train-test split (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Standardize the features; the scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib

# Step 1: Load the dataset
df = pd.read_csv('../data/processed/financial_loan_with_late_status.csv')

# Step 2: Data inspection
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())

# Filter to include only 'Charged Off' and 'Fully Paid' loans. The explicit
# .copy() makes `df` an independent frame, so the column assignments below
# write to it directly instead of to a slice of the frame read from disk
# (which can raise SettingWithCopyWarning).
df = df[(df["loan_status"] == "Charged Off") | (df["loan_status"] == "Fully Paid")].copy()

# Handle missing data
# Replace missing values in 'emp_title' with 'Unknown'
df['emp_title'] = df['emp_title'].fillna('Unknown')

# Verify the replacement
print(df['emp_title'].isnull().sum(), "missing values after replacement.")

# List of columns to convert to datetime
date_columns = ['issue_date', 'last_payment_date', 'next_payment_date']

# Convert each column to datetime
for column in date_columns:
    df[column] = pd.to_datetime(df[column])

# Optionally, verify the conversion (info() prints on its own, see above)
df.info()

# Function to encode loan status
def encode_loan_status(df, column_name='loan_status'):
    """
    Encode a loan-status column as integers: 'Charged Off' -> 1, 'Fully Paid' -> 0.

    The input frame is never modified: the column is validated first and the
    encoding is then written to a copy, so the function is safe to call on a
    slice or on a frame that is still referenced elsewhere.

    :param df: pandas DataFrame containing ``column_name``.
    :param column_name: Name of the column to encode. Default is 'loan_status'.
    :return: New DataFrame with the encoded loan_status column.
    :raises ValueError: If the column holds anything (missing values included)
        other than 'Charged Off' or 'Fully Paid'.
    """
    encoding_map = {
        'Charged Off': 1,
        'Fully Paid': 0
    }

    encoded = df[column_name].map(encoding_map)

    # Series.map yields NaN for every value absent from the mapping, so any
    # null here means an unexpected (or missing) status in the input.
    if encoded.isnull().any():
        raise ValueError("There are loan statuses that are not 'Charged Off' or 'Fully Paid'. Please handle them accordingly.")

    result = df.copy()
    result[column_name] = encoded
    return result

# Encode 'loan_status' column ('Charged Off' -> 1, 'Fully Paid' -> 0)
df = encode_loan_status(df, column_name='loan_status')

# Feature engineering on the three date columns. Each one yields
# <prefix>_year, _month, _day, _dayofweek, _quarter and _is_weekend, created
# in the same column order as the previous hand-written sections so the
# feature matrix layout is unchanged.
# NOTE(review): last/next payment dates may only be known after the loan
# outcome is decided - confirm these features do not leak the target.
date_features = [
    ('issue_date', 'issue'),
    ('last_payment_date', 'last_payment'),
    ('next_payment_date', 'next_payment'),
]
for date_column, prefix in date_features:
    dates = df[date_column].dt
    df[prefix + '_year'] = dates.year
    df[prefix + '_month'] = dates.month
    df[prefix + '_day'] = dates.day
    df[prefix + '_dayofweek'] = dates.dayofweek  # Monday=0, Sunday=6
    df[prefix + '_quarter'] = dates.quarter
    # Saturday (5) and Sunday (6) are flagged as 1; a vectorized comparison
    # replaces the per-row lambda.
    df[prefix + '_is_weekend'] = (df[prefix + '_dayofweek'] >= 5).astype(int)

# Drop original date columns
df = df.drop(columns=['issue_date', 'next_payment_date', 'last_payment_date'])

# Drop unnecessary columns
df = df.drop(columns=['id', 'delinquent', 'member_id', 'emp_title'])

# Encode categorical variables; the fitted encoders are kept per column so
# they can be saved and reapplied at prediction time.
label_encoders = {}
categorical_columns = df.select_dtypes(include=['object']).columns

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split the data into features and target
X = df.drop(columns=['loan_status'])  # Features
y = df['loan_status']  # Target

# Step 3: Train-test split (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Standardize the features; the scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train the RandomForestClassifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_scaled, y_train)

# Step 6: Evaluate the model on the held-out test split
y_pred = rfc.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the model together with the fitted scaler and label encoders, which
# are needed to preprocess new data the same way before predicting.
joblib.dump(rfc, 'loan_status_predictor.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')