# Finance Retail Analytics - Credit Risk Assessment

## 1. Introduction
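This notebook analyzes company financial data to predict whether a company will default, defined here as having a net worth that is zero or negative in the following year. The workflow covers data preprocessing, multicollinearity-based feature selection using the Variance Inflation Factor (VIF), and two classifiers: Logistic Regression and Random Forest.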

In [None]:
%load_ext autoreload
%autoreload 2
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Add the parent of the current working directory to the import path so the
# `src` package can be imported. The membership check keeps the cell idempotent:
# re-running it does not keep appending duplicate entries to sys.path.
project_root = os.path.abspath(os.path.join('..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_preprocessing import load_data, clean_column_names, handle_missing_values, scale_features
from src.feature_engineering import calculate_vif, drop_high_vif_features
from src.modeling import train_logistic_regression, get_logistic_predictions, train_random_forest
from src.evaluation import get_performance_metrics, plot_confusion_matrix, plot_roc_curve

## 2. Data Loading and Preprocessing

In [None]:
data_path = '../data/raw/FRA-Comp_Fin_Data.csv'
df = load_data(data_path)
df = clean_column_names(df)

# Define Target: 1 = default (next-year net worth is zero or negative), 0 = otherwise.
# A missing net worth compares False against `> 0`, so without this guard such
# rows would silently be labelled as defaults. Rows with an unknown target are
# removed first, and the index is rebuilt so later positional steps stay aligned.
target_source = 'Networth_Next_Year'
n_missing_target = int(df[target_source].isna().sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} rows with missing {target_source}")
    df = df.dropna(subset=[target_source]).reset_index(drop=True)

df['default'] = np.where(df[target_source] > 0, 0, 1)
df = df.drop([target_source], axis=1) # Drop target proxy

# Basic Cleaning (Dropping strict identifiers if any, similar to original notebook)
# df = df.drop(['Num', 'Equity_face_value'], axis=1, errors='ignore') 

df.head()

## 3. Handling Missing Values
Missing values are filled in using KNN imputation.

In [None]:
# Fill missing values with the project helper; the result is the frame used for
# the train/test split below.
# NOTE(review): `df` still contains the `default` target column and every row
# (the train/test split happens in a later cell), so an imputer fitted inside
# handle_missing_values presumably sees both the target and the future test
# rows — confirm this is intended, or impute after splitting.
df_imputed = handle_missing_values(df)

## 4. Train-Test Split and Scaling

In [None]:
# Hold-out configuration: share of rows reserved for testing and the seed that
# makes the split reproducible.
TEST_SIZE = 0.3
SEED = 42

X = df_imputed.drop('default', axis=1)
y = df_imputed['default']

# Drop non-numeric for VIF/Modeling consistency if any remain
X = X.select_dtypes(include=[np.number])

# Stratify on the target so both partitions keep roughly the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"

# scale_features receives the train and test partitions separately and also
# returns the scaler object it used.
X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

## 5. Feature Engineering (VIF)

In [None]:
# Filter collinear features with the project's VIF helper, using a threshold of 5;
# it returns the reduced train and test frames.
# NOTE(review): presumably the columns to drop are chosen from the training frame
# and the same columns are then removed from the test frame — confirm in
# src.feature_engineering.
X_train_vif, X_test_vif = drop_high_vif_features(X_train_scaled, X_test_scaled, threshold=5)

## 6. Model Training & Evaluation

### Logistic Regression

In [None]:
# Fit the logistic model on the VIF-filtered training features and show its summary.
logit_model = train_logistic_regression(X_train_vif, y_train)
logit_summary = logit_model.summary()
print(logit_summary)

# Score the held-out set; the helper returns both predictions and probabilities.
y_pred_logit, y_prob_logit = get_logistic_predictions(logit_model, X_test_vif)
logit_metrics = get_performance_metrics(y_test, y_pred_logit)
print(logit_metrics)

### Random Forest

In [None]:
# The forest is trained on the full scaled feature set rather than the
# VIF-filtered one: RF handles correlation better, using scaled features
rf_model = train_random_forest(X_train_scaled, y_train)

# Predict on the held-out set and report the metrics.
y_pred_rf = rf_model.predict(X_test_scaled)
rf_metrics = get_performance_metrics(y_test, y_pred_rf)
print(rf_metrics)

## 7. Conclusions
**TODO:** Summarize the findings and business recommendations here.