In [None]:
import pandas as pd

# Load the raw loan data into `df`; every later cell depends on it.
try:
    df = pd.read_csv('gs://rob_buck/loan.csv')
except Exception as e:
    print(f"There was an error loading the file\nError: {e}")
    # Re-raise so the run stops here. Swallowing the error would leave `df`
    # undefined (or stale from an earlier run) and push the failure into a
    # later, unrelated-looking cell.
    raise
else:
    print("The dataframe has been loaded successfully!")

In [None]:
# (rows, columns) of the frame as loaded, before any filtering.
df.shape

In [None]:
# Binary target derived from `loan_status`:
#   0 = repaid / performing, 1 = charged off, defaulted or late.
status_to_default = {
    'Fully Paid': 0,
    'Current': 0,
    'In Grace Period': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0,

    'Charged Off': 1,
    'Default': 1,
    'Late (31-120 days)': 1,
    'Late (16-30 days)': 1,
    'Does not meet the credit policy. Status:Charged Off': 1
}
df['is_default'] = df['loan_status'].map(status_to_default)

# Series.map returns NaN for any status that is not a key of the mapping.
# A NaN target cannot be used for stratified splitting or classifier fitting,
# so report such rows explicitly and remove them instead of letting them through.
unlabelled = df['is_default'].isna()
if unlabelled.any():
    unknown_statuses = sorted(df.loc[unlabelled, 'loan_status'].dropna().astype(str).unique())
    print(f"Dropping {int(unlabelled.sum())} rows with unmapped loan_status: {unknown_statuses}")
    df = df.loc[~unlabelled].copy()

# With no missing labels left the target can be a plain integer column.
df['is_default'] = df['is_default'].astype(int)

In [None]:
# Columns kept for modelling. The order of this list becomes the column
# order of the reduced frame, so it is preserved as-is.
key_features = [
    # --- loan terms ---
    'loan_amnt',        # loan amount
    'int_rate',         # interest rate
    'grade',            # LC grade (A-G)
    # --- borrower finances ---
    'annual_inc',       # annual income
    'dti',              # debt-to-income ratio
    'delinq_2yrs',      # delinquencies in the past 2 years
    'revol_util',       # credit utilization
    # --- borrower profile ---
    'emp_length',       # employment length
    'home_ownership',   # home ownership status
    'purpose',          # loan purpose
    # --- target ---
    'is_default',       # binary label derived from loan_status
]

In [None]:
# Keep only the modelling columns. The explicit .copy() makes `df` an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning on a subset of the original frame.
df = df[key_features].copy()

In [None]:
# (rows, columns) after restricting the frame to `key_features`.
df.shape

In [None]:
# Missing-value count per column, before imputation.
df.isnull().sum()

In [None]:
# Fill gaps in the numeric columns with a per-column summary statistic.
# NOTE(review): annual_inc uses the mean while the other numeric columns use
# the median - confirm this difference is intentional.
# NOTE(review): the statistics are computed on the full frame, i.e. before
# the train/test split - confirm that is acceptable for the evaluation.
df['annual_inc'] = df['annual_inc'].fillna(df['annual_inc'].mean())
for median_col in ('dti', 'delinq_2yrs', 'revol_util'):
    df[median_col] = df[median_col].fillna(df[median_col].median())

# Missing employment length becomes its own explicit category.
df['emp_length'] = df['emp_length'].fillna('Unknown')

In [None]:
# Missing-value count per column again, to check the result of the imputation.
df.isnull().sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each value of the target column.
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(figsize=(10, 6))

# The column is referenced by name because `data=df` already supplies the frame.
sns.countplot(data=df, x='is_default', ax=ax)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns in place with LabelEncoder.
# This is label encoding, not one-hot encoding: each column stays a single
# column whose values are replaced by integer codes.
categorical_cols = ['grade', 'emp_length', 'home_ownership', 'purpose']
for cat_col in categorical_cols:
    encoder = LabelEncoder()
    df[cat_col] = encoder.fit_transform(df[cat_col])

df.head()

In [None]:
# Column labels of the frame after encoding.
df.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation.
X = df.drop('is_default', axis=1)
y = df['is_default']

# stratify=y keeps the default / non-default proportion the same in both
# parts; with an imbalanced target a plain random split can shift the
# positive rate between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Show the shapes of the two scaled matrices, one per line.
print(
    f"X_train_scaled shape: {X_train_scaled.shape}",
    f"X_test_scaled shape: {X_test_scaled.shape}",
    sep="\n",
)

In [None]:
from scipy.stats import uniform, randint

# Sampling distributions for the randomized hyper-parameter search.
# scipy conventions: uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_dist = dict(
    learning_rate=uniform(0.01, 0.2),      # [0.01, 0.21]
    n_estimators=randint(300, 2000),       # 300 .. 1999
    max_depth=randint(6, 20),              # 6 .. 19
    min_child_weight=randint(1, 10),       # 1 .. 9
    subsample=uniform(0.6, 0.4),           # [0.6, 1.0]
    colsample_bytree=uniform(0.6, 0.4),    # [0.6, 1.0]
    colsample_bylevel=uniform(0.6, 0.4),   # [0.6, 1.0]
    reg_alpha=uniform(0, 10),              # [0, 10]
    reg_lambda=uniform(1, 10),             # [1, 11]
    gamma=uniform(0, 5),                   # [0, 5]
    # [7, 19]. NOTE(review): the original note says this is meant to sit
    # around a 9:1 class ratio - confirm the upper bound of 19 is intended.
    scale_pos_weight=uniform(7, 12),
)

In [None]:
from xgboost import XGBClassifier
# RandomizedSearchCV is part of scikit-learn's model_selection module; there
# is no top-level module named `RandomizedSearchCV` to import it from.
from sklearn.model_selection import RandomizedSearchCV

# Base classifier whose hyper-parameters are tuned by the randomized search.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases -
# confirm the installed version still expects it.
# NOTE(review): the search object is also created with n_jobs=-1; running both
# levels fully parallel can oversubscribe the CPU - confirm the intended split.
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    tree_method='hist'  # Better for large datasets
)

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation, shuffled with a fixed seed.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search over `param_dist`: 100 sampled candidates, each scored
# by ROC AUC on the stratified folds; train-fold scores are recorded as well.
search_settings = {
    'estimator': xgb_model,
    'param_distributions': param_dist,
    'n_iter': 100,
    'cv': cv_strategy,
    'scoring': 'roc_auc',
    'n_jobs': -1,
    'random_state': 42,
    'return_train_score': True,
}
random_search = RandomizedSearchCV(**search_settings)

In [None]:
# Run the hyper-parameter search on the scaled training split. With the
# settings above this is 100 candidates x 5 folds of model fits, so this is
# the expensive cell of the notebook.
random_search.fit(X_train_scaled, y_train)

In [None]:
# Report the winning parameter set and its mean cross-validated score.
search_summary = (
    ("Best parameters:", random_search.best_params_),
    ("Best cross-validation score:", random_search.best_score_),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# Estimator selected by the search; used below for the test-set predictions.
model = random_search.best_estimator_

In [None]:
# Class probabilities for the held-out rows; column index 1 is kept as the
# score passed to roc_auc_score.
test_class_probs = model.predict_proba(X_test_scaled)
pred_proba = test_class_probs[:, 1]

# Hard class labels for the same rows.
pred = model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Held-out metrics: accuracy from the hard labels, ROC AUC from the scores.
acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, pred_proba)

print(f"Accuracy: {acc:.4f}", f"ROC: {auc:.4f}", sep="\n")
