In [5]:
import numpy as np 
import pandas as pd 

train = pd.read_csv("/Users/mumtaz/Documents/personal/loan-payback-prediction/data/train.csv")
test = pd.read_csv("/Users/mumtaz/Documents/personal/loan-payback-prediction/data/test.csv")

In [6]:
train.isnull().sum()

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [7]:
# Configuration
class Config:
    """Configuration class for hyperparameters and settings"""
    N_SPLITS = 5
    SEED = 42
    TARGET = 'loan_paid_back'
    VERBOSE = True
    
    # Model weights for ensemble (will be optimized later)
    WEIGHTS = {
        'lgb': 0.33,
        'xgb': 0.33,
        'cat': 0.34
    }

config = Config()
print("Configuration loaded successfully!")
print(f"- Number of folds: {config.N_SPLITS}")
print(f"- Random seed: {config.SEED}")
print(f"- Target variable: {config.TARGET}")

Configuration loaded successfully!
- Number of folds: 5
- Random seed: 42
- Target variable: loan_paid_back


In [8]:
# Target distribution
print("TARGET DISTRIBUTION ANALYSIS")

target_counts = train[config.TARGET].value_counts()
target_pct = train[config.TARGET].value_counts(normalize=True) * 100

target_summary = pd.DataFrame({
    'Value': target_counts.index,
    'Count': target_counts.values,
    'Percentage': target_pct.values
})

display(target_summary.style.background_gradient(cmap='Blues'))

TARGET DISTRIBUTION ANALYSIS


Unnamed: 0,Value,Count,Percentage
0,1.0,474494,79.881952
1,0.0,119500,20.118048


# Feature Encoding

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


## Feature Types

In [14]:
# Separate numerical and categorical columns
numerical_cols = train.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols.remove('id')
if config.TARGET in numerical_cols:
    numerical_cols.remove(config.TARGET)

categorical_cols = train.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features ({len(numerical_cols)}):")
for i, col in enumerate(numerical_cols, 1):
    print(f"{i}. {col}")

print(f"\nCategorical features ({len(categorical_cols)}):")
for i, col in enumerate(categorical_cols, 1):
    print(f"{i}. {col}")

print(f"\nTotal features: {len(numerical_cols) + len(categorical_cols)}")

Numerical features (5):
1. annual_income
2. debt_to_income_ratio
3. credit_score
4. loan_amount
5. interest_rate

Categorical features (6):
1. gender
2. marital_status
3. education_level
4. employment_status
5. loan_purpose
6. grade_subgrade

Total features: 11


In [None]:
from sklearn.preprocessing import StandardScaler

# Store target encoding mappings globally
target_encoding_maps = {}

def transform(df, train=True):
    global target_encoding_maps
    
    ### Label Encoding On Ordinal Data
    
    # Binning - Extract first letter part from grade_subgrade to represent subgrade 
    df['grade_subgrade'] = df['grade_subgrade'].str[0]
    education_level_mapping = {"High School":0, "Bachelor's":1, "Master's":2, "PhD":3, "Other":4}
    df['education_level'] = df['education_level'].map(education_level_mapping)
    
    ### Target Encoding
    
    if train:
        for col in categorical_cols:
            # Calculate mean target value for each category
            target_means = df.groupby(col)[config.TARGET].mean()
            # Store the mapping for later use
            target_encoding_maps[col] = target_means
            # Replace original values with target encoding
            df[f"{col}_en"] = df[col].map(target_means)
    else:
        for col in categorical_cols:
            # Use the stored mappings from training data
            df[f"{col}_en"] = df[col].map(target_encoding_maps[col])
            # Handle unseen categories by filling with the overall mean from training
            df[f"{col}_en"] = df[f"{col}_en"].fillna(target_encoding_maps[col].mean())
            
    ## Feature Scaling ##
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    return df

In [6]:
train_en = transform(train.copy())
train_en

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back,gender_en,marital_status_en,employment_status_en,grade_subgrade_en,loan_purpose_en
0,0,-0.705461,0.084,0.993849,-1.803484,0.653899,Female,Single,0,Self-employed,Other,C,1.0,0.801708,0.798873,0.898457,0.847260,0.802377
1,1,-0.977248,0.166,-0.810394,-1.505401,0.280571,Male,Married,2,Employed,Debt consolidation,D,0.0,0.795752,0.799144,0.894145,0.715334,0.796911
2,2,0.050689,0.097,0.236067,0.286558,-1.292385,Male,Single,0,Employed,Debt consolidation,C,1.0,0.795752,0.798873,0.894145,0.847260,0.796911
3,3,-0.050687,0.065,-2.668764,-1.492497,1.863482,Female,Single,0,Employed,Debt consolidation,F,1.0,0.801708,0.798873,0.894145,0.625179,0.796911
4,4,-0.850388,0.053,-0.287163,-0.409421,-1.068388,Male,Married,0,Employed,Other,D,1.0,0.795752,0.799144,0.894145,0.715334,0.802377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593989,593989,-0.943696,0.152,0.398449,0.857295,-0.714971,Female,Single,0,Employed,Business,C,1.0,0.801708,0.798873,0.894145,0.847260,0.813104
593990,593990,-0.483783,0.105,-2.199661,-1.698263,1.126781,Male,Single,1,Employed,Debt consolidation,F,1.0,0.795752,0.798873,0.894145,0.625179,0.796911
593991,593991,-0.041164,0.072,-0.106739,-2.034358,0.882873,Female,Married,1,Employed,Debt consolidation,C,1.0,0.801708,0.799144,0.894145,0.847260,0.796911
593992,593992,1.068296,0.067,1.066019,0.183368,-1.237630,Male,Single,1,Employed,Debt consolidation,B,1.0,0.795752,0.798873,0.894145,0.931959,0.796911


In [7]:
test_en = transform(test.copy(), train=False)
test_en

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,gender_en,marital_status_en,employment_status_en,grade_subgrade_en,loan_purpose_en
0,593994,-0.728006,0.049,-0.989459,-0.513617,1.178469,Female,Single,0,Employed,Other,D,0.801708,0.798873,0.894145,0.715334,0.802377
1,593995,-0.060132,0.093,0.916193,0.068692,0.246668,Female,Married,2,Employed,Other,C,0.801708,0.799144,0.894145,0.847260,0.802377
2,593996,0.251568,0.367,-1.259127,-1.620933,0.464749,Male,Single,1,Employed,Debt consolidation,D,0.795752,0.798873,0.894145,0.715334,0.796911
3,593997,-0.845389,0.110,-0.180456,-1.219629,-1.379028,Female,Single,1,Employed,Debt consolidation,C,0.801708,0.798873,0.894145,0.847260,0.796911
4,593998,-0.863165,0.081,0.125167,0.387183,0.221886,Female,Married,3,Employed,Business,C,0.801708,0.799144,0.894145,0.847260,0.813104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254564,848558,1.669294,0.068,1.131927,2.121775,0.558920,Female,Single,1,Employed,Debt consolidation,B,0.801708,0.798873,0.894145,0.931959,0.796911
254565,848559,0.022957,0.091,-0.845636,0.760974,-1.374071,Female,Married,0,Employed,Debt consolidation,D,0.801708,0.799144,0.894145,0.715334,0.796911
254566,848560,-1.031623,0.096,0.664503,1.642668,-1.661542,Male,Single,2,Employed,Debt consolidation,C,0.795752,0.798873,0.894145,0.847260,0.796911
254567,848561,-0.528750,0.094,1.042038,-0.564766,-1.260074,Male,Single,1,Employed,Business,C,0.795752,0.798873,0.894145,0.847260,0.813104


# Feature Selection

In [8]:
train_en = train_en[numeric_features+['education_level', to_predict]+[f"{i}_en" for i in categorical_cols]]
test_en  = test_en[numeric_features+['education_level']+[f"{i}_en" for i in categorical_cols]]

## Lasso and RFE

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import StandardScaler

# Separate features and target in train_en
X = train_en.drop(columns=[to_predict])
y = train_en[to_predict]

# L1 Logistic Regression for feature selection
lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=5)
lasso.fit(X, y)

# Select features where coef_ != 0
model = SelectFromModel(lasso, prefit=True)
X_lasso_selected = model.transform(X)
selected_features_lasso = X.columns[model.get_support()]

print("Features selected by Lasso:")
print(selected_features_lasso)

# # Recursive Feature Elimination with Logistic Regression
# log_reg = LogisticRegression(max_iter=1000, solver='lbfgs')
# rfe = RFE(estimator=log_reg, n_features_to_select=5)  # Adjust number of features as needed
# rfe.fit(X, y)
# selected_features_rfe = X.columns[rfe.support_]

# print("Features selected by RFE:")
# print(selected_features_rfe)

Features selected by Lasso:
Index(['annual_income', 'credit_score', 'loan_amount', 'interest_rate',
       'education_level', 'gender_en', 'marital_status_en',
       'employment_status_en', 'grade_subgrade_en', 'loan_purpose_en'],
      dtype='object')




# Prediction Test On Training Data

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = train_en[selected_features_lasso]
# X = train_en[selected_features_rfe]
y = train_en[to_predict]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000, solver='lbfgs')

model.fit(X_train, y_train)

# Predict on scaled test data
y_pred = model.predict(X_test)

In [11]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for positive class
y_probs = model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_probs)
print("ROC AUC score with Logistic Regression:", roc_auc)

ROC AUC score with Logistic Regression: 0.8629207377988484


# Logistic Regression Model on Test data

In [12]:
y_pred = model.predict(test_en)

In [13]:
test_en.loc[:, 'id'] = test['id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_en.loc[:, 'id'] = test['id']


In [14]:
output = pd.DataFrame({'id': test_en['id'], 'loan_paid_back': y_pred}) #.to_csv('sample_submission.csv', index=False)

In [15]:
output['loan_paid_back'].value_counts()

loan_paid_back
1.0    222771
0.0     31798
Name: count, dtype: int64