In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the loan-application table from an absolute path.
# NOTE(review): the /content prefix looks like a Colab upload location — confirm
# the path before running this notebook in another environment.
Loan_info = pd.read_csv('/content/loan_applications.csv')

In [None]:
# Preview the first rows of the loan-application table.
Loan_info.head()

In [None]:
# Column dtypes and non-null counts of the loan-application table.
Loan_info.info()

In [None]:
# Column names of the loan-application table.
Loan_info.columns

In [None]:
# Summary statistics of the loan-application table (describe() defaults).
Loan_info.describe()

In [None]:
# Load the transaction table from an absolute path.
# NOTE(review): the /content prefix looks like a Colab upload location — confirm
# the path before running this notebook in another environment.
Transaction_info = pd.read_csv('/content/transactions.csv')

In [None]:
# Preview the first rows of the transaction table.
Transaction_info.head()

In [None]:
# Column dtypes and non-null counts of the transaction table.
Transaction_info.info()

In [None]:
# Column names of the transaction table.
Transaction_info.columns

In [None]:
# Summary statistics of the transaction table (describe() defaults).
Transaction_info.describe()

In [None]:
# Categorical columns of the loan table whose level frequencies are printed below.
object_columns = [
    'loan_type', 'purpose_of_loan', 'employment_status',
    'property_ownership_status', 'gender', 'loan_status', 'fraud_type',
]

# One frequency table per column; dropna=False keeps NaN as its own row.
for column_name in object_columns:
    print(f"\n📊 Value counts for '{column_name}':")
    level_counts = Loan_info[column_name].value_counts(dropna=False)
    print(level_counts)

In [None]:

# Shared seaborn theme and one tall canvas for the 3x2 grid of count plots.
sns.set(style="whitegrid")
plt.figure(figsize=(18, 20))

# One entry per panel, in grid order: (column, palette, title, x-tick rotation).
count_panels = [
    ('loan_type', "Set2", 'Loan Type Distribution', 45),
    ('purpose_of_loan', "Set3", 'Purpose of Loan Distribution', 45),
    ('employment_status', "Set1", 'Employment Status Distribution', 45),
    ('property_ownership_status', "Set2", 'Property Ownership Status', 45),
    ('gender', "coolwarm", 'Gender Distribution', 0),
    ('loan_status', "pastel", 'Loan Status Distribution', 45),
]

for slot, (column, colours, heading, angle) in enumerate(count_panels, start=1):
    plt.subplot(3, 2, slot)
    # Bars run from the most to the least frequent level.
    frequency_order = Loan_info[column].value_counts().index
    sns.countplot(data=Loan_info, x=column, order=frequency_order, palette=colours)
    plt.title(heading)
    plt.xticks(rotation=angle)

plt.tight_layout()
plt.show()

# fraud_type gets its own figure; NaN is left out of the counts.
plt.figure(figsize=(8, 6))
fraud_type_counts = Loan_info['fraud_type'].value_counts(dropna=True)
sns.barplot(x=fraud_type_counts.index, y=fraud_type_counts.values, palette="Reds")
plt.title("Fraud Type Distribution (Excluding NaN)")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import plotly.express as px

def _count_bar(frame, column, title):
    """Return ``(counts, figure)`` for the level frequencies of ``frame[column]``.

    ``counts`` is a two-column frame (``column`` and ``'count'``) built from
    ``value_counts()``, so NaN is excluded and rows run from the most to the
    least frequent level. ``figure`` is a Plotly bar chart of that frame with
    one colour per level; it is returned without being shown.
    """
    counts = frame[column].value_counts().rename_axis(column).reset_index(name='count')
    figure = px.bar(counts, x=column, y='count', title=title,
                    color=column, hover_data=['count'])
    return counts, figure


# Each categorical loan column gets its own interactive bar chart.
loan_type_df, fig1 = _count_bar(Loan_info, 'loan_type', 'Loan Type Distribution')
fig1.show()

purpose_df, fig2 = _count_bar(Loan_info, 'purpose_of_loan', 'Purpose of Loan Distribution')
fig2.show()

employment_df, fig3 = _count_bar(Loan_info, 'employment_status', 'Employment Status Distribution')
fig3.show()

property_df, fig4 = _count_bar(Loan_info, 'property_ownership_status', 'Property Ownership Status')
fig4.show()

gender_df, fig5 = _count_bar(Loan_info, 'gender', 'Gender Distribution')
fig5.show()

loan_status_df, fig6 = _count_bar(Loan_info, 'loan_status', 'Loan Status Distribution')
fig6.show()

# value_counts() drops NaN by default, hence the "Excluding NaN" title.
fraud_df, fig7 = _count_bar(Loan_info, 'fraud_type', 'Fraud Type Distribution (Excluding NaN)')
fig7.show()



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric loan features explored in this and the following cells.
num_cols = [
    'loan_amount_requested',
    'loan_tenure_months',
    'interest_rate_offered',
    'monthly_income',
    'cibil_score',
    'debt_to_income_ratio',
    'applicant_age',
    'number_of_dependents',
]

# Univariate view: histogram with KDE on the left, boxplot on the right.
for col in num_cols:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(Loan_info[col], kde=True, ax=hist_ax, color='skyblue')
    hist_ax.set_title(f'Distribution of {col}')

    sns.boxplot(x=Loan_info[col], ax=box_ax, color='lightgreen')
    box_ax.set_title(f'Boxplot of {col}')

    plt.tight_layout()
    plt.show()

In [None]:
# Bivariate view: spread of each numeric feature, split by fraud_flag.
for col in num_cols:
    plt.figure(figsize=(8, 4))
    ax = sns.boxplot(data=Loan_info, x='fraud_flag', y=col, palette='Set2')
    ax.set_title(f'{col} vs Fraud Flag')
    ax.set_xlabel('Fraud Flag (0=No, 1=Yes)')
    ax.set_ylabel(col)
    plt.tight_layout()
    plt.show()



In [None]:
# Pairwise correlations between the numeric features and fraud_flag.
plt.figure(figsize=(10, 8))
heatmap_columns = num_cols + ['fraud_flag']
corr = Loan_info[heatmap_columns].corr()
ax = sns.heatmap(corr, square=True, annot=True, fmt=".2f", cmap='coolwarm')
ax.set_title('Correlation Heatmap')
plt.tight_layout()
plt.show()


In [None]:
# Map the 0/1 flag to readable names *before* plotting and let seaborn build the
# legend from the hue variable. Overriding captions afterwards with
# plt.legend(labels=[...]) pairs them with the artists in draw order, which need
# not match the hue order, so the "Fraud" / "Non-Fraud" captions can end up swapped.
fraud_names = {0: 'Non-Fraud (0)', 1: 'Fraud (1)'}
kde_frame = Loan_info[num_cols].assign(
    **{'Fraud Flag': Loan_info['fraud_flag'].map(fraud_names)}
)

for col in num_cols:
    plt.figure(figsize=(8, 4))
    sns.kdeplot(
        data=kde_frame,
        x=col,
        hue='Fraud Flag',                       # also becomes the legend title
        hue_order=list(fraud_names.values()),   # fixed colour per class across plots
        fill=True,
        common_norm=False,                      # each class is normalised on its own
        palette='Set1',
    )
    plt.title(f'Distribution of {col} by Fraud Flag')
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.tight_layout()
    plt.show()


In [None]:
# Parse the application date so it can be used as a time axis.
Loan_info['application_date'] = pd.to_datetime(Loan_info['application_date'])

# Applications per date, one column per fraud_flag value; date/flag
# combinations that never occur are filled with 0.
daily_counts = Loan_info.groupby(['application_date', 'fraud_flag']).size()
fraud_trend = daily_counts.unstack().fillna(0)

ax = fraud_trend.plot(figsize=(12, 5), title='Fraud vs Non-Fraud Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Applications')
# Captions follow the column order of fraud_trend (fraud_flag values, sorted).
ax.legend(['Non-Fraud', 'Fraud'])
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so the raw Loan_info frame stays untouched.
loan_encoded = Loan_info.copy()

# 1. One-hot encode the nominal categorical columns. drop_first=True removes the
#    first level of each column, which then acts as the reference category.
one_hot_cols = ['loan_type', 'purpose_of_loan', 'employment_status',
                'property_ownership_status', 'gender']
loan_encoded = pd.get_dummies(loan_encoded, columns=one_hot_cols, drop_first=True)

# 2. Integer codes for 'loan_status' (used as the prediction target further down).
loan_status_order = {
    'Approved': 0,
    'Declined': 1,
    'Fraudulent - Detected': 2,
    'Fraudulent - Undetected': 3
}
# Series.map() turns every value missing from the mapping into NaN, and a later
# cell fills NaN with 0 — i.e. it would silently relabel such rows as 'Approved'.
# Fail loudly here instead.
status_missing = int(loan_encoded['loan_status'].isna().sum())
status_unmapped = set(loan_encoded['loan_status'].dropna().unique()) - set(loan_status_order)
if status_missing or status_unmapped:
    raise ValueError(
        f"loan_status has {status_missing} missing value(s) and unmapped "
        f"value(s) {sorted(map(str, status_unmapped))}; extend loan_status_order "
        "or clean the data before encoding."
    )
loan_encoded['loan_status'] = loan_encoded['loan_status'].map(loan_status_order)

# 3. Label-encode 'fraud_type'; missing values become their own 'Unknown' class.
#    The fitted encoder `le` is reused by predict_loan_status().
le = LabelEncoder()
loan_encoded['fraud_type'] = loan_encoded['fraud_type'].fillna('Unknown')
loan_encoded['fraud_type'] = le.fit_transform(loan_encoded['fraud_type'])



In [None]:
# get_dummies may emit boolean indicator columns; store them as 0/1 integers.
bool_cols = loan_encoded.select_dtypes(include='bool').columns
loan_encoded = loan_encoded.astype({name: int for name in bool_cols})

In [None]:
# Preview the encoded loan frame.
loan_encoded.head()

In [None]:
# Dtypes and non-null counts after encoding the loan table.
loan_encoded.info()

In [None]:
# Categorical columns of the transaction table (this rebinds object_columns).
object_columns = [
    'transaction_type', 'merchant_category', 'device_used',
    'transaction_status', 'transaction_notes',
]

# One frequency table per column; dropna=False keeps NaN as its own row.
for column_name in object_columns:
    print(f"\n Value counts for '{column_name}':")
    level_counts = Transaction_info[column_name].value_counts(dropna=False)
    print(level_counts)



In [None]:
import pandas as pd
import plotly.express as px

# Level frequencies as two-column frames (<column>, 'count'), most frequent first.
tx_type_df, merchant_cat_df, device_df, status_df = (
    Transaction_info[name].value_counts().rename_axis(name).reset_index(name='count')
    for name in ('transaction_type', 'merchant_category', 'device_used', 'transaction_status')
)

# Transaction notes: only the ten most frequent values, for readability.
notes_df = (
    Transaction_info['transaction_notes']
    .value_counts()
    .nlargest(10)
    .rename_axis('transaction_notes')
    .reset_index(name='count')
)

# (frame, column, title) for every bar chart, in display order.
bar_specs = [
    (tx_type_df, 'transaction_type', 'Transaction Type Distribution'),
    (merchant_cat_df, 'merchant_category', 'Merchant Category Distribution'),
    (device_df, 'device_used', 'Device Used Distribution'),
    (status_df, 'transaction_status', 'Transaction Status Distribution'),
    (notes_df, 'transaction_notes', 'Top 10 Transaction Notes'),
]

fig1, fig2, fig3, fig4, fig5 = (
    px.bar(frame, x=name, y='count', title=heading, color=name, hover_data=['count'])
    for frame, name, heading in bar_specs
)

for figure in (fig1, fig2, fig3, fig4, fig5):
    figure.show()

In [None]:
# Same merchant-category counts as the bar chart, drawn as a treemap whose
# tile size and colour both follow 'count'.
treemap_options = dict(
    path=['merchant_category'],
    values='count',
    color='count',
    title='Merchant Category Distribution (Treemap)',
)
fig = px.treemap(merchant_cat_df, **treemap_options)
fig.show()

In [None]:
# Transactions per device, coloured by status and drawn as one facet per status.
device_status_options = dict(
    x='device_used',
    color='transaction_status',
    facet_col='transaction_status',
    barmode='group',
    title='Device Used vs Transaction Status',
)
fig = px.histogram(Transaction_info, **device_status_options)
fig.show()

In [None]:
import plotly.express as px

# Distribution of transaction amounts (nbins=50 is an upper bound on the bin count).
fig = px.histogram(
    Transaction_info,
    x='transaction_amount',
    nbins=50,
    color_discrete_sequence=['indianred'],
    title='Transaction Amount Distribution',
)
fig.show()

In [None]:
# Mean of fraud_flag per value of is_international_transaction, i.e. the share
# of flagged transactions in each group.
fraud_rate = (
    Transaction_info
    .groupby('is_international_transaction')['fraud_flag']
    .mean()
    .reset_index()
)

# Readable group names instead of 0/1 on the x axis.
origin_names = {0: 'Domestic', 1: 'International'}
fraud_rate['is_international_transaction'] = (
    fraud_rate['is_international_transaction'].map(origin_names)
)

fig = px.bar(
    fraud_rate,
    x='is_international_transaction',
    y='fraud_flag',
    text='fraud_flag',
    color='is_international_transaction',
    labels={'fraud_flag': 'Fraud Rate'},
    title='Fraud Rate: International vs Domestic Transactions',
)
# Print each rate above its bar as a percentage with two decimals.
fig.update_traces(texttemplate='%{text:.2%}', textposition='outside')
fig.show()

In [None]:
# Distribution of the account balance recorded after each transaction.
fig = px.histogram(
    Transaction_info,
    x='account_balance_after_transaction',
    nbins=50,
    color_discrete_sequence=['mediumseagreen'],
    title='Account Balance After Transaction Distribution',
)
fig.show()

In [None]:
import plotly.express as px

# Mean transaction amount for each fraud_flag value.
avg_txn_by_fraud = (
    Transaction_info
    .groupby('fraud_flag')['transaction_amount']
    .mean()
    .reset_index()
)

# Readable class names instead of 0/1 on the x axis.
flag_names = {0: 'Non-Fraud', 1: 'Fraud'}
avg_txn_by_fraud['fraud_flag'] = avg_txn_by_fraud['fraud_flag'].map(flag_names)

fig = px.bar(
    avg_txn_by_fraud,
    x='fraud_flag',
    y='transaction_amount',
    text='transaction_amount',
    color='fraud_flag',
    labels={'transaction_amount': 'Avg Transaction Amount'},
    title='Average Transaction Amount: Fraud vs Non-Fraud',
)
# Print each mean above its bar with two decimals.
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.show()

In [None]:
# Number of transactions per value of is_international_transaction.
intl_df = (
    Transaction_info['is_international_transaction']
    .value_counts()
    .rename_axis('is_international_transaction')
    .reset_index(name='count')
)

fig = px.bar(
    intl_df,
    x='is_international_transaction',
    y='count',
    text='count',
    color='is_international_transaction',
    title='International vs Domestic Transactions',
)
fig.show()

In [None]:
# Number of transactions per fraud_flag value (this rebinds fraud_df).
fraud_df = (
    Transaction_info['fraud_flag']
    .value_counts()
    .rename_axis('fraud_flag')
    .reset_index(name='count')
)

fig = px.bar(
    fraud_df,
    x='fraud_flag',
    y='count',
    text='count',
    color='fraud_flag',
    title='Fraud Flag Distribution',
)
fig.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Categorical transaction columns handled in this cell.
categorical_cols = [
    'transaction_type',
    'merchant_category',
    'device_used',
    'transaction_status',
    'transaction_notes',
]

# 1. Integer codes for 'transaction_status', stored as a new column on the raw
#    frame. LabelEncoder numbers the classes in sorted order.
#    NOTE(review): the original comment calls this column binary — confirm.
label_enc = LabelEncoder()
label_enc.fit(Transaction_info['transaction_status'])
Transaction_info['transaction_status_encoded'] = label_enc.transform(
    Transaction_info['transaction_status']
)

# 2. One-hot encode every other categorical column; drop_first=True removes the
#    first level of each as the reference category.
onehot_cols = [name for name in categorical_cols if name != 'transaction_status']
transaction_df = pd.get_dummies(Transaction_info, columns=onehot_cols, drop_first=True)

# Check result
transaction_df.head()



In [None]:
# get_dummies may emit boolean indicator columns; store them as 0/1 integers.
bool_cols = transaction_df.select_dtypes(include='bool').columns
transaction_df = transaction_df.astype({name: int for name in bool_cols})

In [None]:
# Dtypes and non-null counts after encoding the transaction table.
transaction_df.info()

| Feature                                  | What it means                                |
| ---------------------------------------- | -------------------------------------------- |
| `transaction_amount_mean`                | Avg amount per transaction                   |
| `transaction_amount_std`                 | Volatility in amounts                        |
| `transaction_amount_max/min`             | Spending range                               |
| `transaction_amount_sum`                 | Total spending volume                        |
| `account_balance_after_transaction_mean` | Avg account balance after transaction        |
| `account_balance_after_transaction_std`  | Volatility in post-transaction balance       |
| `is_international_transaction_mean`      | Fraction of international transactions       |
| `fraud_flag_max`                         | Whether any transaction was flagged as fraud |
| `transaction_status_encoded_mean`        | Mean of the label-encoded transaction status (a success rate only if the status is binary and `Success` is encoded as 1) |


In [None]:
# Collapse the transaction table to one row per customer. Named aggregation
# gives the flat '<column>_<statistic>' names directly.
agg_txn = (
    transaction_df
    .groupby('customer_id')
    .agg(
        transaction_amount_mean=('transaction_amount', 'mean'),
        transaction_amount_std=('transaction_amount', 'std'),
        transaction_amount_max=('transaction_amount', 'max'),
        transaction_amount_min=('transaction_amount', 'min'),
        transaction_amount_sum=('transaction_amount', 'sum'),
        account_balance_after_transaction_mean=('account_balance_after_transaction', 'mean'),
        account_balance_after_transaction_std=('account_balance_after_transaction', 'std'),
        is_international_transaction_mean=('is_international_transaction', 'mean'),
        fraud_flag_max=('fraud_flag', 'max'),
        transaction_status_encoded_mean=('transaction_status_encoded', 'mean'),
    )
    .reset_index()
)

In [None]:
# Preview the per-customer transaction aggregates.
agg_txn.head()

In [None]:
# Attach the per-customer transaction aggregates to every loan application.
# The left join keeps applications whose customer has no transactions; their
# aggregate columns are NaN at this point.
merged_df = loan_encoded.merge(agg_txn, how='left', on='customer_id')

In [None]:
# Preview the merged loan + transaction-aggregate frame.
merged_df.head()

In [None]:
# (rows, columns) of the merged frame.
merged_df.shape

In [None]:
# Dtypes and non-null counts of the merged frame, before NaNs are filled.
merged_df.info()

In [None]:
# Replace every remaining NaN with 0 (e.g. the aggregate columns of customers
# with no transaction history).
merged_df = merged_df.fillna(0)

In [None]:
# Dtypes and non-null counts of the merged frame after filling NaNs with 0.
merged_df.info()

In [None]:
# Class balance of the prediction target, from the raw loan table.
loan_status_df = (
    Loan_info['loan_status']
    .value_counts()
    .rename_axis('loan_status')
    .reset_index(name='count')
)

fig6 = px.bar(
    loan_status_df,
    x='loan_status',
    y='count',
    color='loan_status',
    hover_data=['count'],
    title='Loan Status Distribution',
)
fig6.show()

In [None]:
# Identifier, date and free-text address columns that are not used as model inputs.
cols_to_drop = [
    'application_id',
    'customer_id',
    'application_date',
    'residential_address'
]

# Columns that describe the outcome being predicted. The target's classes
# include 'Fraudulent - Detected' and 'Fraudulent - Undetected', so keeping the
# loan-level fraud label and fraud sub-type as features lets the model read the
# answer off its inputs (target leakage) and inflates every reported score.
leakage_cols = ['fraud_flag', 'fraud_type']
# NOTE(review): 'fraud_flag_max' (whether any transaction was flagged) may leak
# the outcome as well — confirm whether it is known at application time.

y = merged_df['loan_status']  # Target column (integer class codes)
X = merged_df.drop(columns=cols_to_drop + leakage_cols + ['loan_status'])  # Features only

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# of loan_status the same in both splits, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (seeded for repeatability).
# NOTE(review): the model cells in this notebook fit on X_train / y_train, not
# on X_train_res / y_train_res — confirm whether the resampled data is meant to
# be used.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:
# (rows, columns) of the resampled training features.
X_train_res.shape

In [None]:
# Number of resampled training labels.
y_train_res.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Class codes and their display names, both ordered by code. Passing these
# explicitly keeps every axis label attached to the right class regardless of
# the dict's insertion order, and keeps the matrix/report at one row per class
# even if a class is absent from the test split or the predictions.
class_codes = sorted(loan_status_order.values())
class_names = sorted(loan_status_order, key=loan_status_order.get)

# Train the model; class_weight='balanced' re-weights classes inversely to
# their frequency in y_train.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)

# Predictions on both splits, to compare fit against generalisation.
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"✅ Train Accuracy: {train_acc:.4f}")
print(f"✅ Test Accuracy : {test_acc:.4f}")

# Confusion matrix on the test split (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, y_test_pred, labels=class_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Per-class precision / recall / F1 on the test split.
print("\n📋 Classification Report:")
print(classification_report(y_test, y_test_pred,
                            labels=class_codes, target_names=class_names))

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Class codes and their display names, both ordered by code, so plot and report
# labels never depend on the dict's insertion order.
loan_status_codes = sorted(loan_status_order.values())
loan_status_labels = sorted(loan_status_order, key=loan_status_order.get)

# Initialize and train the model.
# - `use_label_encoder` is omitted: it is deprecated and unused in current
#   XGBoost releases.
# - `scale_pos_weight` is omitted: it only re-weights the positive class of a
#   binary objective and was left at its default of 1 anyway.
# - `num_class` follows the label map instead of a hard-coded 4.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=len(loan_status_order),
    eval_metric='mlogloss',
    random_state=42
)
xgb.fit(X_train, y_train)

# Predictions on both splits, to compare fit against generalisation.
y_train_pred_xgb = xgb.predict(X_train)
y_test_pred_xgb = xgb.predict(X_test)

# Accuracy
train_acc_xgb = accuracy_score(y_train, y_train_pred_xgb)
test_acc_xgb = accuracy_score(y_test, y_test_pred_xgb)

print(f"✅ Train Accuracy: {train_acc_xgb:.4f}")
print(f"✅ Test Accuracy : {test_acc_xgb:.4f}")

# Confusion matrix on the test split (rows = actual, columns = predicted);
# `labels` keeps one row/column per class even if a class is absent.
cm_xgb = confusion_matrix(y_test, y_test_pred_xgb, labels=loan_status_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Greens',
            xticklabels=loan_status_labels,
            yticklabels=loan_status_labels)
plt.title("Confusion Matrix - XGBoost")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Per-class precision / recall / F1 on the test split.
print("\n📋 Classification Report:")
print(classification_report(y_test, y_test_pred_xgb,
                            labels=loan_status_codes, target_names=loan_status_labels))

In [None]:
import joblib

# Save trained XGBoost model; predict_loan_status() reloads it from this file.
joblib.dump(xgb, "loan_status_model_xgb.pkl")

# Save any other transformers or encoders if needed
# joblib.dump(scaler, "scaler.pkl")  # Optional if you're using scaling
# The status-name -> integer-code mapping is stored as well, so predicted codes
# can be turned back into names.
# NOTE(review): predict_loan_status() also reads `le` and `one_hot_cols` from
# the notebook namespace; those objects are not persisted here.
joblib.dump(loan_status_order, "loan_status_label_map.pkl")


In [None]:
def predict_loan_status(new_loan_data, new_transaction_data):
    """Predict the loan-status name for the first row of ``new_loan_data``.

    The input frames are preprocessed like the training data (transaction
    aggregation per customer, categorical encoding, left merge on
    ``customer_id``, NaN -> 0) and then aligned to the exact feature columns
    the saved model was trained on.

    Parameters
    ----------
    new_loan_data : pandas.DataFrame
        Raw loan application rows. Must contain ``customer_id`` and the columns
        listed in ``one_hot_cols``. It is not modified.
    new_transaction_data : pandas.DataFrame
        Raw transaction rows. Must contain ``customer_id``, the aggregated
        numeric columns, and either ``transaction_status`` or an existing
        ``transaction_status_encoded`` column. It is not modified.

    Returns
    -------
    str
        The loan-status name predicted for the first loan row.

    Raises
    ------
    ValueError
        If the saved model carries no feature names, or if a transaction status
        was not seen when ``label_enc`` was fitted.

    Notes
    -----
    Besides the two files written with joblib, the function relies on objects
    from the notebook namespace: ``le`` and ``label_enc`` (fitted encoders) and
    ``one_hot_cols``.
    """
    import pandas as pd
    import joblib

    # 1. Load model and label map; invert the map to decode predicted codes.
    model = joblib.load("loan_status_model_xgb.pkl")
    label_map = joblib.load("loan_status_label_map.pkl")
    reverse_label_map = {v: k for k, v in label_map.items()}

    # Column names, in order, that the booster was trained with.
    feature_names = model.get_booster().feature_names
    if feature_names is None:
        raise ValueError("The saved model has no feature names to align the input to.")

    # 2. Aggregate the transactions per customer, as done for training. Raw
    #    transactions only carry the textual status, so derive the encoded
    #    column with the fitted encoder when it is missing.
    txn = new_transaction_data.copy()
    if 'transaction_status_encoded' not in txn.columns:
        txn['transaction_status_encoded'] = label_enc.transform(txn['transaction_status'])
    agg_txn = txn.groupby('customer_id').agg({
        'transaction_amount': ['mean', 'std', 'max', 'min', 'sum'],
        'account_balance_after_transaction': ['mean', 'std'],
        'is_international_transaction': 'mean',
        'fraud_flag': 'max',
        'transaction_status_encoded': 'mean'
    }).reset_index()
    agg_txn.columns = ['customer_id'] + [f'{col[0]}_{col[1]}' for col in agg_txn.columns[1:]]

    # 3. Encode the loan data on a copy so the caller's frame is left untouched.
    loan = new_loan_data.copy()
    if 'fraud_type' in loan.columns:
        fraud_type = loan['fraud_type'].fillna('Unknown')
        # Labels the encoder never saw are treated like a missing fraud type.
        fraud_type = fraud_type.where(fraud_type.isin(le.classes_), 'Unknown')
        loan['fraud_type'] = le.transform(fraud_type)
    # No drop_first here: with a single row it would remove every dummy column.
    # The reference categories dropped during training are removed by the
    # reindex below, which keeps only the model's columns.
    loan = pd.get_dummies(loan, columns=one_hot_cols)

    # 4. Merge while 'customer_id' is still present, then align to the training
    #    columns: extras are dropped, and dummies absent from this input and
    #    missing aggregates become 0, as in training.
    final_input = pd.merge(loan, agg_txn, on='customer_id', how='left')
    final_input = (
        final_input
        .reindex(columns=feature_names, fill_value=0)
        .fillna(0)
        .astype(float)
    )

    # 5. Predict and decode the first row's class code.
    prediction = model.predict(final_input)
    prediction_label = reverse_label_map[int(prediction[0])]

    return prediction_label



In [None]:
import pandas as pd

# Example loan application (one row) in the raw loan-table layout.
new_loan_data = pd.DataFrame([{
    'application_id': 'c8bf0bea-70e6-4870-9125-41b8210c527f',
    'customer_id': 'CUST109427',
    'application_date': '2023-04-09',
    'loan_type': 'Business Loan',
    'loan_amount_requested': 604000.0,
    'loan_tenure_months': 12,
    'interest_rate_offered': 11.66,
    'purpose_of_loan': 'Medical Emergency',
    'employment_status': 'Retired',
    'monthly_income': 34700.0,
    'cibil_score': 714,
    'existing_emis_monthly': 1100.0,
    'debt_to_income_ratio': 3.17,
    'property_ownership_status': 'Rented',
    'residential_address': '94/31, Sehgal Zila, Vadodara-380521, Anantapur, Uttarakhand, 918055',
    'applicant_age': 28,
    'gender': 'Female',
    'number_of_dependents': 3,
    'fraud_flag': 0,
    # Left missing: predict_loan_status() fills it with 'Unknown' before applying
    # the fitted LabelEncoder. The encoder was fitted on string labels, so a
    # numeric placeholder such as 0 is an unseen label.
    'fraud_type': None
}])

# Example transaction (one row) in the raw transaction-table layout.
# NOTE(review): this customer_id differs from the loan row's, so the left merge
# finds no transactions for the applicant and every aggregate becomes 0 —
# confirm whether the ids are meant to match.
new_transaction_data = pd.DataFrame([{
    'transaction_id': '2d7ddfd4-1112-4354-a2dd-fad94b45a850',
    'customer_id': 'CUST102188',
    'transaction_date': '2022-01-04 11:27:00',
    'transaction_type': 'Bill Payment',
    'transaction_amount': 2100.0,
    'merchant_category': 'Dining',
    'merchant_name': 'Rattan, Khanna and Magar',
    'transaction_location': 'Srikakulam, Himachal Pradesh',
    'account_balance_after_transaction': 8211.0,
    'is_international_transaction': 0,
    'device_used': 'Web',
    'ip_address': '31.102.21.141',
    'transaction_status': 'Success',
    'transaction_source_destination': 'BCHG80012468703731',
    'transaction_notes': 'Payment for Dining',
    'fraud_flag': 0
}])


In [None]:
# Run the saved model on the example rows and print the decoded status name.
predicted_status = predict_loan_status(new_loan_data, new_transaction_data)
print("🟢 Predicted Loan Status:", predicted_status)