In [None]:
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import poisson

# Seed both generators: NumPy's global RNG drives the array draws in this cell,
# and the stdlib `random` module is used by pick_platforms() and
# generate_platform_tenure(). Statement order below determines the data.
np.random.seed(42)
random.seed(42)

# Number of synthetic applicants (rows) to generate
n = 5000

# ---------------------------
# 1. Basic Applicant Information
# ---------------------------
# Sequential IDs 1..n
applicant_id = np.arange(1, n + 1)
# Integer ages; randint's upper bound is exclusive, so values fall in 22..59
age = np.random.randint(22, 60, n)
# Education level drawn with fixed probabilities 0.40 / 0.45 / 0.15
education_level = np.random.choice(
    ["High School", "Graduate", "Postgraduate"],
    n,
    p=[0.4, 0.45, 0.15]
)

# ---------------------------
# 2. Gig Work Details
# ---------------------------
# Pool of gig platforms that pick_platforms() samples from (10 distinct names)
all_platforms = [
    "Swiggy", "Zomato", "Rapido", "Ola", "Uber",
    "Amazon Flex", "Dunzo", "UrbanClap", "Fiverr", "Upwork"
]

def pick_platforms():
    """Return a list of 1-3 distinct platform names for one applicant.

    The list size is drawn from NumPy's global RNG (1, 2 or 3 with
    probabilities 0.6 / 0.3 / 0.1); the names themselves are sampled without
    replacement from ``all_platforms`` using the stdlib ``random`` module.
    """
    how_many = np.random.choice([1, 2, 3], p=[0.6, 0.3, 0.1])
    chosen = random.sample(all_platforms, how_many)
    return chosen

# One platform list per applicant (each call consumes RNG state, so order matters)
gig_platforms_list = [pick_platforms() for _ in range(n)]
# Comma-separated string form of each applicant's platforms
gig_platforms = [", ".join(platforms) for platforms in gig_platforms_list]
# Number of platforms per applicant (1, 2 or 3, as produced by pick_platforms)
num_platforms = [len(platforms) for platforms in gig_platforms_list]
# Upper bound is exclusive, so values fall in 0..14
work_experience = np.random.randint(0, 15, n)  # in years

# ---------------------------
# 3. Financial Information with Seasonal Effects
# ---------------------------
# Base monthly income (INR): normal(30000, 10000) truncated to int, then
# clipped to the range [5000, 100000]
base_income = np.random.normal(loc=30000, scale=10000, size=n).astype(int)
base_income = np.clip(base_income, 5000, 100000)

# Seasonal multiplier: one of 15 evenly spaced values between 0.8 and 1.5
seasonal_variation = np.random.choice(np.linspace(0.8, 1.5, 15), n)

# Monthly income = base * seasonal factor * (1 + 10% per platform beyond the
# first); the product is truncated to int
monthly_income = (base_income * seasonal_variation * (1 + 0.1 * (np.array(num_platforms) - 1))).astype(int)

# Income volatility: 5%-30% of monthly income (uniform), truncated to int
income_volatility = (monthly_income * np.random.uniform(0.05, 0.3, n)).astype(int)

# Savings balance: 6 months of income times a uniform factor in [0.2, 0.5),
# truncated to int and clipped to [0, 500000]
savings_balance = (monthly_income * 6 * np.random.uniform(0.2, 0.5, n)).astype(int)
savings_balance = np.clip(savings_balance, 0, 500000)

# Number of existing loans: 0-3 with probabilities 0.70 / 0.20 / 0.08 / 0.02
existing_loans = np.random.choice([0, 1, 2, 3], size=n, p=[0.7, 0.2, 0.08, 0.02])

# Debt-to-income ratio: uniform base in [0.1, 0.3) scaled up 20% per existing
# loan. With at most 3 loans the product stays below 0.48, so the [0.1, 0.6]
# clip acts only as a safety bound. Note that DTI is not derived from income.
dti_base = np.random.uniform(0.1, 0.3, n)
debt_to_income_ratio = np.clip(dti_base * (1 + 0.2 * existing_loans), 0.1, 0.6)

# Requested loan amount (INR): 70% of rows are micro loans in 10000..49999,
# the rest are regular loans in 50000..499999; the combined array is shuffled
# in place so the two groups are interleaved
micro_loans = np.random.randint(10000, 50000, int(n * 0.7))
regular_loans = np.random.randint(50000, 500000, n - int(n * 0.7))
loan_amount_requested = np.concatenate([micro_loans, regular_loans])
np.random.shuffle(loan_amount_requested)

# ---------------------------
# 4. Credit Score Calculation (CIBIL-like)
# ---------------------------
# Synthetic score: base of 300, plus 1 point per 100 INR of monthly income,
# minus 50 * DTI, plus normal(0, 30) noise.
credit_score = (
    300 +
    (monthly_income / 100) -
    (debt_to_income_ratio * 50) +
    np.random.normal(0, 30, n)
)
# Clip to the 300-900 range and truncate to int
credit_score = np.clip(credit_score, 300, 900).astype(int)

# ---------------------------
# 5. Behavioral and Economic Factors
# ---------------------------
# All randint draws below use an exclusive upper bound and are independent of
# the other features.
transaction_frequency = np.random.randint(10, 100, n)  # 10..99
avg_monthly_expenses = np.random.randint(5000, 70000, n)  # 5000..69999
credit_card_utilization = np.random.randint(10, 90, n)  # 10..89
subscription_services = np.random.randint(0, 5, n)  # 0..4
financial_emergencies_last_year = np.random.randint(0, 5, n)  # 0..4
# Inflation rate: uniform in [3, 7.8), rounded to 2 decimals. The original
# note attributes the 7.8 cap to RBI data; that cannot be verified from this file.
inflation_rate = np.round(np.random.uniform(3, 7.8, n), 2)

# ---------------------------
# 6. Additional Untraditional Parameters
# ---------------------------
# Candidate loan purposes; each applicant gets one, drawn uniformly
loan_reasons = [
    "Vehicle Purchase", "Medical Emergency", "Education", 
    "Home Renovation", "Debt Consolidation", "Business Expansion", "Other"
]
reason_for_loan = np.random.choice(loan_reasons, n)

def generate_platform_ratings(platforms):
    """Build a ``"name:rating; name:rating"`` string for the given platforms.

    Each platform gets one rating drawn uniformly from [3.0, 5.0) via NumPy's
    global RNG and rounded to one decimal. Platforms are rated in the order
    given; an empty input yields an empty string.
    """
    rating_by_platform = {}
    for name in platforms:
        rating_by_platform[name] = round(np.random.uniform(3.0, 5.0), 1)
    parts = (f"{name}:{score}" for name, score in rating_by_platform.items())
    return "; ".join(parts)

# Per-applicant "platform:rating; ..." strings (one rating per platform)
platform_ratings = [generate_platform_ratings(platforms) for platforms in gig_platforms_list]
# Feedback score: parse the ratings back out of the string, average them,
# scale by 20 (rating 3-5 maps to 60-100), add uniform noise in [-5, 5) and
# round to one decimal
customer_feedback_score = [
    round(np.mean([float(r.split(":")[1]) for r in ratings.split("; ")]) * 20 + np.random.uniform(-5, 5), 1)
    for ratings in platform_ratings
]
# Integer 1..7 (upper bound exclusive); the unit is not stated in this file
work_consistency = np.random.randint(1, 8, n)
# Penalties: a Poisson(1) draw reduced by score/100; int() truncates toward
# zero and max() floors the result at 0
penalties = [max(0, int(np.random.poisson(1) - (score / 100))) for score in customer_feedback_score]
# Binary flags: 30% "Yes" for alternative income, 20% "Yes" for co-applicant
alternative_income_source = np.random.choice(["Yes", "No"], n, p=[0.3, 0.7])
loan_coapplicant = np.random.choice(["Yes", "No"], n, p=[0.2, 0.8])

# ---------------------------
# 7. Enhanced Geographic and Additional Features
# ---------------------------
# States an applicant can be assigned to; the probabilities passed to
# np.random.choice below are listed in the same order and sum to 1
indian_states = [
    'Maharashtra', 'Karnataka', 'Delhi', 'Tamil Nadu', 'Uttar Pradesh',
    'Gujarat', 'West Bengal', 'Telangana', 'Rajasthan', 'Bihar'
]
location = np.random.choice(indian_states, n, p=[0.18, 0.15, 0.12, 0.1, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05])
# Probability that an applicant is labelled 'Urban' (the remainder is 'Rural')
urban_ratio = 0.35
urban_rural = np.random.choice(['Urban', 'Rural'], n, p=[urban_ratio, 1 - urban_ratio])

def generate_platform_tenure(platforms):
    """Return one random tenure per platform, each an int in 3..60 inclusive.

    Uses the stdlib ``random`` module; one draw is made per element of
    ``platforms``, in order. The time unit is not stated in this file.
    """
    tenures = []
    for _ in platforms:
        tenures.append(random.randint(3, 60))
    return tenures

# Tenure list per applicant (one value per platform) and its mean
platform_tenures = [generate_platform_tenure(platforms) for platforms in gig_platforms_list]
avg_platform_tenure = [np.mean(tenures) for tenures in platform_tenures]

# Dependents: Poisson with mean 1.5, capped at 5
family_dependents = poisson.rvs(mu=1.5, size=n)
family_dependents = np.clip(family_dependents, 0, 5)

# Fixed cost-of-living multiplier per state; every state in indian_states has
# an entry, so the lookup below cannot raise KeyError
cost_of_living_index = {
    'Maharashtra': 1.15, 'Karnataka': 1.1, 'Delhi': 1.25,
    'Tamil Nadu': 1.05, 'Uttar Pradesh': 0.95, 'Gujarat': 1.0,
    'West Bengal': 0.9, 'Telangana': 1.07, 'Rajasthan': 0.93, 'Bihar': 0.85
}
# Per-applicant multiplier looked up from the applicant's state
cost_of_living = np.array([cost_of_living_index[state] for state in location])

# ---------------------------
# 8. Loan Approval Outcome with Logical Checks
# ---------------------------
def determine_loan_approval(i):
    """Return 1 if applicant ``i`` satisfies every approval rule, else 0.

    Reads the module-level feature arrays/lists built earlier in this cell
    (credit_score, debt_to_income_ratio, savings_balance,
    loan_amount_requested, customer_feedback_score, work_consistency,
    loan_coapplicant) at position ``i``.

    Rules (all must hold):
      * credit score above 650, and at least 500
      * debt-to-income ratio below 0.4
      * savings above 20% of the requested amount
      * customer feedback score above 70
      * work consistency of at least 3
      * without a co-applicant, credit score above 700
    """
    score = credit_score[i]

    # A co-applicant waives the stricter 700 threshold.
    if loan_coapplicant[i] == "No":
        coapplicant_ok = score > 700
    else:
        coapplicant_ok = True

    checks = (
        score > 650,
        debt_to_income_ratio[i] < 0.4,
        savings_balance[i] > (loan_amount_requested[i] * 0.2),
        customer_feedback_score[i] > 70,
        work_consistency[i] >= 3,
        score >= 500,  # very low scores are always rejected
        coapplicant_ok,
    )
    return 1 if all(checks) else 0

# Apply the approval rules to every applicant index 0..n-1 (list of 0/1 ints)
loan_approved = list(map(determine_loan_approval, range(n)))

# ---------------------------
# 9. Create Final DataFrame and Introduce Missing Values
# ---------------------------
# Assemble every generated feature into one frame. Keyword order defines the
# column order; the only renamed field is cost_of_living -> cost_of_living_index.
df = pd.DataFrame(dict(
    applicant_id=applicant_id,
    age=age,
    education_level=education_level,
    gig_platforms=gig_platforms,
    num_platforms=num_platforms,
    work_experience=work_experience,
    monthly_income=monthly_income,
    seasonal_variation=seasonal_variation,
    income_volatility=income_volatility,
    savings_balance=savings_balance,
    debt_to_income_ratio=debt_to_income_ratio,
    credit_score=credit_score,
    existing_loans=existing_loans,
    loan_amount_requested=loan_amount_requested,
    transaction_frequency=transaction_frequency,
    avg_monthly_expenses=avg_monthly_expenses,
    credit_card_utilization=credit_card_utilization,
    subscription_services=subscription_services,
    financial_emergencies_last_year=financial_emergencies_last_year,
    inflation_rate=inflation_rate,
    reason_for_loan=reason_for_loan,
    platform_ratings=platform_ratings,
    customer_feedback_score=customer_feedback_score,
    work_consistency=work_consistency,
    penalties=penalties,
    alternative_income_source=alternative_income_source,
    loan_coapplicant=loan_coapplicant,
    location=location,
    urban_rural=urban_rural,
    avg_platform_tenure=avg_platform_tenure,
    family_dependents=family_dependents,
    cost_of_living_index=cost_of_living,
    loan_approved=loan_approved,
))

# ---------------------------
# 10. Introduce Missing Values (Enhanced Pattern)
# ---------------------------
# Fraction of rows to blank out per column. The target column loan_approved
# is deliberately not listed, so labels stay complete.
missing_config = {
    'education_level': 0.08,
    'work_experience': 0.08,
    'monthly_income': 0.12,
    'savings_balance': 0.12,
    'credit_score': 0.12,
    'avg_monthly_expenses': 0.12,
    'urban_rural': 0.05,
    'family_dependents': 0.03
}

for col, ratio in missing_config.items():
    # Rows are chosen independently per column; df.sample draws from NumPy's
    # global RNG, so the selection is reproducible under the seed set above.
    missing_idx = df.sample(frac=ratio).index
    # Series.mask returns a new, properly upcast column (int64 -> float64 with
    # NaN). Writing NaN into an int64 column via .loc relies on silent
    # upcasting, which newer pandas versions deprecate.
    df[col] = df[col].mask(df.index.isin(missing_idx))

# ---------------------------
# 11. Validation Checks (Sanity Checks)
# ---------------------------
# 1. Inflation rate must stay at or below the 7.8 upper bound used when sampling
assert df["inflation_rate"].max() <= 7.8, "Inflation rate exceeds RBI cap."

# 2. Roughly 70% of requested loans should be micro-loans (under ₹50k)
micro_proportion = df["loan_amount_requested"].lt(50000).mean()
assert 0.65 <= micro_proportion <= 0.75, "Micro-loan proportion out of range."

# 3. Spot-check the state lookup: Delhi rows (if any) must average about 1.25
is_delhi = df["location"].eq("Delhi")
if is_delhi.any():
    delhi_cost = df.loc[is_delhi, "cost_of_living_index"].mean()
    assert 1.2 < delhi_cost < 1.3, "Delhi cost of living anomaly."

# ---------------------------
# 12. Save Final Enhanced Dataset
# ---------------------------
# Write the generated dataset (including the injected NaNs) to CSV without the index.
# NOTE(review): absolute, machine-specific path; to_csv does not create missing
# directories, so this fails unless the Artifacts folder already exists.
df.to_csv("C:/test/gig_loan_approval_predictor/Artifacts/indian_gig_loan_data.csv", index=False)

: 

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
# Heuristic: an applicant with zero years of work experience AND a missing
# (NaN) monthly income is treated as suspected fraud.
suspected_fraud = df['work_experience'].eq(0) & df['monthly_income'].isnull()

# Suspected-fraud rows are force-rejected and flagged in their own column.
df.loc[suspected_fraud, 'loan_approved'] = 0
df['fraud_flag'] = suspected_fraud.astype(int)

In [None]:
# Split the data into fraud and non-fraud partitions.
# Explicit .copy() makes each partition an independent frame: later cells
# assign into them, which on a filtered slice raises SettingWithCopyWarning
# (and is unreliable under pandas Copy-on-Write).
non_fraud_data = df[df['fraud_flag'] == 0].copy()
fraud_data = df[df['fraud_flag'] == 1].copy()

In [None]:
# Suspected-fraud rows have no reported income: record it as 0
fraud_data = fraud_data.assign(monthly_income=fraud_data['monthly_income'].fillna(0))

In [None]:
# Fill missing monthly_income / work_experience in the non-fraud partition
# with that column's median.
# The columns are reassigned rather than filled with
# `frame[col].fillna(..., inplace=True)`: that chained-inplace form operates
# on a temporary Series and has no effect under pandas Copy-on-Write.
# The former third statement,
#   non_fraud_data[non_fraud_data['work_experience']==0].fillna(..., inplace=True)
# filled a boolean-mask copy and never changed non_fraud_data, so it is omitted.
non_fraud_data = non_fraud_data.assign(
    monthly_income=non_fraud_data['monthly_income'].fillna(
        non_fraud_data['monthly_income'].median()
    ),
    work_experience=non_fraud_data['work_experience'].fillna(
        non_fraud_data['work_experience'].median()
    ),
)

In [None]:
# Recombine the partitions: non-fraud rows first, then fraud rows.
# ignore_index=True renumbers the index 0..len-1, so the original row order
# and index labels are not preserved.
df=pd.concat([non_fraud_data,fraud_data],ignore_index=True)

In [None]:

# Flag first-time applicants: zero years of work experience and no existing loans
df['first_time_applicant'] = (
    df['work_experience'].eq(0) & df['existing_loans'].eq(0)
).astype(int)

In [None]:
# First-time applicants get the sentinel credit score -1 (overwriting any
# generated score). The sentinel stays in the column for later steps.
is_first_time = df['first_time_applicant'].eq(1)
df.loc[is_first_time, 'credit_score'] = -1

In [None]:
# Fill missing values in savings_balance, avg_monthly_expenses and
# credit_score with each column's median.
# Reassigning the column replaces `df[col].fillna(..., inplace=True)`, a
# chained-inplace call that warns in pandas 2.x and has no effect under
# Copy-on-Write.
# NOTE(review): the credit_score median includes any -1 sentinel values
# already present in the column.
for column in ('savings_balance', 'avg_monthly_expenses', 'credit_score'):
    df[column] = df[column].fillna(df[column].median())

In [None]:
# Fill missing values in urban_rural, family_dependents and education_level
# with each column's most frequent value (first mode if there are ties).
# Reassigning the column replaces `df[col].fillna(..., inplace=True)`, a
# chained-inplace call that warns in pandas 2.x and has no effect under
# Copy-on-Write.
for column in ('urban_rural', 'family_dependents', 'education_level'):
    df[column] = df[column].fillna(df[column].mode()[0])

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

In [None]:
# Split each "name:rating; name:rating" string into one column per platform
# slot (columns 0, 1, 2); applicants with fewer platforms get missing values.
# Note: this rebinds `platform_ratings`, which earlier held a list of strings.
platform_ratings = df['platform_ratings'].str.split('; ', expand=True)

# Turn every "name:rating" cell into a one-entry dict {name: rating_string};
# missing cells become {}. Series.map applied column by column replaces
# DataFrame.applymap, which is deprecated since pandas 2.1, and works on both
# old and new pandas versions.
platform_ratings_dict = platform_ratings.apply(
    lambda column: column.map(lambda x: dict([x.split(':')]) if pd.notna(x) else {})
)

In [None]:
platform_ratings

In [None]:
platform_ratings_dict

In [None]:
def calculate_avg(ratings):
    """Return the mean of a dict's values (converted to float).

    Returns None when ``ratings`` is not a dict or is an empty dict.
    """
    if not isinstance(ratings, dict) or not ratings:
        return None
    numeric_values = [float(value) for value in ratings.values()]
    return sum(numeric_values) / len(numeric_values)

# Average rating per applicant across the (up to three) platform slots.
# calculate_avg yields None for empty slots; the float cast turns those into
# NaN, which mean(axis=1) skips. Series.map per column replaces the
# deprecated DataFrame.applymap.
platform_ratings_dict['avg_platform_rating'] = (
    platform_ratings_dict[[0, 1, 2]]
    .apply(lambda column: column.map(calculate_avg))
    .astype(float)
    .mean(axis=1)
)

In [None]:
# Drop the per-slot dict columns, keeping only the derived column(s)
platform_ratings_dict = platform_ratings_dict.drop(columns=[0, 1, 2])
platform_ratings_dict

In [None]:
# Attach the average platform rating to df. The original cell called
# pd.concat(...) without assigning the result, so the new column was only
# displayed and never reached df or the saved files. Column assignment aligns
# on the index (both frames share df's index) and is safe to re-run.
df['avg_platform_rating'] = platform_ratings_dict['avg_platform_rating']
df.head()

In [None]:
# Write df as it stands at this point (imputed values, helper flag columns
# still present) to CSV without the index.
# NOTE(review): absolute, machine-specific path; to_csv does not create missing
# directories, so this fails unless the Artifacts folder already exists.
df.to_csv("C:/test/gig_loan_approval_predictor/Artifacts/gig_loan_processed_analysis.csv", index=False)

In [None]:
# Remove identifier, helper-flag and free-text columns that are not used as
# model features
non_feature_columns = [
    'first_time_applicant', 'fraud_flag', 'applicant_id',
    'gig_platforms', 'platform_ratings', 'location',
]
df = df.drop(columns=non_feature_columns)

In [None]:
df.columns

In [None]:
df.dtypes

# Convert every object-dtype (string) column to pandas' 'category' dtype.
# This is only a dtype change; the values are still the original strings.
cat = df.select_dtypes(include='object')
df = df.astype({name: 'category' for name in cat.columns})

In [None]:
# Correlation of every numeric feature with the target.
# Only numeric columns are passed to corr(): since pandas 2.0 corr() no longer
# drops non-numeric columns silently and raises on the string-valued category
# columns still present at this point.
correlation = df.select_dtypes(include='number').corr()['loan_approved'].to_frame()
plt.figure(figsize=(40, 30))
sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Encode categorical columns as integers: education is ordinal (0-2), the
# remaining columns are binary. Values missing from a mapping become NaN, so
# this cell must not be run twice on the same frame.
value_encodings = {
    'education_level': {'High School': 0, 'Graduate': 1, 'Postgraduate': 2},
    'urban_rural': {'Urban': 1, 'Rural': 0},
    'alternative_income_source': {'Yes': 1, 'No': 0},
    'loan_coapplicant': {'Yes': 1, 'No': 0},
}
for column, mapping in value_encodings.items():
    df[column] = df[column].map(mapping)

In [None]:
# Cast the encoded columns (and family_dependents) to plain integers
integer_columns = [
    'education_level',
    'alternative_income_source',
    'loan_coapplicant',
    'urban_rural',
    'family_dependents',
]
df = df.astype({column: 'int' for column in integer_columns})

In [None]:
# One-hot encode reason_for_loan. drop_first=True omits the indicator for the
# first category, which is then represented by all remaining dummies being 0.
df = pd.get_dummies(df, columns=['reason_for_loan'], drop_first=True)

In [None]:
df

In [None]:
df.dtypes

In [None]:
df.corr()

In [None]:
correlation

In [None]:
df.skew()

In [None]:
# Apply log(1 + x) to the columns selected for skew reduction.
# loan_coapplicant is 0/1 at this point, so for that column the transform
# only rescales the value 1 to log(2).
log_transformed_columns = [
    'savings_balance',
    'existing_loans',
    'loan_amount_requested',
    'penalties',
    'num_platforms',
    'loan_coapplicant',
]
for column in log_transformed_columns:
    df[column] = np.log1p(df[column])

In [None]:
df.skew()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import PowerTransformer


In [None]:
# Target vector and feature matrix (every column except the target)
y = df['loan_approved']
X = df.drop(columns=['loan_approved'])

In [None]:
# Stratified 80/20 split so both sets keep the approval rate of the full data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Take explicit copies before assigning into the split frames; assigning a
# column on the frames returned by train_test_split can raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Yeo-Johnson transform of credit_score: fitted on the training set only and
# then applied to the test set, so no test information leaks into the fit.
# The (n, 1) output is flattened to 1-D before being stored as a column.
pt = PowerTransformer(method='yeo-johnson')
X_train['credit_score'] = np.asarray(pt.fit_transform(X_train[['credit_score']])).ravel()
X_test['credit_score'] = np.asarray(pt.transform(X_test[['credit_score']])).ravel()

# Oversample the minority class in the training set only; the test set keeps
# its original class balance.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Random forest with a fixed seed; depth is capped at 10 levels per tree
rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}
rf = RandomForestClassifier(**rf_params)

rf.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out test split
y_pred = rf.predict(X_test)

In [None]:
# Compute all metrics on the held-out test set first, then report them
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(report_text)

# Rows are true classes, columns are predicted classes
print("\nConfusion Matrix:")
print(confusion)