In [2]:
import pandas as pd
import numpy as np

In [None]:
# Seed NumPy's global generator once. Every draw below consumes that shared
# stream, so the order of the statements in this block determines the data.
np.random.seed(42)
n = 5000  # number of synthetic applicants

age = np.random.randint(18, 70, size=n)  # upper bound is exclusive: ages 18..69

marital_status = np.random.choice(
    ["Married", "Single", "Divorced", "Widowed"],
    size=n,
    p=[0.25, 0.25, 0.25, 0.25],
)


def _draw_employment_status(applicant_age):
    """Draw one employment status; "Retired" is only possible from age 63 up."""
    if applicant_age < 63:
        return np.random.choice(
            ["Employed", "Unemployed", "Self-Employed"], p=[0.6, 0.1, 0.3]
        )
    return np.random.choice(
        ["Retired", "Employed", "Self-Employed"], p=[0.3, 0.4, 0.3]
    )


# One independent draw per applicant, in the same order as `age`.
employment_status = np.array([_draw_employment_status(a) for a in age])

# Alternative income: multiples of 50,000 below 5,000,000, each equally weighted.
_alt_income_options = np.arange(0, 5000000, 50000)
alternative_income = np.random.choice(
    _alt_income_options,
    size=n,
    p=[1 / len(_alt_income_options)] * len(_alt_income_options),
)

education_level = np.random.choice(
    ["High School", "Bachelor's", "Master's", "PhD"],
    size=n,
    p=[0.15, 0.35, 0.35, 0.15],
)

# Loan amounts: multiples of 50,000 from 50,000 up to 4,950,000.
loan_amount = np.random.choice(np.arange(50000, 5000000, 50000), size=n)

_loan_purposes = [
    "Loan granted to financial institutions",
    "Mortgage loan",
    "Agricultural loan",
    "Loan granted to individuals",
    "Commercial and industrial loan",
    "Leasing finance",
    "Consumer loan",
]
loan_purpose = np.random.choice(
    _loan_purposes,
    size=n,
    p=[0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.16],
)

# randint's upper bound is exclusive, so the scores drawn here span 300..849.
FICO_score = np.random.randint(300, 850, size=n)
total_existing_debt = np.random.randint(0, 1000000, size=n)
debt_to_income_ratio = np.round(np.random.uniform(0.1, 0.6, size=n), 2)
num_open_credit_lines = np.random.randint(1, 5, size=n)  # values 1..4
collateral = np.random.choice(["Yes", "No"], size=n, p=[0.7, 0.3])
previous_defaults = np.random.randint(0, 5, size=n)  # values 0..4
loan_term = np.random.choice(
    [12, 24, 36, 48, 60], size=n, p=[0.1, 0.2, 0.3, 0.2, 0.2]
)
interest_rate = np.round(np.random.uniform(9, 20, size=n), 2)


def _draw_income(status):
    """Draw an income for one applicant based on employment status.

    Unemployed applicants get 0 without consuming a random number; retirees draw
    from 36,000..99,500 in steps of 500; everyone else draws from
    150,000..2,990,000 in steps of 20,000.
    """
    if status == "Unemployed":
        return 0
    if status != "Retired":
        return np.random.choice(np.arange(150000, 3000000, 20000))
    return np.random.choice(np.arange(36000, 100000, 500))


income = np.array([_draw_income(emp) for emp in employment_status])


def generate_employment_duration(age, start_age=18):
    """Draw a random whole number of years of employment for one applicant.

    Parameters
    ----------
    age : int
        The applicant's age.
    start_age : int, default 18
        Earliest age at which employment is assumed to begin.

    Returns
    -------
    int
        A value drawn uniformly from ``0 .. age - start_age`` (inclusive) using
        NumPy's global generator, or 0 when ``age`` is below ``start_age`` (no
        random number is consumed in that case).
    """
    longest_career = age - start_age
    # randint's upper bound is exclusive, hence the +1 to make the cap reachable.
    return 0 if longest_career < 0 else np.random.randint(0, longest_career + 1)


# One independent duration draw per applicant, in the same order as `age`.
employment_duration = list(map(generate_employment_duration, age))


def assess_creditworthiness(row):
    """Assign a creditworthiness rating to a single applicant record.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row[key]`` lookups, e.g. a ``pandas.Series``
        produced by ``DataFrame.apply(..., axis=1)`` or a plain ``dict``. It must
        provide the keys ``"FICO_Score"``, ``"Collateral"``,
        ``"Debt_to_Income_Ratio"``, ``"Num_Open_Credit_Lines"``,
        ``"Employment_Status"`` and ``"Previous_Defaults"``.

    Returns
    -------
    str
        One of ``"Very High"``, ``"High"``, ``"Medium"``, ``"Low"`` or
        ``"Very Low"``.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the required keys.
    """
    # FICO band. Each band is bounded only from above, so the bands are
    # contiguous and a non-integer score (e.g. 669.5) cannot fall between them.
    fico_score = row["FICO_Score"]
    if fico_score < 580:
        fico_category = "Very Low"
    elif fico_score < 670:
        fico_category = "Low"
    elif fico_score < 740:
        fico_category = "Medium"
    elif fico_score < 800:
        fico_category = "High"
    else:
        fico_category = "Very High"

    # Column names must match the DataFrame exactly ("Collateral", not
    # "collateral"); a mismatched key raises KeyError inside DataFrame.apply.
    has_collateral = row["Collateral"] == "Yes"

    # Debt-to-income band: above 0.5 is high, 0.3..0.5 moderate, below 0.3 low.
    dti = row["Debt_to_Income_Ratio"]
    if dti > 0.5:
        dti_status = "High"
    elif dti >= 0.3:
        dti_status = "Moderate"
    else:
        dti_status = "Low"

    # Risk from the number of open credit lines: more than 3 is high, exactly 3
    # is medium, fewer is low.
    open_lines = row["Num_Open_Credit_Lines"]
    if open_lines > 3:
        risk_factor = "High"
    elif open_lines == 3:
        risk_factor = "Medium"
    else:
        risk_factor = "Low"

    # Any status other than employed / self-employed counts as unstable.
    if row["Employment_Status"] in ["Employed", "Self-Employed"]:
        employment_stability = "Stable"
    else:
        employment_stability = "Unstable"

    # More than two previous defaults marks the applicant as high risk.
    default_status = "High Risk" if row["Previous_Defaults"] > 2 else "Low Risk"

    # Decision cascade: the first matching rule wins.
    if (
        fico_category == "Very High"
        and dti_status == "Low"
        and employment_stability == "Stable"
        and default_status == "Low Risk"
        and risk_factor == "Low"
        and has_collateral
    ):
        return "Very High"
    if (
        fico_category == "High"
        and dti_status != "High"
        and employment_stability == "Stable"
        and default_status == "Low Risk"
        and risk_factor == "Low"
    ):
        return "High"
    if (
        fico_category == "Medium"
        and dti_status != "High"
        and employment_stability == "Stable"
    ):
        return "Medium"
    if (
        fico_category == "Low"
        or dti_status == "High"
        or default_status == "High Risk"
    ):
        return "Low"
    # NOTE(review): every remaining row is rated "Very Low". That includes rows
    # in the "Very High" or "High" FICO band that miss a single condition of
    # their own rule, while a "Very Low" FICO band with a high DTI is rated
    # "Low" by the rule above. Confirm this ordering is intended.
    return "Very Low"


# Pools of given and family names used to label the synthetic applicants.
# Some entries repeat ("Hasmik" and "Nune" below, "Hakobyan" in the family
# names); a repeated entry is proportionally more likely to be drawn.
first_names = [
    "Arman", "Gor", "Hayk", "Tigran", "Narek", "Karen", "Levon",
    "Vardan", "Sargis", "David", "Trdat", "Aram", "Artur", "Ashot",
    "Gagik", "Hovhannes", "Ruben", "Vahan", "Samvel", "Ara", "Edgar",
    "Gevorg", "Hakob", "Suren", "Vahagn", "Areg", "Arsen", "Ani",
    "Mariam", "Sona", "Lilit", "Anahit", "Hasmik", "Nune", "Armine",
    "Elina", "Tatev", "Arpi", "Mari", "Lusine", "Liana", "Narine",
    "Ruzanna", "Gayane", "Hasmik", "Meline", "Anush", "Karine", "Nune",
]
last_names = [
    "Harutyunyan", "Avetisyan", "Poghosyan", "Markosyan", "Vardanyan",
    "Gevorgyan", "Nazaryan", "Hakobyan", "Sargsyan", "Hakobyan",
    "Petrosyan", "Avagyan", "Karapetyan", "Mkrtchyan", "Hovhannisyan",
    "Ghazaryan", "Melkonyan",
]


def _random_full_name():
    """Draw a given name, then a family name, and join them with one space."""
    given_name = np.random.choice(first_names)
    family_name = np.random.choice(last_names)
    return f"{given_name} {family_name}"


# One name per applicant; names are drawn independently, so repeats can occur.
full_names = [_random_full_name() for _ in range(n)]

# Gather the generated feature arrays under their final column names; the
# insertion order of this dict is the column order of the DataFrame and CSV.
applicant_columns = {
    "Full_Name": full_names,
    "Age": age,
    "Marital_Status": marital_status,
    "Employment_Status": employment_status,
    "Income_AMD": income,
    "Education_Level": education_level,
    "Loan_Amount_AMD": loan_amount,
    "Loan_Purpose": loan_purpose,
    "Collateral": collateral,
    "FICO_Score": FICO_score,
    "Debt_to_Income_Ratio": debt_to_income_ratio,
    "Num_Open_Credit_Lines": num_open_credit_lines,
    "Previous_Defaults": previous_defaults,
    "Loan_Term": loan_term,
    "Interest_Rate": interest_rate,
    "Employment_Duration_Years": employment_duration,
    "Alternative_Income_AMD": alternative_income,
}
df = pd.DataFrame(applicant_columns)

# Rate each applicant row by row; the rating is appended as the last column.
df["Creditworthiness"] = df.apply(assess_creditworthiness, axis="columns")

# Persist the dataset without the positional index column.
df.to_csv("armenian_credit_data.csv", index=False)
print("Data has been saved to armenian_credit_data.csv")

        
    


Data has been saved to armenian_credit_data.csv


In [10]:
# Tally how many applicants received each creditworthiness rating
# (value_counts sorts the ratings by descending frequency).
creditworthiness_counts = df["Creditworthiness"].value_counts()
print(creditworthiness_counts)

Creditworthiness
Low          2769
Very Low     1417
Medium        455
High          249
Very High     110
Name: count, dtype: int64


In [11]:
# Preview the first five rows of the generated dataset.
df.head(5)

Unnamed: 0,Full_Name,Age,Marital_Status,Employment_Status,Income_AMD,Education_Level,Loan_Amount_AMD,Loan_Purpose,Collateral,FICO_Score,Debt_to_Income_Ratio,Num_Open_Credit_Lines,Previous_Defaults,Loan_Term,Interest_Rate,Employment_Duration_Years,Alternative_Income_AMD,Creditworthiness
0,Gagik Markosyan,56,Widowed,Employed,950000,Master's,4000000,Mortgage loan,Yes,433,0.18,3,2,60,10.31,4,1750000,Very Low
1,Elina Sargsyan,69,Single,Employed,1150000,High School,4700000,Agricultural loan,Yes,358,0.47,7,0,24,13.12,41,800000,Very Low
2,Gayane Poghosyan,46,Widowed,Self-Employed,2330000,Bachelor's,2850000,Leasing finance,No,389,0.42,9,4,60,14.12,25,500000,Low
3,Hasmik Melkonyan,32,Married,Employed,1730000,Master's,2350000,Loan granted to financial institutions,No,369,0.26,2,3,48,14.26,6,4500000,Low
4,Vahan Markosyan,60,Divorced,Self-Employed,230000,Master's,1800000,Agricultural loan,Yes,484,0.42,4,0,24,12.29,28,4950000,Very Low
