In [21]:
import os
import importlib
import pandas as pd
import Python as py
import numpy as np
importlib.reload(py)

<module 'Python' from '/app/Task_2/Python.py'>

In [22]:
root = os.getcwd()
df_scheduled = pd.read_csv(root + "/data/scheduled_loan_repayments.csv")
df_actual = pd.read_csv(root + "/data/actual_loan_repayments.csv")
df_balances = py.calculate_df_balances(df_scheduled, df_actual)

# Question 1

In [15]:
def question_1(df_balances):
    """
    Calculate the percent of loans that defaulted as per the type 1 default definition.

    Args:
        df_balances (DataFrame): Dataframe created from the 'calculate_df_balances()' function

    Returns:
        float: The percentage of type 1 defaulted loans (ie 50.0 not 0.5)

    """
    
    
    def check_repayments(group):
        """
        Check if there was any missed repayment in the loan's history.

        A loan is flagged as defaulted (type 1) if at least one row has ActualRepayment < ScheduledRepayment.

        Args:
            group (DataFrame): A group of repayments for a single loan (LoanID).

        Returns:
            bool: True if defaulted, False otherwise.
        """
        return (group['ActualRepayment'] < group['ScheduledRepayment']).any()

    # Apply loan default type 1 check to each set of repayments for each loan
    df = df_balances.groupby('LoanID').apply(check_repayments, include_groups=False)
    
    # Calculate default type 1 rate as percentage
    default_rate_percent = df.mean()*100

    return default_rate_percent

question_1(df_balances)

15.0

# Question 2

In [18]:
def question_2(df_scheduled, df_balances):
    """
    Calculate the percent of loans that defaulted as per the type 2 default definition

    Args:
        df_balances (DataFrame): Dataframe created from the 'calculate_df_balances()' function
        df_scheduled (DataFrame): Dataframe created from the 'scheduled_loan_repayments.csv' dataset

    Returns:
        float: The percentage of type 2 defaulted loans (ie 50.0 not 0.5)

    """

    # Calculate the total repayment required per loan per year (monthly payment * 12)
    scheduled_total = df_scheduled.groupby('LoanID')['ScheduledRepayment'].first() * 12 # 12 months in a year

    print(df_scheduled.groupby('LoanID')['ScheduledRepayment'].first())

    # Calculate the actual repayment for each loan (there are only 12 months represented in the data)
    actual_total = df_balances.groupby('LoanID')['ActualRepayment'].sum() # Total payments

    # Calculate the difference
    unpaid = scheduled_total - actual_total

    # Clip instances where actual payment exceeded scheduled - overpayed
    unpaid = unpaid.clip(lower=0)

    # Calculate for each loan the percentage unpaid
    unpaid_percentage = (unpaid / scheduled_total) * 100

    # Identify defaulted loans (Type 2) (unpaid > 15%)
    default = unpaid_percentage > 15

    # Calculate default type 2 rate as percentage
    default_rate_percent = default.mean() * 100

    return default_rate_percent

question_2(df_scheduled,df_balances)

LoanID
1       2399.54
2       5306.67
3       4199.19
4       3553.16
5       1338.20
         ...   
996     1015.19
997     5122.09
998     1245.91
999     5306.67
1000    2768.70
Name: ScheduledRepayment, Length: 1000, dtype: float64


1.2

# Question 3

In [13]:
def question_3(df_balances):
    """
    Calculate the anualized portfolio CPR (As a %) from the geometric mean SMM.
    SMM is calculated as: (Unscheduled Principal)/(Start of Month Loan Balance)
    SMM_mean is calculated as (∏(1+SMM))^(1/12) - 1
    CPR is calcualted as: 1 - (1- SMM_mean)^12

    Definitions: (Had Homework to do) from: https://www.investopedia.com/terms/a/amortization.asp
    Unscheduled principal refers to any amount paid towards the principal balance of a loan that exceeds the regular, scheduled payments. 
    Scheduled Principal: This is the regular monthly payment of principal that a borrower is required to make according to the loan agreement. 
    Unscheduled Principal: This is any additional amount paid towards the principal balance beyond the scheduled payment.
    Principle Balance: The principal balance of a loan is the outstanding amount of money you still owe, excluding interest and fees

    To calculate Unsheduled Principle:
    1. Identify the Scheduled Principal:
    This is the regular, required amount of principal that the borrower is obligated to pay each month. - ScheduledRepayment
    2. Identify the Actual Principal Paid:
    This is the total amount of principal paid in the period, including any extra amounts paid beyond the scheduled amount. 
    3. Calculate the Unscheduled Principal:
    Subtract the scheduled principal from the actual principal paid.

    Args:
        df_balances (DataFrame): Dataframe created from the 'calculate_df_balances()' function

    Returns:
        float: The anualized CPR of the loan portfolio as a percent.

    """
    # Copying df_balances to a different frame
    df = df_balances.copy()

    # Principal Payment = Total Monthly Payment - (Outstanding Loan Balance * (Interest Rate / 12)) => InterestPayment
    df['PrincipalPayment'] = df['ActualRepayment'] - df['InterestPayment']

    # Payment towards interest is included in the sheduled montly payment
    # Scheduled principal = scheduled total repayment - interest portion
    df['ScheduledPrincipal'] = df['ScheduledRepayment'] - df['InterestPayment']

    # Unscheduled principal = any additional payment towards principal
    df['UnscheduledPrincipal'] = df['PrincipalPayment'] - df['ScheduledPrincipal']

    # Calculate SMM
    df['SMM'] = df['UnscheduledPrincipal'] / df['LoanBalanceStart']

    # Calculate geometric mean of (1 + SMM)
    smm_product = (1 + df['SMM']).prod()

    smm_mean = smm_product ** (1 / 12) - 1

    # Calculate CPR as annualized prepayment rate
    cpr = 1 - (1 - smm_mean) ** 12
    cpr_percent = cpr * 100

    return cpr_percent

question_3(df_balances)

-229010586008.01517

# Question 4

In [20]:
def question_4(df_balances):
    """
    Calculate the predicted total loss for the second year in the loan term.
    Use the equation: probability_of_default * total_loan_balance * (1 - recovery_rate).
    The probability_of_default value must be taken from either your question_1 or question_2 answer.
    Decide between the two answers based on which default definition you believe to be the more useful metric.
    Assume a recovery rate of 80%

    A type 1 default occurs on a loan when any scheduled monthly repayment is not met in full. - Strict, not as realistic
    A type 2 default occurs on a loan when more than 15% of the expected total payments are unpaid for the year. - More leniant, more realistic - better picture

    Args:
        df_balances (DataFrame): Dataframe created from the 'calculate_df_balances()' function

    Returns:
        float: The predicted total loss for the second year in the loan term.

    """
    # Calculate the total repayment required per loan per year (monthly payment * 12)
    scheduled_total = df_balances.groupby('LoanID')['ScheduledRepayment'].first() * 12 # 12 months in a year

    # Calculate the actual repayment for each loan (there are only 12 months represented in the data)
    actual_total = df_balances.groupby('LoanID')['ActualRepayment'].sum() # Total payments

    # Calculate the difference
    unpaid = scheduled_total - actual_total

    # Clip instances where actual payment exceeded scheduled - overpayed
    unpaid = unpaid.clip(lower=0)

    # Calculate for each loan the percentage unpaid
    unpaid_percentage = (unpaid / scheduled_total) * 100

    # Identify defaulted loans (Type 2) (unpaid > 15%)
    default = unpaid_percentage > 15

    # Calculate default type 2 rate as percentage
    default_rate_percent = default.mean()

    # Assume start of year 2 is equivalent to end of year 1 (It is) so use loan balance end
    total_balance = df_balances[df_balances['Month'] == 12]['LoanBalanceEnd'].sum()

    recovery_rate = 0.80

    total_loss = default_rate_percent * total_balance * (1 - recovery_rate)

    return total_loss

question_4(df_balances)

78365.85352799998

In [8]:
# Basic overview
print(df_balances.info())
print(df_balances.describe())

# Check for missing values
print(f'Checking for missing vals:\n {df_balances.isna().sum()}')

# Loan duration analysis
loan_months = df_balances.groupby('LoanID')['Month'].nunique()
print("Loans with fewer than 12 months of data:")
print(loan_months[loan_months < 12])

# Check if any negative balances exist
print("Negative LoanBalanceStart:", (df_balances['LoanBalanceStart'] < 0).sum())
print("Negative LoanBalanceEnd:", (df_balances['LoanBalanceEnd'] < 0).sum())


# Check how many repayments per loanid
repayment_counts = df_balances.groupby('LoanID').size().reset_index(name='RepaymentCount')

# Preview a few
print(repayment_counts.head())

# Summary statistics
print(repayment_counts['RepaymentCount'].describe())

# Count how many loans have full 12 months of repayments
full_loans = repayment_counts[repayment_counts['RepaymentCount'] == 12].shape[0]
print(f"Loans with 12 repayments: {full_loans}")

# Count loans with fewer than 12 repayments
incomplete_loans = repayment_counts[repayment_counts['RepaymentCount'] < 12].shape[0]
print(f"Loans with <12 repayments: {incomplete_loans}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RepaymentID         12000 non-null  float64
 1   LoanID              12000 non-null  int64  
 2   Month               12000 non-null  int64  
 3   ActualRepayment     12000 non-null  float64
 4   LoanAmount          12000 non-null  float64
 5   ScheduledRepayment  12000 non-null  float64
 6   LoanBalanceStart    12000 non-null  float64
 7   LoanBalanceEnd      12000 non-null  float64
 8   InterestPayment     12000 non-null  float64
dtypes: float64(7), int64(2)
memory usage: 843.9 KB
None
        RepaymentID        LoanID         Month  ActualRepayment  \
count  12000.000000  12000.000000  12000.000000     12000.000000   
mean    6000.499958    500.500000      6.500000      3111.375243   
std     3464.245967    288.687019      3.452196      1733.487004   
min        1.000000     

In [23]:
py.question_1(df_balances)

15.0

In [25]:
py.question_2(df_scheduled,df_balances)

1.2

In [26]:
py.question_3(df_balances)

-229010586008.01517

In [27]:
py.question_4(df_balances)

78365.85352799998