<a href="https://colab.research.google.com/github/nug1209/PwC_Switzerland_Digital_Intelligence_Virtual_Case_Experience/blob/main/PwC_Task_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this task we deal with merchant loan data. We will create a forecast about how much will the total repayment will be after these loans are finished (after 30 months since the loans started). The formula for counting the repayment is provided. We will also create percentage values for each repayment in comparison with the origination amount.

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from datetime import datetime

In [89]:
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Virtual Experience/PwC_Task_2_Data.csv'

df = pd.read_csv(path, delimiter=';')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [91]:
df = df.rename(columns={'Unnamed: 0':'origination_date', 'Origination Amount':'origination_amount'})
# df.head(3)

In [92]:
df.shape

(20, 22)

In [93]:
to_remove = ['origination_date', 'origination_amount']
value_columns = [i for i in list(df.columns) if i not in to_remove]

df = pd.melt(df, id_vars=['origination_date', 'origination_amount'], value_vars=value_columns)
df = df.rename(columns={'variable':'repayment_date', 'value':'repayment_amount'})

In [94]:
df = df[df['repayment_amount'] != 0.00]
# df

In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 0 to 399
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   origination_date    210 non-null    object 
 1   origination_amount  210 non-null    float64
 2   repayment_date      210 non-null    object 
 3   repayment_amount    210 non-null    float64
dtypes: float64(2), object(2)
memory usage: 8.2+ KB


In [96]:
add_row = {'origination_date':['31.12.2020'], 'origination_amount':[30482978.52], 'repayment_date':['31.01.2021'], 'repayment_amount':[8747661.94]}

In [97]:
df = pd.concat([df, pd.DataFrame(add_row)], ignore_index=True)

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   origination_date    211 non-null    object 
 1   origination_amount  211 non-null    float64
 2   repayment_date      211 non-null    object 
 3   repayment_amount    211 non-null    float64
dtypes: float64(2), object(2)
memory usage: 6.7+ KB


In [99]:
df['origination_date'] = pd.to_datetime(df['origination_date'], format='%d.%m.%Y')
df['repayment_date'] = pd.to_datetime(df['repayment_date'], format='%d.%m.%Y')
# df.info()

In [100]:
df = df.sort_values(by=['origination_date', 'repayment_date'])
df

Unnamed: 0,origination_date,origination_amount,repayment_date,repayment_amount
0,2019-05-31,10018746.17,2019-05-31,1443069.08
1,2019-05-31,10018746.17,2019-06-30,3332200.33
3,2019-05-31,10018746.17,2019-07-31,1328138.75
6,2019-05-31,10018746.17,2019-08-31,928085.74
10,2019-05-31,10018746.17,2019-09-30,736418.27
...,...,...,...,...
207,2020-10-31,27699586.46,2020-12-31,1503544.68
189,2020-11-30,29872889.68,2020-11-30,4383982.78
208,2020-11-30,29872889.68,2020-12-31,8383025.07
209,2020-12-31,30482978.52,2020-12-31,4373830.97


In [None]:
df_copy = df.copy()

In [112]:
df1 = pd.DataFrame(df['origination_date'].unique(), columns=['origination_date'])
df1

Unnamed: 0,origination_date
0,2019-05-31
1,2019-06-30
2,2019-07-31
3,2019-08-31
4,2019-09-30
5,2019-10-31
6,2019-11-30
7,2019-12-31
8,2020-01-31
9,2020-02-29


Get the first two repayments.

In [113]:
two_repayments = df.groupby('origination_date')['repayment_amount'].apply(list).reset_index(name='repayments')['repayments'].apply(lambda x: x[0: 2])
df2 = pd.DataFrame(two_repayments, columns=['repayments'])
df2

Unnamed: 0,repayments
0,"[1443069.08, 3332200.33]"
1,"[1392751.6, 3011884.91]"
2,"[1537650.24, 2953335.55]"
3,"[1617681.94, 4082016.0]"
4,"[1992242.84, 3930445.6]"
5,"[2289453.76, 4682354.31]"
6,"[2162283.09, 4637701.69]"
7,"[2402403.37, 4947764.21]"
8,"[2502066.86, 4696910.48]"
9,"[2833811.35, 6142911.08]"


In [117]:
df_agg = pd.concat([df1, df2], axis=1)
df_agg

Unnamed: 0,origination_date,repayments
0,2019-05-31,"[1443069.08, 3332200.33]"
1,2019-06-30,"[1392751.6, 3011884.91]"
2,2019-07-31,"[1537650.24, 2953335.55]"
3,2019-08-31,"[1617681.94, 4082016.0]"
4,2019-09-30,"[1992242.84, 3930445.6]"
5,2019-10-31,"[2289453.76, 4682354.31]"
6,2019-11-30,"[2162283.09, 4637701.69]"
7,2019-12-31,"[2402403.37, 4947764.21]"
8,2020-01-31,"[2502066.86, 4696910.48]"
9,2020-02-29,"[2833811.35, 6142911.08]"


In [124]:
def expected_repayments(p):
  for i in np.arange(4):

    calc1 = sum(p)

    calc2 = 1 - calc1

    calc3 = (i - 1) / 30

    calc4 = 1 - calc2

    calc5 = 1 + (calc4 * calc3)

    calc6 = p[1] * np.log(calc5)

    pi = max(calc6, 0)

    pi = 99.79 * pi

    p.append(pi)
  
  return p
	

In [138]:
def test(x):
  x.append(1)

df_agg['repayments'].apply(sum, axis=0)

TypeError: ignored