# **Data Analyst/Scientist Test**

### **import libraries**

In [3]:
import pandas as pd
import numpy as np

### create Dataframes

In [4]:
score = pd.DataFrame({
    'application_id':[123,234,345,456,567,678],
    'zest_score':[0.1,0.2,0.3,0.4,0.3,0.15]})
score

Unnamed: 0,application_id,zest_score
0,123,0.1
1,234,0.2
2,345,0.3
3,456,0.4
4,567,0.3
5,678,0.15


In [5]:
feature = pd.DataFrame(
    {'application_id':['123','234','345','456','567','567','678','789'],
     'DTI':[10,20,30,40,50,50,30,40],
     'tier':['tier1','tier3','tier3','tier2','tier3','tier3',np.nan,'tier3'],
     'state':['PA','HI','CA','AL','CT','CT','NY','CA'],
     'application_date':['2021-12-01','2022-02-23','2022-01-03','2022-01-04','2021-12-31','2021-12-31','2022-02-03','2021-12-18']})
feature

Unnamed: 0,application_id,DTI,tier,state,application_date
0,123,10,tier1,PA,2021-12-01
1,234,20,tier3,HI,2022-02-23
2,345,30,tier3,CA,2022-01-03
3,456,40,tier2,AL,2022-01-04
4,567,50,tier3,CT,2021-12-31
5,567,50,tier3,CT,2021-12-31
6,678,30,,NY,2022-02-03
7,789,40,tier3,CA,2021-12-18


# Q1
### Clean the two dataframes and combine them together (keep the intersection). The joined dataframe is called app.
### Sort the new dataframe app based on tier (from tier3 to tier1). You can ask for the information you need to do the data cleaning.

In [6]:
print(score.info())
print(feature.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   application_id  6 non-null      int64  
 1   zest_score      6 non-null      float64
dtypes: float64(1), int64(1)
memory usage: 224.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   application_id    8 non-null      object
 1   DTI               8 non-null      int64 
 2   tier              7 non-null      object
 3   state             8 non-null      object
 4   application_date  8 non-null      object
dtypes: int64(1), object(4)
memory usage: 448.0+ bytes
None


In [7]:
score['application_id'] = score['application_id'].astype(int)
print(score)
print(score.info())
score

   application_id  zest_score
0             123        0.10
1             234        0.20
2             345        0.30
3             456        0.40
4             567        0.30
5             678        0.15
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   application_id  6 non-null      int32  
 1   zest_score      6 non-null      float64
dtypes: float64(1), int32(1)
memory usage: 200.0 bytes
None


Unnamed: 0,application_id,zest_score
0,123,0.1
1,234,0.2
2,345,0.3
3,456,0.4
4,567,0.3
5,678,0.15


In [8]:
feature2 = feature.fillna('tier3')
feature2
feature2['application_id'] = feature2['application_id'].astype(int)

In [9]:
app = pd.merge(score,
            feature2,
            how='inner',
            on='application_id')
app

Unnamed: 0,application_id,zest_score,DTI,tier,state,application_date
0,123,0.1,10,tier1,PA,2021-12-01
1,234,0.2,20,tier3,HI,2022-02-23
2,345,0.3,30,tier3,CA,2022-01-03
3,456,0.4,40,tier2,AL,2022-01-04
4,567,0.3,50,tier3,CT,2021-12-31
5,567,0.3,50,tier3,CT,2021-12-31
6,678,0.15,30,tier3,NY,2022-02-03


In [10]:
app.sort_values(by=['tier'], ascending=False)

Unnamed: 0,application_id,zest_score,DTI,tier,state,application_date
1,234,0.2,20,tier3,HI,2022-02-23
2,345,0.3,30,tier3,CA,2022-01-03
4,567,0.3,50,tier3,CT,2021-12-31
5,567,0.3,50,tier3,CT,2021-12-31
6,678,0.15,30,tier3,NY,2022-02-03
3,456,0.4,40,tier2,AL,2022-01-04
0,123,0.1,10,tier1,PA,2021-12-01


# Q2
#### An applicant is approved if it satisfies one of the following two criteria:
#### 1. DTI < 30 and zest_score < 0.3
#### 2. tier >= 2 and zest_score <= 0.2.

In [11]:
# method 1
app['approved'] = np.where(
    ((app['DTI']<30) & (app['zest_score'] < 0.3)) | 
    ((app['tier'].isin(['tier2','tier3']) & (app['zest_score'] <= 0.2))), True, False)
app

Unnamed: 0,application_id,zest_score,DTI,tier,state,application_date,approved
0,123,0.1,10,tier1,PA,2021-12-01,True
1,234,0.2,20,tier3,HI,2022-02-23,True
2,345,0.3,30,tier3,CA,2022-01-03,False
3,456,0.4,40,tier2,AL,2022-01-04,False
4,567,0.3,50,tier3,CT,2021-12-31,False
5,567,0.3,50,tier3,CT,2021-12-31,False
6,678,0.15,30,tier3,NY,2022-02-03,True


In [13]:
# method 2

condition_1 = app['DTI']<30
condition_2 = app['zest_score'] < 0.3
condition_3 = app['tier'].isin(['tier2','tier3'])
condition_4 = app['zest_score'] <= 0.2

app['approved'] = np.where(
    (condition_1 & condition_2) | (condition_3 & condition_4), True, False)
app

Unnamed: 0,application_id,zest_score,DTI,tier,state,application_date,approved
0,123,0.1,10,tier1,PA,2021-12-01,True
1,234,0.2,20,tier3,HI,2022-02-23,True
2,345,0.3,30,tier3,CA,2022-01-03,False
3,456,0.4,40,tier2,AL,2022-01-04,False
4,567,0.3,50,tier3,CT,2021-12-31,False
5,567,0.3,50,tier3,CT,2021-12-31,False
6,678,0.15,30,tier3,NY,2022-02-03,True


# Q3
#### Calculate the application numbers, the average approval rate and the unique number of states for each month, save the result in one dataframe called app_month

In [14]:
application_number = sum(app['DTI'])
avg_aprovaL_rate = round(np.mean(app['approved']), 2)
print("application_number = " + str(application_number), 
      "average approval rate= " + str(avg_aprovaL_rate),
     sep='\n')

application_number = 230
average approval rate= 0.43


# Q4
#### The definition of individual applicant's risk is: value of `target` if the applicant is approved, otherwise 0
#### Now suppose we make approval/denial decision purely based on `zest_score` (the lower the score, the better), find the cutoff zest_score such that the average risk is 1/3
#### The cutoff zest_score is defined as: applicants below or equal this score will be approved, otherwise declined.