### Resources Used

- https://docs.rapids.ai/api/cudf/stable/10min.html
- https://www.dataquest.io/blog/data-science-portfolio-machine-learning/
- https://docs.dask.org/en/latest/dataframe-best-practices.html
- https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster
- https://distributed.dask.org/en/latest/memory.html
- Dataset: https://docs.rapids.ai/datasets/mortgage-data
- https://dask-cuda.readthedocs.io/en/latest/specializations.html
- #!conda create -n rapids-0.17 -c rapidsai -c nvidia -c conda-forge -c defaults rapids-blazing=0.17 python=3.7 cudatoolkit=11.0 matplotlib=3.3.3 gcsfs=0.7.1


## Check Environment

In [None]:
%%bash
# Show the GPU/driver status, then the installed CUDA compiler version.
nvidia-smi
nvcc --version

In [None]:
# Import the libraries used in this notebook. Each print reports the version of
# the module imported on the same line, so the environment can be verified.
import numpy as np; print('numpy Version:', np.__version__)
import pandas as pd; print('pandas Version:', pd.__version__)
import xgboost as xgb; print('XGBoost Version:', xgb.__version__)
import cudf; print('cudf Version:', cudf.__version__)
import cuml; print('cuml Version:', cuml.__version__)
import gcsfs; print('gcsfs Version:', gcsfs.__version__)
import time
import dask_cudf; print('dask_cudf Version:', dask_cudf.__version__)
import dask; print('dask Version:', dask.__version__)
import dask.dataframe as dask_df
import glob
import matplotlib; print('matplotlib Version:', matplotlib.__version__)
from dask.diagnostics import ProgressBar
from dask.distributed import Client, progress, wait


## Data Types

In [None]:
# Column labels for the acquisition table, in positional order.
col_acq = [
    'LoanID', 'Channel', 'SellerName', 'OrInterestRate', 'OrUnpaidPrinc',
    'OrLoanTerm', 'OrDate', 'FirstPayment', 'OrLTV', 'OrCLTV',
    'NumBorrow', 'DTIRat', 'CreditScore', 'FTHomeBuyer', 'LoanPurpose',
    'PropertyType', 'NumUnits', 'OccStatus', 'PropertyState', 'Zip',
    'MortInsPerc', 'ProductType', 'CoCreditScore', 'MortInsType', 'RelMortInd',
]

# Column labels for the performance table, in positional order.
col_per = [
    'LoanID', 'MonthRep', 'Servicer', 'CurrInterestRate', 'CAUPB',
    'LoanAge', 'MonthsToMaturity', 'AdMonthsToMaturity', 'MaturityDate', 'MSA',
    'CLDS', 'ModFlag', 'ZeroBalCode', 'ZeroBalDate', 'LastInstallDate',
    'ForeclosureDate', 'DispositionDate', 'PPRC', 'AssetRecCost', 'MHRC',
    'ATFHP', 'NetSaleProceeds', 'CreditEnhProceeds', 'RPMWP', 'OFP',
    'NIBUPB', 'PFUPB', 'RMWPF', 'FPWA', 'ServicingIndicator',
]

# Columns handed to the readers' `parse_dates=` option.
parse_dates_acq = [
    'OrDate',
    'FirstPayment',
]
parse_dates_per = [
    'MonthRep',
    'MaturityDate',
    'ZeroBalDate',
    'LastInstallDate',
    'ForeclosureDate',
    'DispositionDate',
]

# Per-column dtype names for the acquisition table (same key order as col_acq).
dtype_acq = {
    'LoanID': 'int',
    'Channel': 'str',
    'SellerName': 'str',
    'OrInterestRate': 'float',
    'OrUnpaidPrinc': 'float',
    'OrLoanTerm': 'float',
    'OrDate': 'str',
    'FirstPayment': 'str',
    'OrLTV': 'float',
    'OrCLTV': 'float',
    'NumBorrow': 'float',
    'DTIRat': 'float',
    'CreditScore': 'float',
    'FTHomeBuyer': 'str',
    'LoanPurpose': 'str',
    'PropertyType': 'str',
    'NumUnits': 'float',
    'OccStatus': 'str',
    'PropertyState': 'str',
    'Zip': 'int',
    'MortInsPerc': 'float',
    'ProductType': 'str',
    'CoCreditScore': 'float',
    'MortInsType': 'float',
    'RelMortInd': 'str',
}

# Per-column dtype names for the performance table (same key order as col_per).
dtype_per = {
    'LoanID': 'int',
    'MonthRep': 'str',
    'Servicer': 'str',
    'CurrInterestRate': 'float',
    'CAUPB': 'float',
    'LoanAge': 'float',
    'MonthsToMaturity': 'float',
    'AdMonthsToMaturity': 'float',
    'MaturityDate': 'str',
    'MSA': 'float',
    'CLDS': 'float',
    'ModFlag': 'str',
    'ZeroBalCode': 'float',
    'ZeroBalDate': 'str',
    'LastInstallDate': 'str',
    'ForeclosureDate': 'str',
    'DispositionDate': 'str',
    'PPRC': 'float',
    'AssetRecCost': 'float',
    'MHRC': 'float',
    'ATFHP': 'float',
    'NetSaleProceeds': 'float',
    'CreditEnhProceeds': 'float',
    'RPMWP': 'float',
    'OFP': 'float',
    'NIBUPB': 'float',
    'PFUPB': 'float',
    'RMWPF': 'float',
    'FPWA': 'str',
    'ServicingIndicator': 'str',
}


## Test on small sample

In [None]:
# sample_acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/Acquisition_2016Q1.txt'
# sample_perf_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/Performance_2016Q1.txt'

# df_acq = pd.read_csv(sample_acq_fnames, sep='|', names=col_acq, index_col=False, dtype=dtype_acq, parse_dates=parse_dates_acq)
# df_per = pd.read_csv(sample_perf_fnames, sep='|', names=col_per, index_col=False, dtype=dtype_per, parse_dates=parse_dates_per)

## Run on full population

### Dask_cuDF

In [None]:
# Start a local GPU-backed Dask cluster and connect a client to it.
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Default construction is used here. To set the worker count explicitly instead:
#   cluster = LocalCUDACluster(n_workers=2, processes=True)
cluster = LocalCUDACluster()
client = Client(cluster)
client  # last expression: rendered as the cell output

In [None]:
# Restart the cluster's workers, then display the client again.
client.restart()
client

In [None]:
# Glob patterns for the pipe-delimited CSV extracts and for the Parquet copy of
# the performance data.
# NOTE(review): the acquisition glob matches 'Acquisition_201*' while the
# performance glob is limited to 'Performance_2016*' — confirm the differing
# ranges are intentional.
csv_acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/Acquisition_201*'
csv_perf_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/Performance_2016*'
parquet_perf_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/parquet/*'

# Report the total size of each bucket prefix (-s summary, -h human-readable).
!gsutil du -sh 'gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/'
!gsutil du -sh 'gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/'
!gsutil du -sh 'gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/parquet'

In [None]:
#%time df_acq = dask_cudf.read_csv(acq_fnames, sep='|', names=col_acq, dtype=dtype_acq, parse_dates=parse_dates_acq)
%time df_per = dask_cudf.read_csv(csv_perf_fnames, sep='|', names=col_per, dtype=dtype_per, parse_dates=parse_dates_per)
%time pdf_per = dask_cudf.read_parquet(parquet_perf_fnames, sep='|', names=col_per, dtype=dtype_per, parse_dates=parse_dates_per)

In [None]:
# Start loading the CSV-backed frame on the cluster and show a progress bar.
csv_test=df_per.persist()
progress(csv_test)

In [None]:
# Partition count of the Parquet-backed frame (repartitioning is left disabled).
print(pdf_per.npartitions)
#pdf_per=pdf_per.repartition(npartitions=100)
#print(pdf_per.npartitions)


In [None]:
# Start loading the Parquet-backed frame on the cluster and show a progress bar.
parquet_test=pdf_per.persist()
progress(parquet_test)

In [None]:
# Destination prefix for the Parquet output (the same directory that
# parquet_perf_fnames globs).
parquet_out='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/parquet/'

In [None]:
# Write the CSV-backed performance frame out as Parquet. The source is df_per
# rather than pdf_per: pdf_per is read from this same directory, so writing it
# here would rewrite files it is still reading from.
df_per.to_parquet(parquet_out,write_index=False)

In [None]:
# Inspect the CSV-backed frame: object type, partition count and shape.
# The row count in shape is lazy here; it is persisted in the next cell.
print(type(df_per))
print(df_per.npartitions)
print(df_per.shape)

In [None]:
rows=df_per.shape[0].persist() # start computation in the background
progress(rows)      # watch progress (takes ~3 min)

In [None]:
# The column count is available immediately; the row count print stays disabled.
#rows=rows.compute()
cols=df_per.shape[1]
#print(rows,',', cols) 

In [None]:
# Sum the per-column memory usage on the cluster and track progress.
df_memory=df_per.memory_usage().sum().persist()
progress(df_memory)

In [None]:
# Fetch the summed memory usage and scale it by 1024**3
# (presumably bytes -> GiB; confirm the unit of memory_usage()).
print("Required Memory:", df_memory.compute() / 1024**3)

In [None]:
# Drop the client-side reference to the persisted Parquet frame.
# NOTE(review): the two cells below still use parquet_test, so running the
# notebook top-to-bottom raises NameError there — confirm the intended order.
del parquet_test

In [None]:
# Collect the whole frame on the client first, then summarize it locally.
# NOTE(review): this presumably requires the full dataset to fit in the
# client's memory; the next cell summarizes before collecting instead.
parquet_test.compute().describe()

In [None]:
# Build the summary lazily, then collect only the summary result.
parquet_test.describe().compute()

In [None]:
# Count rows per LoanAge value (after count(), the 'LoanID' column holds the
# count), start the computation on the cluster, and track progress.
loan_age=df_per[['LoanAge','LoanID']].groupby('LoanAge',as_index=False).count().persist()
progress(loan_age)

In [None]:
# Show the type of the persisted result.
type(loan_age)

In [None]:
# Bring the per-age counts to the client as a pandas frame, ordered by loan age.
loan_age_df=loan_age.compute().to_pandas().sort_values(by=['LoanAge'])
# Plot the row count (held in 'LoanID' after count()) against LoanAge. Without
# explicit axes, both columns would be drawn against the frame's index, which
# is no longer in order after the sort.
loan_age_df.plot.line(x='LoanAge', y='LoanID')

In [None]:
# Keep one row per LoanID (keep='last') and persist the result on the cluster.
# The disabled variant below would collect the full frame on the client first.
#df_per_1loan=df_per.compute().drop_duplicates(subset='LoanID', keep='last', inplace=False)
df_per_1loan=df_per.drop_duplicates(subset='LoanID', keep='last', inplace=False).persist()
progress(df_per_1loan)

In [None]:
# Collect the de-duplicated frame as pandas, order by LoanAge, and plot it.
# NOTE(review): plot.line() is called without x/y, so it is not limited to
# LoanAge — confirm which columns are meant to be plotted.
dup_loan_age_df=df_per_1loan.compute().to_pandas().sort_values(by=['LoanAge'])
dup_loan_age_df.plot.line()

In [None]:
# Collect and display the de-duplicated frame.
df_per_1loan.compute()


In [None]:
#df_per_1loan=df_per_1loan.compute()

In [None]:
# Start the summary-statistics computation on the cluster. describe() on its
# own only builds a lazy result; persist() submits the work so that progress()
# has running tasks to track, matching the other progress() cells.
df_per_desc=df_per.describe().persist()
progress(df_per_desc)      # watch progress

In [None]:
# Collect the summary statistics to the client and display them.
df_per_desc=df_per_desc.compute()
df_per_desc

In [None]:
# Inspect the de-duplicated frame: object type, partition count and shape.
print(type(df_per_1loan))
print(df_per_1loan.npartitions)
print(df_per_1loan.shape)

In [None]:
# Same inspection with the collect step and the partition count disabled.
#df_per_1loan=df_per_1loan.compute() 
print(type(df_per_1loan))
#print(df_per_1loan.npartitions)
print(df_per_1loan.shape)

In [None]:
#df_per_1loan.hist(column='LoanAge')
# Rebind loan_age to the per-LoanAge row count of the de-duplicated frame.
# No persist()/compute() is called here, so the result stays lazy.
loan_age=df_per_1loan[['LoanAge','LoanID']].groupby('LoanAge',as_index=False).count()


In [None]:
type(df_per_1loan)
#type(loan_age)

In [None]:
# NOTE(review): loan_age_df is not reassigned after loan_age is rebound above,
# so it presumably still holds the counts from the full frame — confirm.
type(loan_age_df)

In [None]:
# Plot the row count (held in 'LoanID' after count()) against LoanAge. Without
# explicit axes, both columns would be drawn against the frame's index.
loan_age_df.plot.line(x='LoanAge', y='LoanID')

In [None]:
# NOTE(review): `out` is not assigned in any cell visible here — confirm
# where it is defined before running this cell.
print(type(out))
y=out.compute()
print(type(y))
y.head()

In [None]:
# NOTE(review): `x` is not assigned in any cell visible here.
y= x.compute()
type(df_per)

In [None]:
# NOTE(review): `test` is not assigned in any cell visible here.
type(test)

In [None]:
# Print "rows , cols" for the acquisition frame and then the performance frame.
# The row count is computed on the cluster; the column count is read directly.
rows=df_acq.shape[0].compute()
cols=df_acq.shape[1]
print(rows,',', cols) 
rows=df_per.shape[0].compute()
cols=df_per.shape[1]
print(rows,',', cols) 

In [None]:
#df_per.head()
df_per.iloc[:,1:5].describe()
df_per.describe()

In [None]:
df_acq['RelMortInd'].describe()
df_acq['RelMortInd'].unique()
df_per['CLDS'].describe()


In [None]:
#%time print('Describe:',df_per.describe())
#%time print('Describe:',df_per.isna().any())


In [None]:
# Rebuild the one-row-per-LoanID frame (not persisted this time), print the
# shapes of the full and de-duplicated frames, and preview the first rows.
df_per_1loan=df_per.drop_duplicates(subset='LoanID', keep='last', inplace=False)
print(df_per.shape)
print(df_per_1loan.shape)
df_per_1loan.head()

## Data Quality Check against Summary Statistics 
- Data Dictionary: https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Glossary.pdf
- Summary Statistics: https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Stat_Summary_Primary.pdf

## Data Profiling & Exploration
- MonthRep --> Monthly Reporting Period MMYYYY
- Channel ---> R= Retail, C=Correspondent, B=Broker 
- Servicer
- CLDS = Current Loan Delinquency Status 
- PropertyState
- ForeclosureDate

In [None]:
# TODO: active loans by year-month

In [None]:
# TODO: by year-month: # of defaults / active loans = default rate