### Resources Used

- https://docs.rapids.ai/api/cudf/stable/10min.html
- https://www.dataquest.io/blog/data-science-portfolio-machine-learning/
- https://docs.dask.org/en/latest/dataframe-best-practices.html
- https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster
- Dataset: https://docs.rapids.ai/datasets/mortgage-data
- https://dask-cuda.readthedocs.io/en/latest/specializations.html
- #!conda create -n rapids-0.16 -c rapidsai -c nvidia -c conda-forge -c defaults rapids=0.16 python=3.7 cudatoolkit=11.0 matplotlib=3.3.3 gcsfs=0.7.1


## Check Environment

In [None]:
%%bash
nvidia-smi
nvcc --version

In [1]:
import numpy as np; print('numpy Version:', np.__version__)
import pandas as pd; print('pandas Version:', pd.__version__)
import xgboost as xgb; print('XGBoost Version:', xgb.__version__)
import cudf; print('cudf Version:', cudf.__version__)
import cuml; print('cudf Version:', cuml.__version__)
import gcsfs; print('gcsfs Version:', gcsfs.__version__)
import time
import dask_cudf; print('dask_cudf Version:', gcsfs.__version__)
import dask; print('dask Version:', gcsfs.__version__)
import dask.dataframe as dask_df
import glob;
import matplotlib; print('matplotlib Version:', matplotlib.__version__)
from dask.diagnostics import ProgressBar
from dask.distributed import Client, progress, wait


numpy Version: 1.19.4
pandas Version: 1.1.5
XGBoost Version: 1.3.0-SNAPSHOT
cudf Version: 0.16.0
cudf Version: 0.16.0
gcsfs Version: 0.7.1
dask_cudf Version: 0.7.1
dask Version: 0.7.1
matplotlib Version: 3.3.2


## Data Types

In [2]:
col_acq = ['LoanID','Channel','SellerName','OrInterestRate','OrUnpaidPrinc','OrLoanTerm',
        'OrDate','FirstPayment','OrLTV','OrCLTV','NumBorrow','DTIRat','CreditScore',
        'FTHomeBuyer','LoanPurpose','PropertyType','NumUnits','OccStatus','PropertyState',
        'Zip','MortInsPerc','ProductType','CoCreditScore','MortInsType','RelMortInd']

col_per = ['LoanID','MonthRep','Servicer','CurrInterestRate','CAUPB','LoanAge','MonthsToMaturity',
          'AdMonthsToMaturity','MaturityDate','MSA','CLDS','ModFlag','ZeroBalCode','ZeroBalDate',
          'LastInstallDate','ForeclosureDate','DispositionDate','PPRC','AssetRecCost','MHRC',
          'ATFHP','NetSaleProceeds','CreditEnhProceeds','RPMWP','OFP','NIBUPB','PFUPB','RMWPF',
          'FPWA','ServicingIndicator']

parse_dates_acq =['OrDate','FirstPayment']
parse_dates_per =['MonthRep','MaturityDate','ZeroBalDate','LastInstallDate','ForeclosureDate','DispositionDate']

dtype_acq={ "LoanID":"int","Channel":"str","SellerName":"str","OrInterestRate":"float","OrUnpaidPrinc":"float","OrLoanTerm":"float","OrDate":"str",
   "FirstPayment":"str","OrLTV":"float","OrCLTV":"float",  "NumBorrow":"float", "DTIRat":"float", "CreditScore":"float", "FTHomeBuyer":"str",
   "LoanPurpose":"str", "PropertyType":"str", "NumUnits":"float", "OccStatus":"str",  "PropertyState":"str",  "Zip":"int", "MortInsPerc":"float",
   "ProductType":"str", "CoCreditScore":"float", "MortInsType":"float", "RelMortInd":"str"}

dtype_per={"LoanID":"int","MonthRep":"str","Servicer":"str", "CurrInterestRate":"float", "CAUPB":"float", "LoanAge":"float","MonthsToMaturity":"float",
   "AdMonthsToMaturity":"float", "MaturityDate":"str", "MSA":"float", "CLDS":"float", "ModFlag":"str", "ZeroBalCode":"float", "ZeroBalDate":"str",
    "LastInstallDate":"str",  "ForeclosureDate":"str", "DispositionDate":"str", "PPRC":"float", "AssetRecCost":"float", "MHRC":"float", "ATFHP":"float",
    "NetSaleProceeds":"float", "CreditEnhProceeds":"float","RPMWP":"float","OFP":"float","NIBUPB":"float", "PFUPB":"float", "RMWPF":"float",
   "FPWA":"str", "ServicingIndicator":"str"
}


## Test on small sample

In [3]:
# sample_acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/Acquisition_2016Q1.txt'
# sample_perf_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/Performance_2016Q1.txt'

# df_acq = pd.read_csv(sample_acq_fnames, sep='|', names=col_acq, index_col=False, dtype=dtype_acq, parse_dates=parse_dates_acq)
# df_per = pd.read_csv(sample_perf_fnames, sep='|', names=col_per, index_col=False, dtype=dtype_per, parse_dates=parse_dates_per)

## Run on full population

### Dask_cuDF

In [4]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
# Create a Dask Cluster with one worker per GPU
num_workers=2
processes=True
cluster = LocalCUDACluster(n_workers=num_workers, processes=processes)
# cluster = LocalCUDACluster()
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:43775  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 179.38 GB


In [5]:
acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/Acquisition_201*'
perf_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/Performance_201*'
!gsutil du -sh 'gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/'
!gsutil du -sh 'gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/'

4.01 GiB     gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq
190.96 GiB   gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf


In [6]:
#%time df_acq = dask_cudf.read_csv(acq_fnames, sep='|', names=col_acq, dtype=dtype_acq, parse_dates=parse_dates_acq)
%time df_per = dask_cudf.read_csv(perf_fnames, sep='|', names=col_per, dtype=dtype_per, parse_dates=parse_dates_per)

CPU times: user 1.15 s, sys: 661 ms, total: 1.81 s
Wall time: 4.11 s


In [None]:
print(type(df_per))
print(df_per.npartitions)
print(df_per.shape)

In [None]:
rows=df_per.shape[0].persist() # start computation in the background
progress(rows)      # watch progress (takes ~3 min)

In [None]:
rows=rows.compute()
cols=df_per.shape[1]
print(rows,',', cols) 

In [None]:
df_per_1loan=df_per.drop_duplicates(subset='LoanID', keep='last', inplace=False).persist()
progress(df_per_1loan)      # watch progress (takes ~3 min)

In [None]:
del df_per_1loan
del rows
# del df_per_desc

In [7]:
df_per_desc=df_per.describe().persist()
progress(df_per_desc)      # watch progress 

VBox()

In [8]:
df_per_desc=df_per_desc.compute()
df_per_desc

Unnamed: 0,ATFHP,AdMonthsToMaturity,AssetRecCost,CAUPB,CLDS,CreditEnhProceeds,CurrInterestRate,LoanAge,LoanID,MHRC,MSA,MonthsToMaturity,NIBUPB,NetSaleProceeds,OFP,PFUPB,PPRC,RMWPF,RPMWP,ZeroBalCode
count,11404.0,568356000.0,9884.0,485373000.0,568980300.0,13006.0,568980300.0,568980300.0,568980300.0,5901.0,568980300.0,568980300.0,6924.0,10722.0,448.0,624314.0,13263.0,329.0,3450.0,5468040.0
mean,1253.420575,263.9612,7169.169378,189863.6,-0.00133115,126106.352694,4.019165,26.29323,47.50422,1915.180191,27585.39,272.5358,5367.8224,4813.221921,83046.175643,1348.223943,5001.977686,10178.437254,44136.579741,1.030975
std,2888.108139,96.23637,8358.342868,114206.6,0.4714148,92911.662259,0.6430933,19.82052,,1839.677489,13917.56,90.56833,25376.995677,5949.186262,87317.360591,8639.325233,4447.388388,16802.095683,50615.718478,0.4692321
min,-16901.599609,0.0,5.0,0.0,-2.0,0.0,1.75,-2.0,-2147483000.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,1.0
25%,297.910004,283.0,3188.434937,129504.4,0.0,81175.927734,3.75,13.0,-1002994000.0,1094.352509,17820.0,296.0,356.035004,1916.119965,42278.132812,0.0,3534.849976,0.0,25076.794922,1.0
50%,1166.715027,334.0,5411.087402,204594.2,0.0,132757.625,4.375,29.0,83443160.0,1851.905029,31100.0,337.0,1234.119995,3676.180176,67659.992188,0.0,5213.122559,0.0,39710.777344,1.0
75%,7691.0,355.0,14549.40332,791838.7,0.0,238664.765625,5.25,53.0,1223547000.0,3830.0,40140.0,355.0,4503.345093,8769.165039,137807.601562,94666.671875,9328.490234,13529.689453,68713.5625,15.0
max,80564.023438,480.0,85684.234375,1403000.0,91.0,942587.375,8.5,131.0,2147484000.0,34452.0,49740.0,483.0,450003.6875,102833.742188,473630.25,238809.25,142121.171875,52994.332031,597175.6875,16.0


In [None]:
print(type(df_per_1loan))
print(df_per_1loan.npartitions)
print(df_per_1loan.shape)

In [None]:
#df_per_1loan=df_per_1loan.compute() 
print(type(df_per_1loan))
#print(df_per_1loan.npartitions)
print(df_per_1loan.shape)

In [None]:
#df_per_1loan.hist(column='LoanAge')
loan_age=df_per_1loan[['LoanAge','LoanID']].groupby('LoanAge',as_index=False).count()
loan_age.plot.line()

In [None]:
print(type(out))
y=out.compute()
print(type(y))
y.head()

In [None]:
y= x.compute()
type(df_per)

In [None]:
type(test)

In [None]:
rows=df_acq.shape[0].compute()
cols=df_acq.shape[1]
print(rows,',', cols) 
rows=df_per.shape[0].compute()
cols=df_per.shape[1]
print(rows,',', cols) 

In [None]:
#df_per.head()
df_per.iloc[:,1:5].describe()
df_per.describe()

In [None]:
df_acq['RelMortInd'].describe()
df_acq['RelMortInd'].unique()
df_per['CLDS'].describe()


In [None]:
#%time print('Describe:',df_per.describe())
#%time print('Describe:',df_per.isna().any())


In [None]:
df_per_1loan=df_per.drop_duplicates(subset='LoanID', keep='last', inplace=False)
print(df_per.shape)
print(df_per_1loan.shape)
df_per_1loan.head()