### Load libs

In [20]:
import pandas as pd
import json
import numpy as np
import random
from pandarallel import pandarallel
import re
import time

pandarallel.initialize()


INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [21]:
n = 750
pd.set_option('display.max_columns', n)
pd.set_option('display.max_rows', n)
pd.set_option('display.max_colwidth', -1)


  after removing the cwd from sys.path.


### Load data

In [22]:
term = 'AmeriCredit Automobile Receivables Trust 2017-4 Data Tape'
finder = re.compile('\d{4,}\W\d{1,}')
add_id = re.findall(finder, term)[0]
add_id


'2017-4'

In [23]:
# load abs
folder = 'data/combined/'
file = '{}.csv'.format(term)
path = folder + file
data = pd.read_csv(path)
data.shape


  interactivity=interactivity, compiler=compiler, result=result)


(1875238, 73)

In [24]:
# load fields
f_folder = 'data/json/fields/'
f_file = 'fields.json'
f_path = f_folder + f_file
with open(f_path) as f:
    fields = json.load(f)


In [25]:
# load mapper
m_folder = 'data/dictionary/mapper/'
m_file = 'mapper.json'
m_path = m_folder + m_file
with open(m_path) as f:
    mapper = json.load(f)
    

### Setting fields

In [26]:
init_id = fields['init_id'][0]
date_cols = fields['dates']
replacer_cols = fields['replace_dash']
clean_cols = fields['clean']
m_cols = fields['map']
event_cols = fields['event']
loc_cols = fields['all_loc']
numeric_cols = fields['numeric']
all_vals_cols = fields['all_vals']
min_max_cols = fields['min_max']


### ID and dates

In [27]:
def reorder_date(init):
    
    """
    Reorder date
    """
    
    init = str(init)
    if init != '-':
        if '/' not in init:
            y = init[6:10]
            m = init[0:2]
            d = init[3:5]
            date = y + '-' + m + '-' + d
        elif '/' in init:
            y = init[3:7]
            m = init[0:2]
            date = y + '-' + m
    else:
        date = ''
    
    return date


In [28]:
data['ID'] = data[init_id].str.replace('=', '').str.replace('"', '').str.strip() + '-' + add_id


In [29]:
for col in date_cols:
    print(col)
    values = data[col].values
    dates = [reorder_date(v) for v in values]
    data['{}R'.format(col)] = dates
    

reportingPeriodEndingDate
originalFirstPaymentDate
loanMaturityDate
interestPaidThroughDate
reportingPeriodBeginningDate
DemandResolutionDate
originationDate
mostRecentServicingTransferReceivedDate
zeroBalanceEffectiveDate


In [30]:
# s_col = 'loanMaturityDate'
# t_col = '{}R'.format(s_col)
# data[[s_col, t_col]]


### Replacing values

In [31]:
data[replacer_cols] = data[replacer_cols].replace('-', np.nan)


In [32]:
# clean cols
for col in clean_cols:
    data[col] = data[col].str.strip()
    data[col] = data[col].astype(float)
    

### Replacing values

In [37]:
def replace_val(init, column):
    
    """
    Replace numeric values
    """
    
    init = str(init).strip().replace(';', '')
    if init in ['0', '1', '2', '3', '4', '5', '98', '99']:
        mapped = mapper[column][init]
        return mapped
    else:
        if init[0] in ['0', '1', '2', '3', '4', '5']:
            use = init[0]
        elif init == '-':
            use_keys = list(mapper[column].keys())
            if '98' in use_keys:
                use = '98'
            elif '99' in use_keys:
                use = '99'
        else:
            use = init
        mapped = mapper[column][use]
    return mapped
    

In [38]:
for col in m_cols:
    print(col)
    values = data[col].values
    ret_vals = [replace_val(v, col) for v in values]
    data['{}M'.format(col)] = ret_vals
    

zeroBalanceCode
modificationTypeCode
interestCalculationTypeCode
obligorIncomeVerificationLevelCode
vehicleValueSourceCode
servicingAdvanceMethodCode
obligorEmploymentVerificationCode
originalInterestRateTypeCode
subvented
assetSubjectDemandStatusCode
repurchaseReplacementReasonCode
vehicleTypeCode
vehicleNewUsedCode
paymentTypeCode


In [41]:
# s_col = 'subvented'
# t_col = '{}M'.format(s_col)
# data[[s_col, t_col]]

### Account status

In [42]:
def acct_status(row, b_col, e_col, zero_col, thresh):
    
    """
    Create karus account status
    """
    
    b = float(row[b_col])
    e = float(row[e_col])
    z = str(row[zero_col])
    
    if z in ['Charged-off', 'Repurchased or Replaced']:
        res = z
        return res
    if b < thresh and e < thresh:
        res = 'Prepaid or Matured'
        return res
    if z in ['Unavailable', 'Prepaid or Matured']:
        res = z
        return res
    

In [43]:
b_col = 'reportingPeriodBeginningLoanBalanceAmount'
e_col = 'nextReportingPeriodPaymentAmountDue'
z_col = 'zeroBalanceCodeM'
thresh = 50


In [44]:
data['accountStatus'] = data.parallel_apply(acct_status, args = (b_col, e_col, z_col, thresh, ), axis = 1)


In [45]:
data['accountStatus'].value_counts()


Unavailable                1818268
Prepaid or Matured         30236  
Charged-off                26681  
Repurchased or Replaced    53     
Name: accountStatus, dtype: int64

### Numeric conversion

In [46]:
# force convert cols to numeric
for col in numeric_cols:
    print(col)
    data[col] = pd.to_numeric(data[col], errors='coerce')
    

servicingFlatFeeAmount
nextInterestRatePercentage
obligorCreditScore
scheduledPrincipalAmount
otherAssessedUncollectedServicerFeeAmount
actualOtherCollectedAmount
actualPrincipalCollectedAmount
nextReportingPeriodPaymentAmountDue
currentDelinquencyStatus
originalInterestRatePercentage
reportingPeriodBeginningLoanBalanceAmount
remainingTermToMaturityNumber
originalLoanAmount
repurchaseAmount
paymentExtendedNumber
originalLoanTerm
reportingPeriodScheduledPaymentAmount
otherPrincipalAdjustmentAmount
reportingPeriodActualEndBalanceAmount
gracePeriodNumber
actualInterestCollectedAmount
paymentToIncomePercentage
servicingFeePercentage
chargedoffPrincipalAmount
reportingPeriodInterestRatePercentage
servicerAdvancedAmount
totalActualAmountPaid
repossessedProceedsAmount
scheduledInterestAmount
vehicleValueAmount
recoveredAmount


### Application

In [47]:
all_ids = list(data['ID'].unique())
#all_ids = all_ids[:1000]
print_vals = list(range(0, len(all_ids), 100))
len(all_ids)

69343

In [48]:
#_id = '0001694010 - 000010'
#all_ids = ['0001694010 - 000010']
id_col = 'ID'
status_col = 'accountStatus'
values = ['Charged-off', 'Prepaid or Matured', 'Repurchased or Replaced']


In [49]:
#sample = 0
#use_ids = [all_ids[sample]]


In [24]:
holder = []
counter = 0
start = time.time()
try:
    for _id in all_ids:
        #print(_id)
        counter = counter + 1
        if counter in print_vals:
            print(counter, counter/len(all_ids))
            end = time.time()
            print (end - start)
            print('------------------------------')

        df = data[data[id_col] == _id].reset_index(drop = True)
        df = df.sort_values('reportingPeriodBeginningDateR', ascending = False)

        # dict
        account_dict = {}
        account_dict['id'] = _id
        account_dict['records'] = len(df)

        # current status of loan
        for col in loc_cols:
            account_dict['{}LocCurrent'.format(col)] = df[col].iloc[0]
        for col in min_max_cols:
            account_dict['{}MaxCurrent'.format(col)] = df[col].max()
            account_dict['{}MinCurrent'.format(col)] = df[col].min()
        for col in all_vals_cols:
            vals = list(df[col].unique())
            use_vals = ' | '.join(str(val) for val in vals)
            account_dict['{}ValsCurrent'.format(col)] = use_vals
        for col in numeric_cols:
            _sum = df[col].sum()
            account_dict['{}SumCurrent'.format(col)] = _sum
            vec = list(df[col])
            vec = [v for v in vec if str(v) != 'nan']
            if len(vec) > 0:
                _len = len(vec)
                weights = sorted([1 + i for i in list(range(_len))], reverse=True)
                wa = np.average(vec, weights=weights)
                account_dict['{}WeightedCurrent'.format(col)] = wa
            else:
                account_dict['{}WeightedCurrent'.format(col)] = 0

        # event information
        init_vals = list(df[status_col].unique())
        inter = list(set(values).intersection(init_vals))
        if len(inter) > 0:
            account_dict['eventOccurred'] = 1
            n = df[status_col].where(df[status_col].isin(values)).last_valid_index()
            n_bool = True
            single = df.loc[[n]]
            for col in event_cols:
                account_dict['{}Event'.format(col)] = single[col].iloc[0]

            # prior to event
            init = n+1
            sub = df[init:len(df)]
            sub.reset_index(drop = True, inplace = True)
            account_dict['priorHistory'] = len(sub)
            sub_bool = True
            if len(sub) > 0:
                for col in loc_cols:
                    account_dict['{}LocPrior'.format(col)] = sub[col].iloc[0]
                for col in min_max_cols:
                    account_dict['{}MinPrior'.format(col)] = sub[col].min()
                    account_dict['{}MaxPrior'.format(col)] = sub[col].max()
                for col in all_vals_cols:
                    vals = list(sub[col].unique())
                    use_vals = ' | '.join(str(val) for val in vals)
                    account_dict['{}ValsPrior'.format(col)] = use_vals
                for col in numeric_cols:
                    account_dict['{}SumPrior'.format(col)] = sub[col].sum()
                    vec = list(sub[col])
                    vec = [v for v in vec if str(v) != 'nan']
                    if len(vec) > 0:
                        _len = len(vec)
                        weights = sorted([1 + i for i in list(range(_len))], reverse=True)
                        wa = np.average(vec, weights=weights)
                        account_dict['{}WeightedPrior'.format(col)] = wa
                    else:
                        account_dict['{}WeightedPrior'.format(col)] = 0

                # random
                len_sub = len(sub)
                s = random.randint(0, len_sub)
                if s == len_sub:
                    s = s -1
                r_sub = sub[s:len_sub].reset_index(drop = True)
                account_dict['randomIndex'] = s
                for col in loc_cols:
                    account_dict['{}LocRandom'.format(col)] = r_sub[col].iloc[0]
                for col in min_max_cols:
                    account_dict['{}MinRandom'.format(col)] = r_sub[col].min()
                    account_dict['{}MaxRandom'.format(col)] = r_sub[col].max()
                for col in all_vals_cols:
                    vals = list(r_sub[col].unique())
                    use_vals = ' | '.join(str(val) for val in vals)
                    account_dict['{}ValsRandom'.format(col)] = use_vals
                for col in numeric_cols:
                    account_dict['{}SumRandom'.format(col)] = r_sub[col].sum()
                    vec = list(r_sub[col])
                    vec = [v for v in vec if str(v) != 'nan']
                    if len(vec) > 0:
                        _len = len(vec)
                        weights = sorted([1 + i for i in list(range(_len))], reverse=True)
                        wa = np.average(vec, weights=weights)
                        account_dict['{}WeightedRandom'.format(col)] = wa
                    else:
                        account_dict['{}WeightedRandom'.format(col)] = 0

            # if event is first row of sub       
            else:
                for col in loc_cols:
                    account_dict['{}LocPrior'.format(col)] = df[col].iloc[0]
                for col in min_max_cols:
                    account_dict['{}MinPrior'.format(col)] = df[col].min()
                    account_dict['{}MaxPrior'.format(col)] = df[col].max()
                for col in all_vals_cols:
                    vals = list(df[col].unique())
                    use_vals = ' | '.join(str(val) for val in vals)
                    account_dict['{}ValsPrior'.format(col)] = use_vals
                for col in numeric_cols:
                    account_dict['{}SumPrior'.format(col)] = df[col].sum()
                    account_dict['{}WeightedPrior'.format(col)] = df[col].iloc[0]

        # if no event        
        else:
            account_dict['eventOccurred'] = 0
            account_dict['priorHistory'] = len(df)
            sub_bool = False
            n_bool = False
            for col in event_cols:
                account_dict['{}Event'.format(col)] = np.nan
            for col in loc_cols:
                account_dict['{}LocPrior'.format(col)] = np.nan
            for col in min_max_cols:
                account_dict['{}MinPrior'.format(col)] = np.nan
                account_dict['{}MaxPrior'.format(col)] = np.nan
            for col in all_vals_cols:
                account_dict['{}ValsPrior'.format(col)] = np.nan
            for col in numeric_cols:
                account_dict['{}SumPrior'.format(col)] = np.nan
                account_dict['{}WeightedPrior'.format(col)] = np.nan

            # random set to nan
            account_dict['randomIndex'] = np.nan
            for col in loc_cols:
                account_dict['{}LocRandom'.format(col)] = np.nan
            for col in min_max_cols:
                account_dict['{}MinRandom'.format(col)] = np.nan
                account_dict['{}MaxRandom'.format(col)] = np.nan
            for col in all_vals_cols:
                account_dict['{}ValsRandom'.format(col)] = np.nan
            for col in numeric_cols:
                account_dict['{}SumRandom'.format(col)] = np.nan
                account_dict['{}WeightedRandom'.format(col)] = np.nan

        holder.append(account_dict)
    end = time.time()
except:
    print('cant run {}'.format(_id))


100 0.0014421066293641752
14.295034885406494
------------------------------
200 0.0028842132587283504
25.530581951141357
------------------------------
300 0.004326319888092525
36.45538783073425
------------------------------
400 0.005768426517456701
47.28371596336365
------------------------------
500 0.007210533146820876
58.05179214477539
------------------------------
600 0.00865263977618505
68.86745381355286
------------------------------
700 0.010094746405549226
79.53614902496338
------------------------------
800 0.011536853034913402
90.2856240272522
------------------------------
900 0.012978959664277577
101.01146388053894
------------------------------
1000 0.014421066293641751
111.64490604400635
------------------------------
1100 0.015863172923005927
122.27599787712097
------------------------------
1200 0.0173052795523701
132.94104599952698
------------------------------
1300 0.018747386181734278
143.5415689945221
------------------------------
1400 0.020189492811098452
154.

11200 0.16151594248878762
1211.8994088172913
------------------------------
11300 0.1629580491181518
1222.574215888977
------------------------------
11400 0.16440015574751596
1233.3498349189758
------------------------------
11500 0.16584226237688016
1244.0569338798523
------------------------------
11600 0.16728436900624433
1254.8659510612488
------------------------------
11700 0.1687264756356085
1265.5966258049011
------------------------------
11800 0.17016858226497267
1276.335457086563
------------------------------
11900 0.17161068889433684
1287.179321050644
------------------------------
12000 0.17305279552370104
1297.863116979599
------------------------------
12100 0.1744949021530652
1308.5310168266296
------------------------------
12200 0.17593700878242938
1319.1990478038788
------------------------------
12300 0.17737911541179355
1330.0225059986115
------------------------------
12400 0.17882122204115772
1341.8206160068512
------------------------------
12500 0.18026332867

22100 0.3187055650894827
2387.155003786087
------------------------------
22200 0.3201476717188469
2398.3410308361053
------------------------------
22300 0.32158977834821106
2409.690412044525
------------------------------
22400 0.32303188497757523
2420.8310651779175
------------------------------
22500 0.3244739916069394
2431.9594700336456
------------------------------
22600 0.3259160982363036
2443.169937849045
------------------------------
22700 0.32735820486566775
2454.4762239456177
------------------------------
22800 0.3288003114950319
2465.8103148937225
------------------------------
22900 0.33024241812439614
2477.089668035507
------------------------------
23000 0.3316845247537603
2488.1852140426636
------------------------------
23100 0.3331266313831245
2499.701272010803
------------------------------
23200 0.33456873801248865
2510.90762591362
------------------------------
23300 0.3360108446418528
2522.164906978607
------------------------------
23400 0.337452951271217
2533

33100 0.47733729431954197
3699.710457086563
------------------------------
33200 0.47877940094890614
3713.156219959259
------------------------------
33300 0.4802215075782703
3726.5884280204773
------------------------------
33400 0.48166361420763454
3739.5959129333496
------------------------------
33500 0.4831057208369987
3751.983922958374
------------------------------
33600 0.4845478274663629
3764.6063179969788
------------------------------
33700 0.48598993409572705
3777.055892944336
------------------------------
33800 0.4874320407250912
3789.4746408462524
------------------------------
33900 0.4888741473544554
3801.807466983795
------------------------------
34000 0.49031625398381956
3813.9608261585236
------------------------------
34100 0.49175836061318373
3826.5045669078827
------------------------------
34200 0.4932004672425479
3838.8592398166656
------------------------------
34300 0.4946425738719121
3851.1090779304504
------------------------------
34400 0.4960846805012762

44200 0.6374111301789654
5175.796409130096
------------------------------
44300 0.6388532368083296
5189.223845005035
------------------------------
44400 0.6402953434376938
5202.735237836838
------------------------------
44500 0.641737450067058
5216.17409992218
------------------------------
44600 0.6431795566964221
5229.689533948898
------------------------------
44700 0.6446216633257863
5243.0453062057495
------------------------------
44800 0.6460637699551505
5256.433856964111
------------------------------
44900 0.6475058765845146
5269.893727064133
------------------------------
45000 0.6489479832138788
5283.284387111664
------------------------------
45100 0.650390089843243
5296.774693965912
------------------------------
45200 0.6518321964726072
5310.19748711586
------------------------------
45300 0.6532743031019713
5323.69274187088
------------------------------
45400 0.6547164097313355
5337.284775972366
------------------------------
45500 0.6561585163606997
5350.776952981949

55300 0.7974849660383889
7208.161486148834
------------------------------
55400 0.7989270726677531
7221.089684009552
------------------------------
55500 0.8003691792971173
7233.974897861481
------------------------------
55600 0.8018112859264814
7246.8395819664
------------------------------
55700 0.8032533925558456
7259.608600139618
------------------------------
55800 0.8046954991852098
7272.452246904373
------------------------------
55900 0.8061376058145739
7285.228537082672
------------------------------
56000 0.8075797124439381
7298.19615316391
------------------------------
56100 0.8090218190733023
7311.13763999939
------------------------------
56200 0.8104639257026665
7324.059973955154
------------------------------
56300 0.8119060323320306
7336.9851450920105
------------------------------
56400 0.8133481389613948
7349.880887031555
------------------------------
56500 0.814790245590759
7362.695641040802
------------------------------
56600 0.8162323522201231
7375.455540895462

66500 0.9590009085271765
8663.339828014374
------------------------------
66600 0.9604430151565406
8676.140782117844
------------------------------
66700 0.9618851217859048
8688.939778089523
------------------------------
66800 0.9633272284152691
8701.703783988953
------------------------------
66900 0.9647693350446332
8714.52054977417
------------------------------
67000 0.9662114416739974
8727.272186756134
------------------------------
67100 0.9676535483033616
8740.089843034744
------------------------------
67200 0.9690956549327258
8752.883719921112
------------------------------
67300 0.9705377615620899
8765.681957960129
------------------------------
67400 0.9719798681914541
8778.490895032883
------------------------------
67500 0.9734219748208183
8791.489531993866
------------------------------
67600 0.9748640814501824
8804.869840145111
------------------------------
67700 0.9763061880795466
8818.762475013733
------------------------------
67800 0.9777482947089108
8831.561114072

In [25]:
master = pd.DataFrame(holder)


In [26]:
master['accountStatusEvent'].value_counts(dropna = False)


NaN                        29508
Prepaid or Matured         29042
Charged-off                10740
Repurchased or Replaced    53   
Name: accountStatusEvent, dtype: int64

In [27]:
master['securitization'] = term


In [28]:
master.shape

(69343, 701)

### Export

In [29]:
e_folder = 'data/static/'
e_file = '{} static.csv'.format(term)
e_path = e_folder + e_file
e_path


'data/static/AmeriCredit Automobile Receivables Trust 2017-4 Data Tape static.csv'

In [30]:
master.to_csv(e_path, index = False)


In [31]:
print('continue...')

continue...


### End