# Learning Products from Consumer Complaint Text

In [1]:
import os
import numpy as np
import pandas as pd

from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import dextra.dna.core as C
import dextra.dna.text as T
import dextra.dna.commons as P

Using local environment.


In [3]:
# Spark session handle exposed by the shared project configuration.
spark = P.config.spark

# Let pandas show up to 200 characters per cell so the long complaint
# messages stay readable in the rendered previews.
pd.set_option('display.max_colwidth', 200)

# Apply seaborn's default styling to matplotlib figures.
sns.set()

## Reading Dataset

In [4]:
# Path of the issues dataset inside the configured "refined" lake.
DATA_FILE = P.config.lakes.refined + '/issues.parquet'

# Load the dataset through the project's I/O helper; the result is used as a
# Spark DataFrame in the cells below (groupBy / join / count).
x = C.io.stream.read(DATA_FILE)

# Preview the first five rows as a pandas frame.
x.limit(5).toPandas()

Unnamed: 0,complaint_id,consumer_message,customer_name,date_received,disputed,issue,product,resolution,state,sub_issue,sub_product,tags,timely_response,via,zip_code,ingested_at
0,fbf77b8d5880,3463 Averst Bank 2008 this is to old to be on my credit repot. Its 4985. I want this removed. I gave this back to then. They sold me a lemon. The transmission had went out of this thing after a 44...,ecbc39add660,2017-01-16,False,Cont'd attempts collect debt not owed,Debt collection,Closed with explanation,960999033556,Debt was paid,Auto,,True,Web,73123,2020-12-03 20:01:53.662609
1,fbf7e0affa38,"My loan was sold to Nationstar. At the time there was a {$1400.00} escrow surplus. However after one month, NS sent a letter stating that an audit revealed I had a shortage of {$500.00} and my pym...",dd0bf53c5982,2015-09-02,True,"Loan servicing, payments, escrow account",Mortgage,Closed with explanation,577ce5eff01b,,Conventional fixed mortgage,,True,Web,91113,2020-12-03 20:01:53.662609
2,fbf7fda793c5,Dear Sir or Madam : I am writing to file a complaint against USAA following their handling of a dispute of a billing error in the amount of {$1300.00} 0662 on my USAA {phone} account. The amount i...,8c6ba4882fdb,2015-09-09,False,Billing disputes,Credit card,Closed with explanation,86266b7866cc,,,Servicemember,True,Web,10064,2020-12-03 20:01:53.662609
3,fbf80c5bdf5f,"PLEASE FIND ATTACHED LETTER I WROTE TO THE MORTGAGE COMPANY THAT I USED TO BE WITH AND PAY PAYMENTS. THEY ARE REPORTING LATE PAYMENT, I WAS 8556 TO 6951 BACK THEN AND MY PARENTS MADE A MISTAKE AN...",9d59e25a9a81,2017-08-30,,Incorrect information on your report,"Credit reporting, credit repair services, or other personal consumer reports",Closed with explanation,577ce5eff01b,Account status incorrect,Credit reporting,Servicemember,True,Web,91687,2020-12-03 20:01:53.662609
4,fbf897be7d3d,Merchants Credit continues to report 8030 paid medical accounts that were the result of a billing error despite my written letters and being notified in writing by original creditor ( 9160 97...,ba5bdf5ece97,2017-05-22,,Attempts to collect debt not owed,Debt collection,Untimely response,b621dda48fda,Debt was paid,Medical debt,,False,Web,98095,2020-12-03 20:01:53.662609


## Learning Products

In [5]:
# Count complaints per product, rank the products from most to least
# frequent, and keep only the top two.
frequent_products = x.groupBy('product').count().sort(F.col('count').desc()).limit(2)

frequent_products.toPandas()

Unnamed: 0,product,count
0,Debt collection,47915
1,Mortgage,36582


In [6]:
# Restrict the dataset to complaints about the most frequent products by
# inner-joining against the product names only (the count column is not needed).
z = x.join(frequent_products.select('product'), on='product', how='inner')

# Compare the size of the full dataset with the size of the restricted one.
print('samples:', x.count())
print('samples of freq. products:', z.count())

samples: 199970
samples of freq. products: 84497


In [7]:
# Preview the first five rows of the subset restricted to the frequent products.
z.limit(5).toPandas()

Unnamed: 0,product,complaint_id,consumer_message,customer_name,date_received,disputed,issue,resolution,state,sub_issue,sub_product,tags,timely_response,via,zip_code,ingested_at
0,Debt collection,fbf77b8d5880,3463 Averst Bank 2008 this is to old to be on my credit repot. Its 4985. I want this removed. I gave this back to then. They sold me a lemon. The transmission had went out of this thing after a 44...,ecbc39add660,2017-01-16,False,Cont'd attempts collect debt not owed,Closed with explanation,960999033556,Debt was paid,Auto,,True,Web,73123,2020-12-03 20:01:53.662609
1,Mortgage,fbf7e0affa38,"My loan was sold to Nationstar. At the time there was a {$1400.00} escrow surplus. However after one month, NS sent a letter stating that an audit revealed I had a shortage of {$500.00} and my pym...",dd0bf53c5982,2015-09-02,True,"Loan servicing, payments, escrow account",Closed with explanation,577ce5eff01b,,Conventional fixed mortgage,,True,Web,91113,2020-12-03 20:01:53.662609
2,Debt collection,fbf897be7d3d,Merchants Credit continues to report 8030 paid medical accounts that were the result of a billing error despite my written letters and being notified in writing by original creditor ( 9160 97...,ba5bdf5ece97,2017-05-22,,Attempts to collect debt not owed,Untimely response,b621dda48fda,Debt was paid,Medical debt,,False,Web,98095,2020-12-03 20:01:53.662609
3,Debt collection,fbfb467dbe38,I used a template found on the CFPB site to dispute this collection account and they failed to properly validate the account. I followed all the steps the CFPB told me to take so I feel that this ...,4236e72fafcd,2017-03-08,False,Disclosure verification of debt,Closed with explanation,577ce5eff01b,Not given enough info to verify debt,Medical,,True,Web,91667,2020-12-03 20:01:53.662609
4,Debt collection,fbfbb891f22f,I had a contract with 8067. I left 1929 and owed them a balance of {$650.00}. They turned it over to 3956 for collections. I agreed to settle this debt for {$500.00} due on {phone} 2016 which I pa...,1c9936eb6708,2016-06-17,False,Cont'd attempts collect debt not owed,Closed with explanation,b8ff863a5a18,Debt was paid,"Other (i.e. phone, health club, etc.)",Servicemember,True,Web,16049,2020-12-03 20:01:53.662609


In [8]:
class Learning(T.processors.Refining):
    """Fit a word2vec text encoder on a subsample of complaints and encode them.

    The processor cleans the complaint text, draws a seeded random subsample,
    fits and persists a word2vec encoder on that subsample, hands the encoder
    to ``self.ml.learn`` and finally returns the whole (preprocessed) input
    transformed by the fitted encoder.
    """

    # Columns kept next to the cleaned text during preprocessing.
    META = ('complaint_id', 'product', 'date_received', 'ingested_at')

    # Target number of rows used to fit the encoder. Spark's ``sample`` works
    # with a fraction, so the actual subsample size is only approximate.
    TRAINING_SAMPLES = 1000
    # Seed handed to the sampler when drawing the training subsample.
    TRAINING_SEED = 10282012

    @property
    def encoder_weights(self):
        """Path under the configured models lake where the encoder is saved."""
        return os.path.join(self.config.lakes.models, self.fullname().lower(), 'word2vec')

    def call(self, x):
        """Run the full pipeline on ``x`` and return the encoded frame.

        Args:
            x: input frame holding ``consumer_message`` and the ``META`` columns.

        Returns:
            The preprocessed frame transformed by the fitted encoder.
        """
        x = self.preprocess(x)
        t = self.subsample(x)
        e = self.fit_encoder(t)
        # NOTE(review): the fitted encoder itself (not an encoded frame) is
        # passed to ``learn`` -- confirm this is what ``self.ml.learn`` expects.
        self.ml.learn(e, features='features', target='product')

        return e.transform(x)

    def preprocess(self, x):
        """Normalise the complaint text and keep only ``META`` plus ``text``.

        Every run of digits is replaced by the ``{numeric}`` placeholder before
        the project's generic text cleaning is applied.
        """
        text = x.consumer_message
        # NOTE(review): amounts such as ``{$1400.00}`` contain two digit runs
        # and therefore end up as two placeholders (``{ {numeric} {numeric}}``
        # in the encoded output) -- confirm whether that is intended.
        text = T.functions.replace(text, r'\d+', '{numeric}')
        text = T.functions.clean(text)

        x = x.select(*self.META, text.alias('text'))

        return x

    def subsample(self, x):
        """Draw a seeded random subsample of roughly ``TRAINING_SAMPLES`` rows.

        When ``x`` has no more than ``TRAINING_SAMPLES`` rows the whole frame
        is returned. This avoids a division by zero on an empty frame and a
        sampling fraction above 1.0, which Spark rejects when sampling without
        replacement.
        """
        total = x.count()
        if total <= self.TRAINING_SAMPLES:
            return x

        return x.sample(fraction=self.TRAINING_SAMPLES / total, seed=self.TRAINING_SEED)

    def fit_encoder(self, x):
        """Fit the word2vec encoder on ``x``, persist it and return it.

        The encoder reads the ``text`` column and is configured with English
        stop words; the fitted model is saved to ``encoder_weights``.
        """
        model = T.models.word2vec(input_col='text',
                                  stop_words='english')
        model = model.fit(x)
        # NOTE(review): whether saving over an existing path succeeds depends
        # on the model's ``save`` implementation -- confirm before re-running.
        model.save(self.encoder_weights)

        return model

# Run the processor on the frequent-product subset and keep its processed output.
# NOTE(review): the literal `...` (Ellipsis) is passed as the second positional
# argument -- presumably a placeholder for an unused parameter; confirm against
# T.processors.Refining.
y = Learning(z, ..., config=P.config).perform().processed

In [9]:
# Preview the first five encoded rows (cleaned text, tokens, filtered tokens, features).
y.limit(5).toPandas()

Unnamed: 0,complaint_id,product,date_received,ingested_at,text,text_words,text_filtered,features
0,fbf77b8d5880,Debt collection,2017-01-16,2020-12-03 20:01:53.662609,{numeric} averst bank {numeric} this is to old to be on my credit repot its {numeric} i want this removed i gave this back to then they sold me a lemon the transmission had went out of this thing ...,"[{numeric}, averst, bank, {numeric}, this, is, to, old, to, be, on, my, credit, repot, its, {numeric}, i, want, this, removed, i, gave, this, back, to, then, they, sold, me, a, lemon, the, transmi...","[{numeric}, averst, bank, {numeric}, old, credit, repot, {numeric}, want, removed, gave, back, sold, lemon, transmission, went, thing, {numeric}, four, months, lost, thousends]","[-0.007452496644956144, -0.0040992580867499455, 0.00812168908966917, 0.0002747858988269317, 0.0019473653883324005, 0.008941182173492218, 0.0031275975607885894, -0.0010154872506973334, -0.009723882..."
1,fbf7e0affa38,Mortgage,2015-09-02,2020-12-03 20:01:53.662609,my loan was sold to nationstar at the time there was a { {numeric} {numeric}} escrow surplus however after one month ns sent a letter stating that an audit revealed i had a shortage of { {numeric}...,"[my, loan, was, sold, to, nationstar, at, the, time, there, was, a, {, {numeric}, {numeric}}, escrow, surplus, however, after, one, month, ns, sent, a, letter, stating, that, an, audit, revealed, ...","[loan, sold, nationstar, time, {, {numeric}, {numeric}}, escrow, surplus, however, one, month, ns, sent, letter, stating, audit, revealed, shortage, {, {numeric}, {numeric}}, pymt, needed, increas...","[-0.008456923689590466, 0.00011022385740442944, -0.0006187655113896436, 0.0028223562162252957, 0.0023272281123575263, 0.004575580528105759, -0.004063060425404791, -0.0011756991590387358, -0.003731..."
2,fbf897be7d3d,Debt collection,2017-05-22,2020-12-03 20:01:53.662609,merchants credit continues to report {numeric} paid medical accounts that were the result of a billing error despite my written letters and being notified in writing by original creditor {numeric}...,"[merchants, credit, continues, to, report, {numeric}, paid, medical, accounts, that, were, the, result, of, a, billing, error, despite, my, written, letters, and, being, notified, in, writing, by,...","[merchants, credit, continues, report, {numeric}, paid, medical, accounts, result, billing, error, despite, written, letters, notified, writing, original, creditor, {numeric}, {numeric}, {numeric}...","[-0.004341403685669464, -0.007890601384324805, 0.007526868744446945, 0.0005406771428840186, -0.0010239364735178625, 0.0032502023453784396, 0.005520653685763184, 0.0013862107344290338, -0.007284823..."
3,fbfb467dbe38,Debt collection,2017-03-08,2020-12-03 20:01:53.662609,i used a template found on the cfpb site to dispute this collection account and they failed to properly validate the account i followed all the steps the cfpb told me to take so i feel that this a...,"[i, used, a, template, found, on, the, cfpb, site, to, dispute, this, collection, account, and, they, failed, to, properly, validate, the, account, i, followed, all, the, steps, the, cfpb, told, m...","[used, template, found, cfpb, site, dispute, collection, account, failed, properly, validate, account, followed, steps, cfpb, told, take, feel, account, deleted, credit, report, since, collection,...","[-0.00565766300257117, -0.0030262599071387997, 0.02302716119031954, -0.0010210261023617239, 0.0032292542314352763, 0.01404081759992839, 0.004508671209473432, -0.005565028710638188, -0.012263808831..."
4,fbfbb891f22f,Debt collection,2016-06-17,2020-12-03 20:01:53.662609,i had a contract with {numeric} i left {numeric} and owed them a balance of { {numeric} {numeric}} they turned it over to {numeric} for collections i agreed to settle this debt for { {numeric} {nu...,"[i, had, a, contract, with, {numeric}, i, left, {numeric}, and, owed, them, a, balance, of, {, {numeric}, {numeric}}, they, turned, it, over, to, {numeric}, for, collections, i, agreed, to, settle...","[contract, {numeric}, left, {numeric}, owed, balance, {, {numeric}, {numeric}}, turned, {numeric}, collections, agreed, settle, debt, {, {numeric}, {numeric}}, due, {phone}, {numeric}, paid, full,...","[-0.007041349749280406, -0.002386878312141302, 0.0183588678441343, 0.0006002551378644837, 0.004596844489177393, 0.01282208995196145, 0.0011230772298793973, -0.005766826645218056, -0.00977447184465..."
