# Learning Stuff

In [1]:
import os
import numpy as np
import pandas as pd

from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import dextra.dna.core as C
import dextra.dna.text as T
import dextra.dna.commons as P

Using local environment.


In [3]:
# Notebook-wide display settings: let pandas show up to 200 characters per
# text column (the complaint messages are long) and apply seaborn's default
# plotting theme.
pd.set_option('display.max_colwidth', 200)
sns.set()

# Handle to the Spark session exposed by the project configuration.
spark = P.config.spark

## Reading Dataset

In [28]:
# Path of the issues parquet dataset under the configured "refined" lake.
DATA_FILE = P.config.lakes.refined + '/issues.parquet'

# Load the dataset through the project's reader; `x` is the frame that the
# following cells aggregate and join against.
x = C.io.stream.read(DATA_FILE)

# Bring only two rows to the driver as a pandas frame for a quick preview.
(x
 .limit(2)
 .toPandas())

Unnamed: 0,complaint_id,consumer_message,customer_name,date_received,disputed,issue,product,resolution,state,sub_issue,sub_product,tags,timely_response,via,zip_code,ingested_at,tags_trusted_labels,tags_split,text_cleaned,committed_at
0,0052ac552c28,Hello My name is {hash} and I have a {hash} through Santander Consumer USA. I got the truck back in {hash} and had some financial hardship when my got sick and had to stop working. The vehicle was...,26d89fd7bd44,2017-01-30,False,Problems when you are unable to pay,Consumer Loan,Closed with explanation,GA,,Vehicle loan,,True,Web,300XX,2020-12-08 21:12:29.188027,False,train,hello my name is {hash} and i have a {hash} through santander consumer usa i got the truck back in {hash} and had some financial hardship when my got sick and had to stop working the vehicle was r...,2020-12-08 21:14:25.362164
1,00a844d52aec,I have written several letters to the creditor ; {hash} and to TransUnion disputing the validity of the information that I found on my credit report. The first dispute letter was sent on {hash} di...,89dbbe6ad0b5,2016-11-11,True,Credit reporting company's investigation,Credit reporting,Closed with explanation,OH,Investigation took too long,,,True,Web,440XX,2020-12-08 21:12:29.188027,False,train,i have written several letters to the creditor {hash} and to transunion disputing the validity of the information that i found on my credit report the first dispute letter was sent on {hash} dispu...,2020-12-08 21:14:25.362164


## Learning Products

In [29]:
# Number of most-frequent products kept for the rest of the notebook.
N_FREQUENT_PRODUCTS = 2

# Rank products by number of complaints and keep the top ones.
# `product` is used as a secondary sort key so that ties on `count` are
# broken deterministically: this frame is lazy and is re-evaluated by every
# later action (the preview below, the join, the counts), and an arbitrary
# tie-break could otherwise select different products on each evaluation.
frequent_products = (
    x.groupBy('product')
     .count()
     .orderBy(F.desc('count'), F.asc('product'))
     .limit(N_FREQUENT_PRODUCTS))

# Small result (at most N_FREQUENT_PRODUCTS rows), safe to collect.
frequent_products.toPandas()

Unnamed: 0,product,count
0,Debt collection,47915
1,Mortgage,36582


In [30]:
# Keep only the complaints whose product is one of the frequent products.
# The aggregate `count` column is dropped first so that it is not carried
# into the joined frame.
frequent_product_names = frequent_products.drop('count')
z = x.join(frequent_product_names, on='product', how='inner')

# Report how many rows survive the filter (each count triggers a Spark job).
print('samples:', x.count())
print('samples of freq. products:', z.count())

samples: 199970
samples of freq. products: 84497


In [31]:
# Preview two rows of the filtered frame; `product` is now the leading
# column because it was the join key.
(z
 .limit(2)
 .toPandas())

Unnamed: 0,product,complaint_id,consumer_message,customer_name,date_received,disputed,issue,resolution,state,sub_issue,sub_product,tags,timely_response,via,zip_code,ingested_at,tags_trusted_labels,tags_split,text_cleaned,committed_at
0,Mortgage,024f01fdf2f6,"After realizing that my rescission rights were still effective due to the fact that my loan has not yet been consummated under the laws of California, I sent Ocwen Loan Servicing , LLC a letter re...",bddd00d386a3,2015-09-09,False,"Loan modification,collection,foreclosure",Closed with explanation,CA,,Conventional adjustable mortgage (ARM),,True,Web,913XX,2020-12-08 21:12:29.188027,False,train,after realizing that my rescission rights were still effective due to the fact that my loan has not yet been consummated under the laws of california i sent ocwen loan servicing llc a letter resci...,2020-12-08 21:14:25.362164
1,Mortgage,0264246c03dd,Bank of America is not reporting correctly on my credit. They included my mortgage in a Bankruptcy when I had paperwork with in my bankruptcy indicating possible re-affirmation. I continued to mak...,77ba487b3ab6,2015-07-27,True,"Loan servicing, payments, escrow account",Closed with non-monetary relief,UT,,Conventional fixed mortgage,,True,Web,840XX,2020-12-08 21:12:29.188027,False,train,bank of america is not reporting correctly on my credit they included my mortgage in a bankruptcy when i had paperwork with in my bankruptcy indicating possible re affirmation i continued to make ...,2020-12-08 21:14:25.362164


In [8]:


# Runs a `Learning` step on the frequent-product rows `z` with the project
# configuration, calls .perform() on it and keeps its `.processed` attribute
# as `y`.
# NOTE(review): `Learning` is not defined or imported in any cell visible in
# this notebook, so on a fresh kernel (Restart & Run All) this line would
# raise NameError -- TODO: import or define it explicitly in the notebook
# (presumably it comes from a dextra.dna module or a removed cell; confirm).
# NOTE(review): the literal `...` passes Python's Ellipsis object as the
# second positional argument -- confirm this is intended and not a
# placeholder for arguments that were elided.
y = Learning(z, ..., config=P.config).perform().processed

In [9]:
# Inspect five processed rows as a pandas frame.
# NOTE(review): the saved output of this cell shows ingested_at 2020-12-03,
# while the source rows displayed earlier show 2020-12-08, and the execution
# counts jump from [31] back to [8]/[9] -- the saved output looks stale;
# re-run the notebook top to bottom before relying on it.
(y
 .limit(5)
 .toPandas())

Unnamed: 0,complaint_id,product,date_received,ingested_at,text,text_words,text_filtered,features
0,fbf77b8d5880,Debt collection,2017-01-16,2020-12-03 20:01:53.662609,{numeric} averst bank {numeric} this is to old to be on my credit repot its {numeric} i want this removed i gave this back to then they sold me a lemon the transmission had went out of this thing ...,"[{numeric}, averst, bank, {numeric}, this, is, to, old, to, be, on, my, credit, repot, its, {numeric}, i, want, this, removed, i, gave, this, back, to, then, they, sold, me, a, lemon, the, transmi...","[{numeric}, averst, bank, {numeric}, old, credit, repot, {numeric}, want, removed, gave, back, sold, lemon, transmission, went, thing, {numeric}, four, months, lost, thousends]","[-0.007452496644956144, -0.0040992580867499455, 0.00812168908966917, 0.0002747858988269317, 0.0019473653883324005, 0.008941182173492218, 0.0031275975607885894, -0.0010154872506973334, -0.009723882..."
1,fbf7e0affa38,Mortgage,2015-09-02,2020-12-03 20:01:53.662609,my loan was sold to nationstar at the time there was a { {numeric} {numeric}} escrow surplus however after one month ns sent a letter stating that an audit revealed i had a shortage of { {numeric}...,"[my, loan, was, sold, to, nationstar, at, the, time, there, was, a, {, {numeric}, {numeric}}, escrow, surplus, however, after, one, month, ns, sent, a, letter, stating, that, an, audit, revealed, ...","[loan, sold, nationstar, time, {, {numeric}, {numeric}}, escrow, surplus, however, one, month, ns, sent, letter, stating, audit, revealed, shortage, {, {numeric}, {numeric}}, pymt, needed, increas...","[-0.008456923689590466, 0.00011022385740442944, -0.0006187655113896436, 0.0028223562162252957, 0.0023272281123575263, 0.004575580528105759, -0.004063060425404791, -0.0011756991590387358, -0.003731..."
2,fbf897be7d3d,Debt collection,2017-05-22,2020-12-03 20:01:53.662609,merchants credit continues to report {numeric} paid medical accounts that were the result of a billing error despite my written letters and being notified in writing by original creditor {numeric}...,"[merchants, credit, continues, to, report, {numeric}, paid, medical, accounts, that, were, the, result, of, a, billing, error, despite, my, written, letters, and, being, notified, in, writing, by,...","[merchants, credit, continues, report, {numeric}, paid, medical, accounts, result, billing, error, despite, written, letters, notified, writing, original, creditor, {numeric}, {numeric}, {numeric}...","[-0.004341403685669464, -0.007890601384324805, 0.007526868744446945, 0.0005406771428840186, -0.0010239364735178625, 0.0032502023453784396, 0.005520653685763184, 0.0013862107344290338, -0.007284823..."
3,fbfb467dbe38,Debt collection,2017-03-08,2020-12-03 20:01:53.662609,i used a template found on the cfpb site to dispute this collection account and they failed to properly validate the account i followed all the steps the cfpb told me to take so i feel that this a...,"[i, used, a, template, found, on, the, cfpb, site, to, dispute, this, collection, account, and, they, failed, to, properly, validate, the, account, i, followed, all, the, steps, the, cfpb, told, m...","[used, template, found, cfpb, site, dispute, collection, account, failed, properly, validate, account, followed, steps, cfpb, told, take, feel, account, deleted, credit, report, since, collection,...","[-0.00565766300257117, -0.0030262599071387997, 0.02302716119031954, -0.0010210261023617239, 0.0032292542314352763, 0.01404081759992839, 0.004508671209473432, -0.005565028710638188, -0.012263808831..."
4,fbfbb891f22f,Debt collection,2016-06-17,2020-12-03 20:01:53.662609,i had a contract with {numeric} i left {numeric} and owed them a balance of { {numeric} {numeric}} they turned it over to {numeric} for collections i agreed to settle this debt for { {numeric} {nu...,"[i, had, a, contract, with, {numeric}, i, left, {numeric}, and, owed, them, a, balance, of, {, {numeric}, {numeric}}, they, turned, it, over, to, {numeric}, for, collections, i, agreed, to, settle...","[contract, {numeric}, left, {numeric}, owed, balance, {, {numeric}, {numeric}}, turned, {numeric}, collections, agreed, settle, debt, {, {numeric}, {numeric}}, due, {phone}, {numeric}, paid, full,...","[-0.007041349749280406, -0.002386878312141302, 0.0183588678441343, 0.0006002551378644837, 0.004596844489177393, 0.01282208995196145, 0.0011230772298793973, -0.005766826645218056, -0.00977447184465..."
