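# Encoder Trainings

Inspect the encoder training log, show the stats of the most recent training, and run a small encoding example with the latest word2vec model.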

In [1]:
import yaml
from pyspark.sql import types
import pyspark.sql.functions as F
from pyspark.ml import PipelineModel
from pyspark.ml.feature import Normalizer

import dextra.dna.core as C
import dextra.dna.commons as P

Using local environment.


In [2]:
# Bind the `spark` attribute of the shared project config to a short name
# (presumably the active SparkSession -- confirm in dextra.dna.commons).
spark = P.config.spark

In [3]:
# Path of the encoder training log, built from the configured models lake.
PQT = P.config.lakes.models + '/logs/encoder_trainings.parquet'

# Read the log through the project's stream reader and preview at most two rows.
t = C.io.stream.read(PQT)
t.limit(2).toPandas()

Unnamed: 0,model_name,model_weights_path,training_proc,training,trained_at,date_received_stats,committed_at_stats,records
0,word2vec,file:///datalake/models/products/word2vec,dextra.dna.commons.processors.products.Learn,"{'model': {'input_col': 'text_cleaned', 'stop_...",2020-12-20 13:22:14.130229,"(2015-03-19 00:00:00, 2017-11-03 00:00:00, 146...","(2020-12-20 12:51:11.543374, 2020-12-20 12:51:...","[(a1d321eb007c, 2020-12-20 12:51:11.543374), (..."


### Stats from Last Training

In [4]:
# Most recent training record, without the bulky `records` column.
# `first()` yields None on an empty log instead of the opaque IndexError
# that `collect()[0]` would raise, so the failure can be reported clearly.
last_training = (
    t.drop('records')
     .orderBy(F.desc('trained_at'))
     .first())

assert last_training is not None, 'no training records found in the encoder training log'

In [5]:
# Render the last training row as YAML; recursive=True also converts nested Rows to dicts.
print(yaml.dump(last_training.asDict(recursive=True)))

committed_at_stats:
  avg: 1608468671.5433939
  count: 9913
  countDistinct: 1
  max: 2020-12-20 12:51:11.543374
  min: 2020-12-20 12:51:11.543374
date_received_stats:
  avg: 1469004856.8142843
  count: 9913
  countDistinct: 911
  max: 2017-11-03 00:00:00
  min: 2015-03-19 00:00:00
model_name: word2vec
model_weights_path: file:///datalake/models/products/word2vec
trained_at: 2020-12-20 13:22:14.130229
training: '{''model'': {''input_col'': ''text_cleaned'', ''stop_words'': ''english'',
  ''features'': 128}}'
training_proc: dextra.dna.commons.processors.products.Learn



### Encoding Example

In [6]:
# Read the issues dataset from the refined lake; it is sampled and encoded in the cells below.
x = C.io.stream.read(P.config.lakes.refined + '/issues.parquet')

In [7]:
# Latest training record of the word2vec encoder.
# `first()` yields None when the log holds no word2vec entry, which lets the
# assertion below fail with a readable message rather than an IndexError.
word2vec_training = (
    t.where(t.model_name == 'word2vec')
     .orderBy(F.desc('trained_at'))
     .first())

assert word2vec_training is not None, "no 'word2vec' training found in the encoder training log"

In [8]:
# Load the fitted pipeline from the weights path recorded in the latest
# word2vec training entry (PipelineModel is imported in the top import cell).
encoder = PipelineModel.load(word2vec_training.model_weights_path)

In [9]:
# Run the loaded encoder over a five-row sample of the issues.
# NOTE(review): `limit` without an explicit ordering may not return the same
# rows on every run -- confirm whether a stable sample matters here.
s = encoder.transform(x.limit(5))

s.toPandas()

Unnamed: 0,complaint_id,consumer_message,customer_name,date_received,disputed,issue,product,resolution,state,sub_issue,...,via,zip_code,ingested_at,tags_trusted_labels,tags_split,text_cleaned,committed_at,text_cleaned_words,text_cleaned_filtered,features
0,0052ac552c28,Hello My name is {numeric} {numeric} and I hav...,d141ec2a2e9b,2017-01-30,False,Problems when you are unable to pay,Consumer Loan,Closed with explanation,GA,,...,Web,30037,2020-12-20 12:41:50.904213,False,test,hello my name is {numeric} {numeric} and i hav...,2020-12-20 12:51:11.543374,"[hello, my, name, is, {numeric}, {numeric}, an...","[hello, name, {numeric}, {numeric}, {numeric},...","[0.045425932517641085, 0.02799160317434663, 0...."
1,00a844d52aec,I have written several letters to the creditor...,be1642a5a4b5,2016-11-11,True,Credit reporting company's investigation,Credit reporting,Closed with explanation,OH,Investigation took too long,...,Web,44025,2020-12-20 12:41:50.904213,False,train,i have written several letters to the creditor...,2020-12-20 12:51:11.543374,"[i, have, written, several, letters, to, the, ...","[written, several, letters, creditor, {numeric...","[-0.02753447112270954, -0.06790772339386143, -..."
2,00ae385f4af9,This agency is reporting an account on my redi...,65de1a711356,2016-10-27,False,Other,Credit card,Closed with explanation,IL,,...,Web,60432,2020-12-20 12:41:50.904213,False,train,this agency is reporting an account on my redi...,2020-12-20 12:51:11.543374,"[this, agency, is, reporting, an, account, on,...","[agency, reporting, account, redit, profile, b...","[-0.0427374845991532, -0.1365463479111592, -0...."
3,00da7e528dac,In {numeric}/{numeric}/{numeric} or {numeric}/...,c9f3a3e31f7f,2017-08-30,,Fraud or scam,"Money transfer, virtual currency, or money ser...",Untimely response,PA,,...,Web,16060,2020-12-20 12:41:50.904213,False,test,in {numeric} {numeric} {numeric} or {numeric} ...,2020-12-20 12:51:11.543374,"[in, {numeric}, {numeric}, {numeric}, or, {num...","[{numeric}, {numeric}, {numeric}, {numeric}, {...","[0.045079315079736615, 0.010142262126018684, 0..."
4,010554da4e3a,"In my previous complaint Equifax states "" Equi...",c9c5b5b96800,2016-01-06,False,Incorrect information on credit report,Credit reporting,Closed with explanation,CA,Reinserted previously deleted info,...,Web,91365,2020-12-20 12:41:50.904213,False,test,in my previous complaint equifax states equifa...,2020-12-20 12:51:11.543374,"[in, my, previous, complaint, equifax, states,...","[previous, complaint, equifax, states, equifax...","[-0.010242463575883045, -0.05689138625231054, ..."


In [10]:
def normalize(x, input_col='features', output_col='features_norm', p=2.0):
    """Append a norm-scaled copy of a vector column to a DataFrame.

    Args:
        x: Spark DataFrame holding the vector column `input_col`.
        input_col: name of the vector column to scale.
        output_col: name of the column that receives the scaled vectors.
        p: order of the norm used for scaling; 2.0 (Euclidean) is the
            Normalizer default, so omitting it keeps the previous behavior.

    Returns:
        The result of `Normalizer.transform(x)`: `x` with `output_col` added.
    """
    normalizer = Normalizer(inputCol=input_col, outputCol=output_col, p=p)
    return normalizer.transform(x)

In [11]:
# Drop any `features_norm` left by a previous run of this cell before
# normalizing again: the Normalizer refuses to write to an existing output
# column, and `drop` is a no-op when the column is absent, so the cell can be
# re-run safely.
s = normalize(s.drop('features_norm'))

In [12]:
@F.udf(returnType=types.DoubleType())
def dot_udf(x, y):
    """Return the dot product of two vectors as a Python float.

    Both arguments must expose a `.dot` method (presumably Spark ML vectors
    such as the `features_norm` column -- confirm against the encoder output).
    """
    return float(x.dot(y))

In [13]:
# Dot product of the normalized features for every unordered pair of sampled
# complaints; the strict `<` on the ids keeps one row per pair and skips
# self-pairs. Results are rounded to one decimal for display.
(
    s.alias('i')
    .join(s.alias('j'), on=F.col('i.complaint_id') < F.col('j.complaint_id'))
    .select(
        F.col('i.complaint_id').alias('i'),
        F.col('j.complaint_id').alias('j'),
        dot_udf('i.features_norm', 'j.features_norm').alias('dot'),
    )
    .orderBy('i', 'j')
    .toPandas()
    .round(1)
)

Unnamed: 0,i,j,dot
0,0052ac552c28,00a844d52aec,-0.0
1,0052ac552c28,00ae385f4af9,-0.1
2,0052ac552c28,00da7e528dac,0.8
3,0052ac552c28,010554da4e3a,0.0
4,00a844d52aec,00ae385f4af9,0.4
5,00a844d52aec,00da7e528dac,0.0
6,00a844d52aec,010554da4e3a,0.8
7,00ae385f4af9,00da7e528dac,-0.3
8,00ae385f4af9,010554da4e3a,0.6
9,00da7e528dac,010554da4e3a,-0.0
