In [1]:
import os
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csr_matrix
from sklearn import preprocessing as pp, cluster as cl, metrics as mt, decomposition as dc, neighbors as nb, pipeline
import humanfriendly as hf
import sentence_transformers as st

os.chdir("C:\\Users\\ndgig\\Repositories\\phrase_foundry")
print(os.getcwd())

%load_ext autoreload
%autoreload 2

from phrase_foundry import PhraseFoundry, ApproxPhraseFoundry

C:\Users\ndgig\Repositories\phrase_foundry


In [2]:
splits = {
    "train": "train_consumer_complaints.csv",
    "test": "test_consumer_complaints.csv",
}
df = pd.read_csv(
    "hf://datasets/Johnade/consumer_complaints_cfpb/" + splits["train"]
).sample(10000, random_state=32)

docs = df["consumer_complaint_narrative"].to_list()

docs[:5]

['The credit bureaus did not fix any dispute or investigation. The automative system continues to fail and not do anything about the problem.',
 "I first noticed that my online university of XXXX account was audited when logging in after 4 years, due to data breaches on me and my kids information. When printing out the loans that I supposedly taken out I went to the bank I was banking with at the time because I knew they never deposited any student loans into my account. When retrieving the old bank STATEMENTS from XXXX XXXX XXXX I became stressed to the max due to the account showing XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX FL then XXXX XXXX XXXX XXXX fL and over 20 check cards being used and many deposits being made and the acct was reopened without my consent after closing it 6 months prior to it being reopened without my consent. Then I noticed the acct showing financial cons data etc removing money from an altered acct I knew nothing about. The withdrawls and deposits and

In [3]:
st_model = st.SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L3-v2", device="cuda")
st_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [4]:
doc_vecs = st_model.encode(docs, show_progress_bar=True, batch_size=256)
doc_vecs.shape

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


(10000, 384)

In [5]:
model = ApproxPhraseFoundry(
    "sentence-transformers/paraphrase-MiniLM-L3-v2",
    n_pca_components=64,
    n_pca_training_samples=int(2e6),
    device="cuda",
    amp=True,
    amp_dtype=torch.bfloat16,
    quantize_embeds=True,
)
model

<phrase_foundry.phrase_foundry.ApproxPhraseFoundry at 0x1a5af105010>

In [6]:
samp_idx, ngrams, ngram_vecs = model.encode_extract(
    docs,
    batch_size=512,
    ngram_range=(4, 6),
)

Encoding:   0%|          | 0/22 [00:00<?, ?it/s]

PCA training complete after 8 batches.
Applying PCA to all n-gram embeddings.


In [24]:
from collections import Counter

def search_ng(
    queries: list[str],
    query_vecs: np.ndarray,
    ngrams: np.ndarray = ngrams,
    ngram_vecs: np.ndarray = ngram_vecs,
    samp_idx: np.ndarray = samp_idx,
    radius: float = 0.5,
    metric: str = "cosine",
):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric, ).fit(ngram_vecs)
    dists, idx = search_index.radius_neighbors(query_vecs, return_distance=True)
    rankings = [np.argsort(d) for d in dists]
    idx = [i[r] for i, r in zip(idx, rankings)]
    return {
        queries[i]: (ngrams[idx[i]], np.unique(samp_idx[idx[i]]))
        for i in range(len(queries))
    }

def search_ng_hit_count(queries, query_vecs, ngram_vecs=ngram_vecs, samp_idx=samp_idx, radius=0.5, metric="cosine"):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric).fit(ngram_vecs)
    idx = search_index.radius_neighbors(query_vecs, return_distance=False)
    return {q: Counter(samp_idx[i]) for i, q in zip(idx, queries)}

def search_ng_doc_hits(queries, query_vecs, ngrams=ngrams, ngram_vecs=ngram_vecs, samp_idx=samp_idx, radius=0.5, metric="cosine"):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric).fit(ngram_vecs)
    idx = search_index.radius_neighbors(query_vecs, return_distance=False)
    hits = {q: pd.Series(ngrams[i]).groupby(samp_idx[i]).apply(list).to_dict() for i, q in zip(idx, queries)}
    return hits

def search_ng_nearest_docs(queries, query_vecs, ngram_vecs=ngram_vecs, samp_idx=samp_idx, radius=0.5, metric="cosine"):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric).fit(ngram_vecs)
    dists, idx = search_index.radius_neighbors(query_vecs, return_distance=True)
    avg_dists = {q: pd.Series(d).groupby(samp_idx[i]).mean().sort_values() for q, d, i in zip(queries, dists, idx)}
    return avg_dists

def search_ng_best_docs(queries, query_vecs, ngram_vecs=ngram_vecs, samp_idx=samp_idx, radius=0.5, metric="cosine"):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric).fit(ngram_vecs)
    dists, idx = search_index.radius_neighbors(query_vecs, return_distance=True)
    sims = [1 - d for d in dists]
    total_sims = {q: pd.Series(s).groupby(samp_idx[i]).sum().sort_values(ascending=False) for q, s, i in zip(queries, sims, idx)}
    return total_sims

def search_st_nearest_docs(queries, query_vecs, doc_vecs=doc_vecs, radius=0.5, metric="cosine"):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric).fit(doc_vecs)
    dists, idx = search_index.radius_neighbors(query_vecs, return_distance=True)
    nearest = {q: pd.Series(d, index=i).sort_values() for q, d, i in zip(queries, dists, idx)}
    return nearest

queries = [
    "damaged credit score"
]
ng_query_vecs = model.encode_queries(queries)
results_ng_nearest = search_ng_nearest_docs(queries, ng_query_vecs, ngram_vecs=ngram_vecs[:int(3e6)], radius=0.4)
results_ng_best = search_ng_best_docs(queries, ng_query_vecs, ngram_vecs=ngram_vecs[:int(3e6)], radius=0.4)
results_ng_nearest, results_ng_best

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

Pooling:   0%|          | 0/1 [00:00<?, ?it/s]

({'damaged credit score': 2600    0.316419
  788     0.317594
  2425    0.326003
  3809    0.334584
  3521    0.335607
            ...   
  2674    0.399733
  4194    0.399774
  2039    0.399777
  3236    0.399838
  2065    0.399972
  Length: 308, dtype: float64},
 {'damaged credit score': 839     20.720142
  1484    13.240956
  2558     9.841272
  4071     9.757520
  788      9.553687
            ...    
  2674     0.600267
  4194     0.600226
  2039     0.600223
  3236     0.600162
  2065     0.600028
  Length: 308, dtype: float64})

In [34]:
docs[2065]

"AES/XXXX is reporting XXXX fraudulent loans on my credit report and have refused to cease the credit reporting after several promises to do so. I have been calling and disputing this information sinceXX/XX/XXXX2017, and for the last five months I have been lied to, harassed for payments, told false information, and basically have been running around in circles trying to get someone to actually take action and do an investigation. In the past I made payments on these loans by sending in one lump sum to the creditor to cover ALL my loans, which included those that I did take out in my name. There are four loans that I DID take out in my name but there are three loans that I DID NOT authorize to be taken out in my name. I am a victim of identity theft. I sent in a police report, XXXX XXXX XXXX XXXX, and proof of identity to AES/XXXX back in XXXX 2017. After multiple calls I did n't receive an XX/XX/XXXX XXXX XXXX from their company for completion until XXXX XXXX, 2017. Five months later,

In [26]:
st_query_vecs = st_model.encode(queries)
results_st = search_st_nearest_docs(queries, st_query_vecs, radius=0.4)
results_st

{'damaged credit score': 2600    0.280519
 6495    0.311360
 8759    0.315148
 2198    0.318799
 5652    0.328548
           ...   
 5662    0.397677
 8416    0.398095
 984     0.398307
 7913    0.399240
 870     0.399771
 Length: 88, dtype: float32}

In [35]:
results_st["damaged credit score"].loc[2065]

KeyError: 2065

In [30]:
results_st[queries[0]].loc[839]

0.37679285

In [29]:
docs[6495]

'Dear Consumer Financial Protection Bureau, I am writing to file a complaint against the credit bureau, as they have failed to upgrade my credit score despite the fact that my bank has confirmed that a mistake was made and assured me that my score would be upgraded within three days. Despite this, my credit score has not been upgraded and I have proof from the bank to back up my claim. \n\nThis issue is causing me a significant financial loss as I am unable to close on certain deals due to my low credit score. I am requesting your assistance in resolving this issue as soon as possible. \n\nThank you for your time and consideration. \n\nsee attached the email that i received from the bank Sincerely, XXXX'

In [7]:
ngram_vecs.shape

(6052104, 64)