In [1]:
import os
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csr_matrix
from sklearn import preprocessing as pp, cluster as cl, metrics as mt, decomposition as dc, neighbors as nb, pipeline
import humanfriendly as hf
import cProfile

os.chdir("C:\\Users\\ndgig\\Repositories\\finephrase")
print(os.getcwd())

%load_ext autoreload
%autoreload 2

from FinePhrase import FinePhrase, FinePhrasePCA

C:\Users\ndgig\Repositories\phrase_foundry


In [2]:
splits = {
    "train": "train_consumer_complaints.csv",
    "test": "test_consumer_complaints.csv",
}
df = pd.read_csv(
    "hf://datasets/Johnade/consumer_complaints_cfpb/" + splits["train"]
).sample(100000, random_state=32)

docs = df["consumer_complaint_narrative"].to_list()

docs[:5]

['The credit bureaus did not fix any dispute or investigation. The automative system continues to fail and not do anything about the problem.',
 "I first noticed that my online university of XXXX account was audited when logging in after 4 years, due to data breaches on me and my kids information. When printing out the loans that I supposedly taken out I went to the bank I was banking with at the time because I knew they never deposited any student loans into my account. When retrieving the old bank STATEMENTS from XXXX XXXX XXXX I became stressed to the max due to the account showing XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX FL then XXXX XXXX XXXX XXXX fL and over 20 check cards being used and many deposits being made and the acct was reopened without my consent after closing it 6 months prior to it being reopened without my consent. Then I noticed the acct showing financial cons data etc removing money from an altered acct I knew nothing about. The withdrawls and deposits and

In [3]:
if "model" in locals():
    del model
model = FinePhrasePCA(
    # "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/paraphrase-MiniLM-L3-v2",
    n_pca_components=64,
    n_pca_training_samples=int(8e6),
    device="cuda",
    amp=True,
    amp_dtype=torch.bfloat16,
    quantize_embeds=True,
)
model

<phrase_foundry.phrase_foundry.FinePhrasePCA at 0x23853623710>

In [4]:
samp_idx, ngrams, ngram_vecs = model.encode_extract(
    docs,
    batch_size=512,
    ngram_range=(4, 6),
)

ngram_vecs

Encoding:   0%|          | 0/220 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


PCA training complete after 29 batches.
Applying PCA to all n-gram embeddings.


array([[-0.7627  ,  0.3835  , -0.1067  , ...,  0.293   , -0.5923  ,
         0.1082  ],
       [-0.897   , -0.1803  , -0.1865  , ...,  0.2622  , -0.4448  ,
        -0.06433 ],
       [-0.8496  ,  0.4868  , -0.8403  , ...,  0.2515  , -0.1067  ,
        -0.1312  ],
       ...,
       [-0.2698  ,  0.283   , -1.088   , ...,  0.02473 ,  0.2585  ,
        -0.003689],
       [-0.3718  , -0.535   , -0.3804  , ...,  0.2133  , -0.3616  ,
        -0.1526  ],
       [ 0.4812  , -1.089   , -0.202   , ...,  0.245   ,  0.00561 ,
         0.1714  ]], dtype=float16)

In [5]:
ngram_vecs.shape

(60690119, 64)

In [6]:
model.pca_.n_samples_seen_

8026291

In [12]:
hf.format_size(ngram_vecs.nbytes)

'7.77 GB'

In [7]:
docs[1], ngrams[samp_idx == 1].tolist()

("I first noticed that my online university of XXXX account was audited when logging in after 4 years, due to data breaches on me and my kids information. When printing out the loans that I supposedly taken out I went to the bank I was banking with at the time because I knew they never deposited any student loans into my account. When retrieving the old bank STATEMENTS from XXXX XXXX XXXX I became stressed to the max due to the account showing XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX FL then XXXX XXXX XXXX XXXX fL and over 20 check cards being used and many deposits being made and the acct was reopened without my consent after closing it 6 months prior to it being reopened without my consent. Then I noticed the acct showing financial cons data etc removing money from an altered acct I knew nothing about. The withdrawls and deposits and counter credits being made in different states and cities when i could never be in 2 states at the same time. When checking the emails attached

In [8]:
hf.format_size(ngram_vecs.nbytes)

'7.77 GB'

In [19]:
import faiss

index = faiss.IndexHNSWFlat(ngram_vecs.shape[1], 32, faiss.METRIC_INNER_PRODUCT)
index.add(pp.normalize(ngram_vecs[:int(1e6)]))
index

<faiss.swigfaiss.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x00000239E5F2A040> >

In [20]:
queries = [
    "close my account",
    "credit card fraud",
    "debt collection",
    "identity theft",
    "mortgage fraud",
    "overdraft fees",
    "payday loan",
    "student loan",
    "unauthorized transaction",
    "vehicle loan",
]
query_vecs = model.encode_queries(queries)

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

Pooling:   0%|          | 0/1 [00:00<?, ?it/s]

In [48]:
radius = 0.3
index.hnsw.efSearch = 200  # Higher values improve recall but slow down queries
results = index.range_search(pp.normalize(query_vecs), radius)
results[0]

array([    0,  3399,  6297,  8756, 10494, 13442, 16155, 19976, 22345,
       25155, 28668], dtype=uint64)

In [21]:
# efSearch controls the tradeoff between speed and accuracy during search
dists, idx = index.search(pp.normalize(query_vecs), 25)
for i in range(len(idx)):
    display(queries[i])
    display(ngrams[idx[i]])
    print()

'close my account'

array(['to close my account', 'i closed my account',
       'i closed my account with', 'closed my account with',
       'close my account by', 'i closed my account with us',
       'closed my account with us', '2019 to close my account',
       'close my account and', 'bank closed my account',
       's bank closed my account', 'to close my account',
       'close my account and', 'i closed my account on',
       'closed my account with us bank',
       'bank closed my account because it',
       'closed my account because it', 'i opened an account',
       'i closed my account', '/ 2019 to close my account',
       'my account with us', 'my account this account',
       'closing the account i', 'i closed my account on good',
       'to close my account by'], dtype='<U66')




'credit card fraud'

array(['of credit card fraud', 'credit card fraud and',
       'of credit card fraud and', 'victim of credit card fraud',
       'credit card fraud and identity',
       'fraud department of my credit card',
       'a victim of credit card fraud',
       'of credit card fraud and identity',
       'credit card fraud and identity theft',
       'credit card through credit', 'victim of credit card fraud and',
       'victim of credit card', "' s credit card",
       'a credit card through credit', 'credit card number as card',
       'my credit card and', 'my credit card with', 'of my credit card',
       'for a credit card debt', 'credit card company,',
       'loan or credit card', 'fraudulent accounts from my credit',
       'fraudulent accounts from my credit',
       'fraudulent accounts from my credit', 's credit card for'],
      dtype='<U66')




'debt collection'

array(['. ic systems debt collection', 'a collection debt is',
       'ic systems debt collection', 'debt collection company for',
       'collection of a debt', 'ic systems debt collection company for',
       'debt collection agency to', '}. ic systems debt collection',
       'systems debt collection company for', 'debt collector asked for',
       '. ic systems debt collection company', 'a debt collection was',
       'regarding two debt collections', 'debt collection agency by the',
       'different debt collection agency by the',
       'for collection of a debt', 'for collection the debt',
       'debt and the collection', 'two debt collections that',
       'ic systems debt collection company', 'debt collection agency by',
       'debt collections that are',
       'a different debt collection agency by',
       'systems debt collection company',
       'different debt collection agency by'], dtype='<U66')




'identity theft'

array(['victim of identity theft', 'victim of identity theft',
       'victim of identity theft', 'a victim of identity theft',
       'a victim of identity theft', 'of identity theft.',
       'a victim of identity theft', 'victim of identity theft',
       'am a victim of identity theft', 'am a victim of identity theft',
       'a victim of identity theft', 'victim of identity theft.',
       'victim of identity theft', 'a victim of identity theft',
       'a victim of identity theft.', 'a victim of identity theft.',
       'm a victim of identity theft', 'been a victim of identity theft',
       'identity theft. someone', 'a victim of identity theft',
       'a victim of identity theft.', 'victim of identity theft',
       'of identity theft. someone', 'of identity theft.',
       'identity theft. the'], dtype='<U66')




'mortgage fraud'

array(['my mortgage loan since 2005', 'mortgage loan since 2005',
       'fraudulent inquiries on my credit', 'my mortgage loan since',
       'mortgage loan since 2005 on my', 'fraud occurs. credit fraud',
       'fraudulent inquiries on my credit report',
       'unauthorized and fraudulent inquiries',
       'my mortgage loan since 2005 on', 'was denied an mortgage loan',
       'and fraudulent inquiries on my credit', 'denied an mortgage loan',
       'fraudulent accounts from my credit report',
       'fraudulent accounts from my credit report',
       'fraudulent accounts from my credit report',
       'fraudulent accounts from my credit',
       'fraudulent accounts from my credit',
       'fraudulent accounts from my credit',
       'fraud department : remove fraudulent accounts',
       'fraud department : remove fraudulent accounts',
       'fraud department : remove fraudulent accounts',
       'has my mortgage loan since 2005',
       'fraudulent information from my credit'




'overdraft fees'

array(['5 overdraft fees for', '5 overdraft fees',
       'fees charged 5 overdraft', 'overdraft fees for',
       'charged 5 overdraft fees', 'overdraft fees for 3',
       'overdraft fees charged 5', 'overdraft fees',
       'charged 5 overdraft', 'the overdraft fees', 'overdraft fees',
       'fees charged 5 overdra', '3 overdraft fees',
       '3 overdraft fees charged', '5 overdraft', 'overdraft fees',
       'overdraft fees charged', 'charged 3 overdraft fees',
       'cover the overdraft fees', '2 overdraft fees',
       '- charged 3 overdraft', 'fees charged 5 over',
       'with an overdraft fee', 'overdraft fees xx /',
       'discount for paying 24'], dtype='<U66')




'payday loan'

array(['scheduled payment for my student loans',
       'if your monthly payment', 'repaying the loan and money',
       'payments for when the loans', ', if your monthly payment',
       'payment for my student loans', 'discount for paying 24',
       'my student loan pay', 'afford those loan payments and',
       'loan to pay for tuition', 'tax payer dollars, since',
       'payments on my student loans with', 'loan prior to payments',
       'signed loans to pay', 'if your monthly payment is',
       'funded by tax pay', 'loan and moneylion',
       'payments for my student loan', 'by tax payer dollars',
       'signed loans to pay for', 'payments to my loan',
       'funded by tax payer dollars', 'by tax payer dollars,',
       'in repaying the loan', 'loan payments and try to'], dtype='<U66')




'student loan'

array(['small student loans (', 'dollars for " student loan',
       'for " student loan', 'for " student loan repay',
       'a student loan from', 'federal student loans have',
       'my student loans with', 'dollars for " student loan repay',
       'for my student loans', 'for " student loan',
       'education services - student loan i', 'a student loan with',
       '" student loan repay', 'private student loans with',
       'student loans with the', 'student loans with a',
       'services - student loan i have', 'services - student loan',
       'student loans my student loans are', 'a student loan through',
       'for " student loan repay', 'student loan i have',
       'from my student loan', 'for my student loans',
       'my student loans with'], dtype='<U66')




'unauthorized transaction'

array(['several unauthorized transactions -',
       'made several unauthorized transactions -',
       'security breach resulting in unauthorized',
       'unauthorized transactions - a transfer for',
       'fraudulent transaction / opened a fraudulent',
       'theft, unauthorized use',
       'unauthorized release of private consumer',
       'unauthorized transactions - a transfer',
       'made several unauthorized transactions',
       'security breach resulting in unauthorized release',
       'fraudulent activity and unauthorized',
       'thief [ made a fraudulent transaction',
       'fraudulent activity and unauthorized use',
       'made several unauthorized transactions - a',
       'a fraudulent transaction / opened',
       'unauthorized transactions - a',
       'fraudulent transaction / opened a',
       'withdrew funds from an unauthorized',
       'funds from an unauthorized', 'fraud transactions i had',
       'unauthorized release of private consumer information',




'vehicle loan'

array(['purchased a vehicle using this lend', 'qualify for a home loan',
       'for a home loan', 'application for a home loan',
       'vehicle using this lend', 'a vehicle using this lend',
       'student loan carrier (', 'qualify for a home loan because',
       'small student loans (', 'a vehicle using this lender',
       'statements for the loan', 'loan with ally financial. a',
       'loan with ally financial.', 'vehicle using this lender,',
       'a student loan from', 'afford those loan payments and',
       'loan with ally financial', 'auto loan with ally financial.',
       'to qualify for a home loan', 'auto loan with ally',
       'for a home loan because', 'vehicle using this lender',
       'afford those loan payments and try', 'carmax loan and',
       'my student loan carrier ('], dtype='<U66')




In [85]:
from collections import Counter

def search(
    queries: list[str],
    query_vecs: np.ndarray,
    ngrams: np.ndarray = ngrams,
    ngram_vecs: np.ndarray = ngram_vecs,
    samp_idx: np.ndarray = samp_idx,
    radius: float = 0.5,
    metric: str = "cosine",
):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric, ).fit(ngram_vecs)
    dists, idx = search_index.radius_neighbors(query_vecs, return_distance=True)
    rankings = [np.argsort(d) for d in dists]
    idx = [i[r] for i, r in zip(idx, rankings)]
    return {
        queries[i]: (ngrams[idx[i]], np.unique(samp_idx[idx[i]]))
        for i in range(len(queries))
    }

def search_hit_count(queries, query_vecs, ngram_vecs=ngram_vecs, samp_idx=samp_idx, radius=0.5, metric="cosine"):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric).fit(ngram_vecs)
    idx = search_index.radius_neighbors(query_vecs, return_distance=False)
    return {q: Counter(samp_idx[i]) for i, q in zip(idx, queries)}

def search_doc_hits(queries, query_vecs, ngrams=ngrams, ngram_vecs=ngram_vecs, samp_idx=samp_idx, radius=0.5, metric="cosine"):
    search_index = nb.NearestNeighbors(radius=radius, metric=metric).fit(ngram_vecs)
    idx = search_index.radius_neighbors(query_vecs, return_distance=False)
    hits = {q: pd.Series(ngrams[i]).groupby(samp_idx[i]).apply(list).to_dict() for i, q in zip(idx, queries)}
    return hits

queries = [
    "damaged credit score"
]
query_vecs = model.encode_queries(queries)
results = search_doc_hits(queries, query_vecs, ngram_vecs=ngram_vecs[:int(3e6)], radius=0.4)
results

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

Pooling:   0%|          | 0/1 [00:00<?, ?it/s]

{'damaged credit score': {0: ['credit bureaus did'],
  2: ['affected my credit score',
   'that has affected my credit',
   'has affected my credit score',
   'credit score tremendously.',
   'credit score tremendously',
   'has affected my credit',
   'my credit score tremendously.',
   'affected my credit score tremendously',
   'has affected my credit score tremendous',
   'that has affected my credit score',
   'claim that has affected my credit'],
  7: ['corrected on my credit report',
   'on my credit report',
   'corrected on my credit'],
  11: ['my credit report that is caused', 'credit report that is caused'],
  20: ['on my credit report is incorrect', 'on my credit report is'],
  33: ['showing on credit reports needs corrected',
   'on credit reports needs corrected and',
   'showing on credit reports needs',
   'on credit reports needs corrected',
   'credit reports needs corrected and',
   'on credit reports needs',
   'showing on credit reports'],
  37: ['credit report. co

In [88]:
print(docs[2])

" " '' I have disputed this collection XXXX XXXX XXXX XXXX   XXXX for months.XXXX and XXXX have Deleted them but Equifax is refusing too. Not to mention they will not give me an explanation as to why they wont overturn or fix this erroneous inaccurate claim that has affected my credit score tremendously. 
I am a victim of identity theft.CFPB please step up and fix this ASAP, pursuant to section 605B of the Fair Credit Reporting Act. '' '' ''


In [None]:
results["unauthorized transaction"][0].tolist()

['a unauthorized transaction for', 'an unauthorized transaction in']