In [2]:
import os
import torch
from torch.amp import autocast
import pandas as pd
import transformers, datasets
from rouge_score.rouge_scorer import RougeScorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk

In [4]:
# Download the CFPB complaints dataset and pool the narrative column of the
# train and test splits (train first, then test) into one flat list.
data = datasets.load_dataset("Johnade/consumer_complaints_cfpb")
narr = [
    text
    for split in ("train", "test")
    for text in data[split]["consumer_complaint_narrative"]
]
narr[:10]

['Im submitting this complaint to inform you again that I am a victim of identity theft and I write to dispute certain items in my file resulting from the crime. The items I am disputing do not relate to any transactions obtaining any possession of goods, services or money that I have made or authorized. Please block the reporting of any information in my credit file that resulted from the alleged identity theft. \nThese following items listed below are unknown to me. \nXXXX XXXX XXXX XXXXXXXX XXXX XXXXXXXX XXXX XXXXXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXXXX/XX/2022 ; XXXX XX/XX/2022 ; XXXX XXXX XX/XX/2022 ; XXXXXXXX XXXX XXXXXX/XX/2022.',
 "The account is reporting on my credit report but is not my account. It 's not even my name on the account. I tried to talk to the company and the first gentleman got aggravated and starting talking over me and told me to hold on, and then his supervisor got on the phone and she said I needed to file a dispute and allow 72 hours for dispute

In [8]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Load the DistilBART CNN summarization checkpoint and its fast tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "sshleifer/distilbart-cnn-12-6",
    use_fast=True,
)
model = transformers.AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = model.to(device)
model



BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((102

In [6]:
# Keep only the narratives that are longer than 2,000 characters.
long_narr = list(filter(lambda text: len(text) > 2000, narr))
len(long_narr)

14188

In [34]:
# Draw a fixed-seed sample of ten long narratives to summarize.
long_narr_samp = (
    pd.Series(long_narr)
    .sample(n=10, random_state=42)
    .tolist()
)
long_narr_samp

["I have been getting phone calls from this number several times a day now for over 2 months. When I Googled this number, it comes up as a scam number -- high risk. Then last week, a ma n left a message twice on my recorder that he is attempting to collect a debt on behalf of Apelles LLC, but  the phone number he left to call back was  XXXX , not as shown on my call logs. I have no idea wh o this Apelles LLC is . I have never received any letter from a creditor informing me that this company has been retained to collect a debt on their behalf. I have filed a complaint with the BBB in  XXXX , OH where this company is headquartered and asked that they cease and desist in calling me and leaving messages on my recorder ( done last week ). I have not received anything back from BBB with a response from this Apelles. I do not have a private voice messaging center as caretakers have access to my recorder as I am in and out of the hospital much due to   XXXX   XXXX   XXXX  . The FDCPA prohibit

In [35]:
def predict(docs, max_length=1024, batch_size=64, max_new_tokens=1024, do_chunking=True, stride=100):
    """Summarize each document with the notebook-level ``model`` and ``tokenizer``.

    When ``do_chunking`` is true, a document longer than ``max_length`` tokens
    is split into overlapping chunks; every chunk is summarized separately and
    the chunk summaries belonging to one document are joined with a single
    space. When it is false, each document is truncated to ``max_length``.

    Args:
        docs: Sequence of input strings.
        max_length: Maximum number of tokens per encoder input (chunk).
        batch_size: Number of chunks passed to ``model.generate`` at a time.
        max_new_tokens: Upper bound on the number of generated tokens per chunk.
        do_chunking: Whether tokens beyond ``max_length`` become extra chunks.
        stride: Number of tokens shared between consecutive chunks.

    Returns:
        list[str]: One summary per document, ordered like ``docs``. An empty
        ``docs`` yields an empty list.
    """
    if len(docs) == 0:
        # Nothing to tokenize; also avoids dividing by zero in the stats below.
        return []

    inputs = tokenizer.batch_encode_plus(
        docs,
        return_tensors="pt",
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=do_chunking,
        stride=stride,
    )
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    n_inputs = len(input_ids)
    n_tokens_in = int((input_ids != tokenizer.pad_token_id).sum())
    print(f"{n_tokens_in:,.0f} tokens in {len(docs):,.0f} documents with {n_inputs:,.0f} chunks.")
    print(f"Mean tokens per document: {n_tokens_in / len(docs):,.2f}")

    # The chunk -> document mapping only exists when overflow chunks were
    # requested; without chunking every row is its own document.
    if "overflow_to_sample_mapping" in inputs:
        sample_mapping = torch.as_tensor(inputs["overflow_to_sample_mapping"]).tolist()
    else:
        sample_mapping = list(range(n_inputs))

    decoded = []
    with torch.no_grad(), autocast(device_type=model.device.type):
        for start in range(0, n_inputs, batch_size):
            stop = start + batch_size
            outputs = model.generate(
                input_ids=input_ids[start:stop].to(device),
                attention_mask=attention_mask[start:stop].to(device),
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=1.0,
                num_return_sequences=1,
                num_beams=10,
            )
            # Tensor.to()/cpu() return a new tensor, so the result must be kept.
            outputs = outputs.cpu()
            # Decoding per batch removes the need to pad and concatenate
            # batches of different generated lengths.
            decoded.extend(
                tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            )
            print(f"Summarized {len(decoded):,.0f} of {n_inputs:,.0f} chunks.")

    preds = pd.DataFrame({"summary": decoded, "sample_id": sample_mapping})
    preds["summary"] = preds["summary"].str.strip()
    # groupby sorts by sample_id, which restores the input document order.
    preds = preds.groupby("sample_id")["summary"].agg(" ".join).reset_index()
    return preds["summary"].to_list()

# predict() generates with do_sample=True, so seed the RNGs first; otherwise
# the summaries (and every score computed from them below) change on each run.
transformers.set_seed(42)
summs = predict(long_narr_samp, batch_size=4, max_new_tokens=1024)
summs

7,053 tokens in 10 documents with 11 chunks.
Mean tokens per document: 705.30
Summarized 4 of 11 chunks.
Summarized 8 of 11 chunks.
Summarized 11 of 11 chunks.


['Apelles LLC left messages twice on my recorder that he is trying to collect a debt on behalf of a debt. I do not have a private voice messaging center as caretakers have access to my recorder. FDCPA prohibits debt collects from leaving voicemail messages when the voicemail is not completely private with others having access to voicemail.',
 'There is a malicious takeover of commercial interests and extends back to personal matters. Some of her family has been involved and I believe has used her co signing on this loan and other things in various matters they have sought to deny and deprive me information of. My accounts are being monitored and cut off and the communications I do receive are hijacked. I intend to ensure the copyright I made is put in my name.',
 "The new credit reports ( XX/XX/XXXX) all omitted this valuable information ( listed as NA '' or nothing at all with not payment amounts ) and it lowers the evaluation score pulled by lenders. This past XXXX I utilized approxi

In [12]:
# device = 0 # the device to load the model onto

# access_token = os.environ["HF_TOKEN"]  # never commit a literal token; the one previously pasted here must be revoked
# model = transformers.AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", token=access_token)
# tokenizer = transformers.AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", token=access_token)

# messages = [
#     {"role": "user", "content": "I'm going to give you a description of a financial industry complaint submitted to the CFPB, and I want you to generate a summary of it."},
#     {"role": "assistant", "content": "Sure, I can help with that. Please provide the description of the complaint."},
#     {"role": "user", "content": long_narr[5]},
# ]

# encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

# model_inputs = encodeds.to(device)
# model.to(device)

# generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
# summ = tokenizer.batch_decode(generated_ids)
# print(summ)

In [13]:
# import re
# summ = re.split(r"</?s>", summ[0])[-2].strip()
# summ

In [36]:
import re


def sent_rouge(cand, ref, min_ref_len=3, use_stemmer=False, verbose=True):
    """Match every candidate sentence to its best-scoring reference sentence by ROUGE-1 F1.

    Both texts are split into sentences with ``nltk.sent_tokenize``. Reference
    sentences with fewer than ``min_ref_len`` word tokens are ignored.

    Args:
        cand: Candidate text (e.g. a generated summary).
        ref: Reference text (e.g. the source document).
        min_ref_len: Minimum number of ``\\w+`` tokens a reference sentence needs to be considered.
        use_stemmer: Passed through to ``RougeScorer``.
        verbose: Print the candidate, each sentence and its best match (the default, as before).

    Returns:
        pandas.DataFrame with columns ``candidate``, ``best_ref`` and
        ``best_score``, one row per candidate sentence. If no reference
        sentence qualifies, ``best_ref`` is ``None`` and ``best_score`` is 0.0.
    """
    if verbose:
        print(f"Full candidate: '{cand}'")
    cand_sents = nltk.sent_tokenize(cand)
    ref_sents = [s for s in nltk.sent_tokenize(ref) if len(re.findall(r"\b\w+\b", s)) >= min_ref_len]
    scorer = RougeScorer(["rouge1"], use_stemmer=use_stemmer)

    rows = []
    for cand_sent in cand_sents:
        if verbose:
            print(f"Candidate: '{cand_sent}'")
        best_ref, best_score = None, 0.0
        for ref_sent in ref_sents:
            # RougeScorer.score expects (target, prediction).
            score = scorer.score(ref_sent, cand_sent)["rouge1"].fmeasure
            # Strict ">" keeps the first reference on ties.
            if best_ref is None or score > best_score:
                best_ref, best_score = ref_sent, score
        if verbose:
            print(f"Best reference ({best_score:.2f} F1): '{best_ref}'")
            print("-" * 80)
        rows.append({"candidate": cand_sent, "best_ref": best_ref, "best_score": best_score})

    # Explicit columns keep the schema stable even when there are no candidate sentences.
    return pd.DataFrame(rows, columns=["candidate", "best_ref", "best_score"])

# Show the sentence-level ROUGE-1 matches for one sampled document with wide
# columns so the sentences are not truncated in the rendered table.
index = 1
with pd.option_context("display.max_colwidth", 1000):
    display(
        sent_rouge(
            summs[index],
            long_narr_samp[index],
            min_ref_len=3,
            use_stemmer=False,
        )
    )


Full candidate: 'There is a malicious takeover of commercial interests and extends back to personal matters. Some of her family has been involved and I believe has used her co signing on this loan and other things in various matters they have sought to deny and deprive me information of. My accounts are being monitored and cut off and the communications I do receive are hijacked. I intend to ensure the copyright I made is put in my name.'
Candidate: 'There is a malicious takeover of commercial interests and extends back to personal matters.'
Best reference (1.00 F1): 'There is a malicious takeover of commercial interests and extends back to personal matters.'
--------------------------------------------------------------------------------
Candidate: 'Some of her family has been involved and I believe has used her co signing on this loan and other things in various matters they have sought to deny and deprive me information of.'
Best reference (1.00 F1): 'Some of her family has been inv

Unnamed: 0,candidate,best_ref,best_score
0,There is a malicious takeover of commercial interests and extends back to personal matters.,There is a malicious takeover of commercial interests and extends back to personal matters.,1.0
1,Some of her family has been involved and I believe has used her co signing on this loan and other things in various matters they have sought to deny and deprive me information of.,Some of her family has been involved and I believe has used her co signing on this loan and other things in various matters they have sought to deny and deprive me information of.,1.0
2,My accounts are being monitored and cut off and the communications I do receive are hijacked.,"Not only is information being denied, but my accounts are being monitored and cut off and the communications I do receive are hijacked.",0.820513
3,I intend to ensure the copyright I made is put in my name.,I intend to ensure the copyright I made is put in my name.,1.0


In [38]:
import numpy as np
import scipy as sp
import nltk

def sent_rouge2(cand, ref, min_ref_len=3, ngram=1, token_pattern=r"\b\w\w+\b", stop_words="english"):
    """Vectorized sentence-level n-gram overlap F1 between a candidate and a reference text.

    Both texts are split into sentences with ``nltk.sent_tokenize``. Every
    sentence becomes a binary bag of ``ngram``-grams (stop words removed), and
    each candidate sentence is paired with the reference sentence that
    maximizes the overlap F1.

    Args:
        cand: Candidate text (e.g. a generated summary).
        ref: Reference text (e.g. the source document).
        min_ref_len: Minimum number of ``token_pattern`` matches a reference sentence needs to be considered.
        ngram: n-gram order used for the overlap.
        token_pattern: Regex defining a token, used for filtering and vectorizing.
        stop_words: Stop-word setting passed to ``CountVectorizer``.

    Returns:
        pandas.DataFrame with columns ``candidate``, ``best_ref`` and
        ``best_score``, one row per candidate sentence. If there are no usable
        reference sentences, ``best_ref`` is ``None`` and ``best_score`` is 0.0.

    Raises:
        ValueError: Propagated from ``CountVectorizer.fit`` when no n-gram at
            all survives tokenization and stop-word removal.
    """
    cands = nltk.sent_tokenize(cand)
    refs = [r for r in nltk.sent_tokenize(ref) if len(re.findall(token_pattern, r)) >= min_ref_len]
    if not cands or not refs:
        # Nothing to compare against: keep the schema, report "no match".
        return pd.DataFrame(
            {"candidate": cands, "best_ref": [None] * len(cands), "best_score": [0.0] * len(cands)}
        )

    vectorizer = CountVectorizer(
        token_pattern=token_pattern, ngram_range=(ngram, ngram), stop_words=stop_words, binary=True
    )
    vectorizer.fit(cands + refs)
    cands_vec = vectorizer.transform(cands)
    refs_vec = vectorizer.transform(refs)

    # Distinct n-grams per sentence (the vectors are binary).
    cand_counts = np.asarray(cands_vec.sum(axis=1)).ravel()
    ref_counts = np.asarray(refs_vec.sum(axis=1)).ravel()
    # matches[i, j] = number of n-grams shared by candidate i and reference j.
    matches = (cands_vec @ refs_vec.T).toarray()

    # With precision = m / c and recall = m / r, F1 = 2m / (c + r). Using the
    # closed form avoids the 0/0 -> NaN that the separate divisions produce
    # for sentences made only of stop words (NaN would then win argmax).
    denom = cand_counts[:, None] + ref_counts[None, :]
    f1 = np.divide(2.0 * matches, denom, out=np.zeros(matches.shape, dtype=float), where=denom > 0)

    best_idx = f1.argmax(axis=1)
    # Positional lookup; label-based Series indexing would create duplicate index labels.
    best_ref = [refs[j] for j in best_idx]
    best_score = f1.max(axis=1)
    return pd.DataFrame({"candidate": cands, "best_ref": best_ref, "best_score": best_score})

# Print the character lengths of the summary and its source, then show the
# vectorized overlap table with wide columns.
index = 5
print(len(summs[index]), len(long_narr_samp[index]), sep="\n")
with pd.option_context("display.max_colwidth", 1000):
    display(
        sent_rouge2(
            summs[index],
            long_narr_samp[index],
            ngram=1,
            min_ref_len=3,
            token_pattern=r"\b\w\w+\b",
            stop_words="english",
        )
    )

532
2138


Unnamed: 0,candidate,best_ref,best_score
0,"At the end of the Fall semester in XXXX, I spoke to a counselor, about discontinuing my enrollment, because my grandmother and mother were having serious issues with their house, that was in the flood zone of XXXX.","At the end of the Fall semester in XXXX, I spoke to a counselor, about discontinuing my enrollment, because my grandmother and mother were having serious issues with their house, that was in the flood zone of XXXX.",1.0
1,"I was forced to withdraw, and move back to XXXX XXXX to help with clean up and to rebuild, and now the VA says I owe the money for withdrawing.","Well, I was forced to withdraw, and move back to XXXX XXXX to help with clean up and to rebuild, and now the VA says I owe the money for withdrawing.",1.0
2,There is no mention of mitigating circumstances anywhere on my transcript.,"Also, there is no mention of mitigating circumstances anywhere on my transcript.",1.0
3,"I attempted to go to the school, but the building was empty, and the school had changed locations.","I attempted to go to the school, but the building was empty, and when I tried to contact the school, I found out that they had changed locations, and none of the same people were still there, and I was told that there was no record of them having talked to me about mitigating circumstances.",0.526316


In [42]:
from sklearn.metrics.pairwise import cosine_similarity

def sent_cosine(cand, ref, min_ref_len=3, ngram=1, token_pattern=r"\b\w\w+\b", stop_words="english", verbose=True):
    """Match every candidate sentence to its most cosine-similar reference sentence.

    Both texts are split into sentences with ``nltk.sent_tokenize``; sentences
    are represented as ``ngram``-gram count vectors (stop words removed).

    Args:
        cand: Candidate text (e.g. a generated summary).
        ref: Reference text (e.g. the source document).
        min_ref_len: Minimum number of ``token_pattern`` matches a reference sentence needs to be considered.
        ngram: n-gram order used for the count vectors.
        token_pattern: Regex defining a token, used for filtering and vectorizing.
        stop_words: Stop-word setting passed to ``CountVectorizer``.
        verbose: Print the shape of the similarity matrix (the default, as before).

    Returns:
        pandas.DataFrame with columns ``candidate``, ``best_ref`` and
        ``best_score``, one row per candidate sentence. If there are no usable
        reference sentences, ``best_ref`` is ``None`` and ``best_score`` is 0.0.

    Raises:
        ValueError: Propagated from ``CountVectorizer.fit`` when no n-gram at
            all survives tokenization and stop-word removal.
    """
    cands = nltk.sent_tokenize(cand)
    refs = [r for r in nltk.sent_tokenize(ref) if len(re.findall(token_pattern, r)) >= min_ref_len]
    if not cands or not refs:
        # cosine_similarity rejects inputs with zero rows; report "no match" instead.
        return pd.DataFrame(
            {"candidate": cands, "best_ref": [None] * len(cands), "best_score": [0.0] * len(cands)}
        )

    vectorizer = CountVectorizer(token_pattern=token_pattern, ngram_range=(ngram, ngram), stop_words=stop_words)
    vectorizer.fit(cands + refs)
    sims = cosine_similarity(vectorizer.transform(cands), vectorizer.transform(refs))
    if verbose:
        print(sims.shape)

    # Positional lookup; label-based Series indexing would create duplicate index labels.
    best_ref = [refs[j] for j in sims.argmax(axis=1)]
    best_score = sims.max(axis=1)
    return pd.DataFrame({"candidate": cands, "best_ref": best_ref, "best_score": best_score})

# Print the character lengths of the summary and its source, then show the
# cosine-similarity matches with wide columns.
index = 0
print(len(summs[index]), len(long_narr_samp[index]), sep="\n")
with pd.option_context("display.max_colwidth", 1000):
    display(
        sent_cosine(
            summs[index],
            long_narr_samp[index],
            min_ref_len=3,
            ngram=1,
        )
    )

339
2271
(3, 16)


Unnamed: 0,candidate,best_ref,best_score
0,Apelles LLC left messages twice on my recorder that he is trying to collect a debt on behalf of a debt.,"Then last week, a ma n left a message twice on my recorder that he is attempting to collect a debt on behalf of Apelles LLC, but the phone number he left to call back was XXXX , not as shown on my call logs.",0.620174
1,I do not have a private voice messaging center as caretakers have access to my recorder.,I do not have a private voice messaging center as caretakers have access to my recorder as I am in and out of the hospital much due to XXXX XXXX XXXX .,0.641689
2,FDCPA prohibits debt collects from leaving voicemail messages when the voicemail is not completely private with others having access to voicemail.,The FDCPA prohibits debt collects from leaving voicemail messages when the voicemail is not completely private with others having access to voicemail messages.,0.978232
