# End-to-End LLM based solution
Using sentence semantic symilarity and summaring models

## Tests and Tries

In [1]:
import numpy as np

In [2]:
COLAB = True

In [3]:
if COLAB:
    from google.colab import drive
    from os.path import exists

    amazon_train_file = 'train.ft.txt.bz2'

    if exists(amazon_train_file):
        train_file = amazon_train_file
    else:
        drive.mount('/content/drive/')

        # For Hadas' drive
        my_dir = 'drive/MyDrive/Y-data/Intuit-K-anonimity/'

        # For Lior's drive
        #my_dir = 'drive/MyDrive/Y-data/Y-DATA_PROJECT/'

        train_file = my_dir + '/train.ft.txt.bz2'
else:
    train_file = '../data/' + 'train.ft.txt.bz2'


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Credit https://www.kaggle.com/code/anshulrai/cudnnlstm-implementation-93-7-accuracy

import bz2

# Readling the file to list of comments
train_file = bz2.BZ2File(train_file)
train_file_lines = train_file.readlines()

# Converting from raw binary strings to strings that can be parsed
train_file_lines = [x.decode('utf-8') for x in train_file_lines]

# Extracting the labels and sentences
#train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file_lines]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file_lines] # And converting to lower case

del(train_file_lines)  # Free RAM


In [5]:
len(train_sentences)

3600000

### Working with short sentences - temporarly

In [6]:
short_train_sentences = [x for x in train_sentences if len(x.split(' ')) < 20]
len(short_train_sentences)

28005

In [7]:
train_sentences, all_sentences = short_train_sentences, train_sentences

### Sentences similarity

In [8]:
!pip install sentence_transformers -q

In [9]:
from sentence_transformers import SentenceTransformer

Embedding

In [10]:
%%time
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v1')
embeddings = model.encode(train_sentences[:100])


CPU times: user 5.99 s, sys: 181 ms, total: 6.17 s
Wall time: 8 s


Finding the most similar sentences to the first sentence

In [11]:
one_sent = embeddings[0]
sim = []
idx, max_idx = 1, 1
for idx, e in enumerate(embeddings[:]):
    cos_sim = np.dot(e, one_sent)/(np.linalg.norm(e)*np.linalg.norm(one_sent))
    sim.append((cos_sim, idx))

sim.sort(reverse=True)
sim[0:4]

[(1.0000001, 0), (0.6311436, 32), (0.55656093, 45), (0.46512488, 67)]

In [12]:
train_sentences[0]

'textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again'

In [13]:
train_sentences[32]

'recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.'

Finding K nearest neighbors using Annoy

In [14]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
k = 3

In [16]:
from annoy import AnnoyIndex

# Build an Annoy index with 10 trees. angular = cosine similarity
annoy_index = AnnoyIndex(embeddings.shape[1], metric='angular')
for i, x in enumerate(embeddings):
    annoy_index.add_item(i, x)
annoy_index.build(10)

# Find the k nearest neighbors to the first sentence
nearest_neighbors = annoy_index.get_nns_by_vector(embeddings[0], k)

print('Nearest neighbors:', nearest_neighbors)


Nearest neighbors: [0, 32, 45]


In [17]:
print(train_sentences[0])
print(train_sentences[32])
print(train_sentences[45])

textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.
recommend this seller: i recieved this book on time and in excellent condition. i'd definitely recommend this seller.


### Summaring multiple sentences into one

In [18]:
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# model_name = "snrspeaks/t5-one-line-summary"
# sum_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
#sum_text([train_sentences[0], train_sentences[32], train_sentences[45]], sum_model, tokenizer)

## Putting it all together - function definition

In [20]:
%%time
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v1')

CPU times: user 643 ms, sys: 126 ms, total: 769 ms
Wall time: 860 ms


In [21]:
def print_example(indexes):
    print('Before:')
    for i in indexes:
        print(train_sentences[i])
    print('After:')
    for i in indexes:
        print(annon_sents[i])
    

In [22]:
def find_k_nearesr_neighbors(emb, emb_list, k):
    """ Find the K nearest neighbors using Annoy package """

    # Build an Annoy index with 10 trees. angular = cosine similarity
    annoy_index = AnnoyIndex(emb_list.shape[1], metric='angular')
    for i, x in enumerate(emb_list):
        annoy_index.add_item(i, x)
    annoy_index.build(10)

    # Find the k nearest neighbors to the first sentence
    nearest_neighbors = annoy_index.get_nns_by_vector(emb, k)

    return nearest_neighbors

In [23]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

def sum_text(doc_list):
    # define the input sentences
    #input_text = '. '.join(doc_list)
    input_text = ''
    i = 1
    for doc in doc_list:
       input_text = f'{input_text}{i}: {doc}. ' 
       i += 1

    # preprocess the input sentences
    input_ids = tokenizer.encode(f'summarize the {i} documents:' + input_text, return_tensors="pt")

    # generate the summary sentence
    output_ids = model.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [28]:
def run_anonymization_on_txt(docs, k):
    """ Finding K nearest neighbors and summarize them """
    annon_docs = docs.copy()
    used_indexes = set([])
    
    # Embedding
    docs_emb = emb_model.encode(docs)
    temp_docs_emb = docs_emb.copy()

    for i, d in enumerate(docs):
        #print('i:', i, '\t', used_indexes)
        # To prevent redandent
        if i not in used_indexes:
            used_indexes.add(i)  # Adding to the used items
            similar_doc_ind = find_k_nearesr_neighbors(temp_docs_emb[i], temp_docs_emb, k)
            print('similar_doc_ind', similar_doc_ind)
            curr_docs = []
            for sd in similar_doc_ind:
                # Adding the document to the similar doc list
                curr_docs.append(docs[sd])
                # Adding the index to the used items
                used_indexes.add(sd)  
                # Prevent repeating comparison by changing the vector
                temp_docs_emb[sd] = 1000 * np.random.randint(10, size=len(temp_docs_emb[sd]))
                #temp_docs_emb[sd] = [1000] * len(temp_docs_emb[sd])
            sum_doc = sum_text(curr_docs)
            #print('sum_doc:', sum_doc)
            for sd in similar_doc_ind:
                annon_docs[sd] = sum_doc
        if  len(used_indexes) > (len(docs) - k):
            print('Breaking! \tlen(used_indexes)', len(used_indexes), '\tlen(docs)', len(docs), '\tlen(docs)-k', (len(docs) - k))
            break
    return annon_docs



## Running on 100 examples

In [26]:
%%time
annon_sents = run_anonymization_on_txt(train_sentences[:100], k=3)

similar_doc_ind [0, 32, 45]
similar_doc_ind [1, 72, 73]
similar_doc_ind [2, 63, 41]
similar_doc_ind [3, 49, 56]
similar_doc_ind [4, 35, 47]
similar_doc_ind [5, 6, 80]
similar_doc_ind [7, 36, 37]
similar_doc_ind [8, 50, 12]
similar_doc_ind [9, 92, 86]
similar_doc_ind [10, 26, 70]
similar_doc_ind [11, 54, 40]
similar_doc_ind [13, 88, 82]
similar_doc_ind [14, 69, 29]
similar_doc_ind [15, 30, 24]
similar_doc_ind [16, 85, 94]
similar_doc_ind [17, 34, 98]
similar_doc_ind [18, 59, 83]
similar_doc_ind [19, 78, 87]
similar_doc_ind [20, 90, 28]
similar_doc_ind [21, 33, 25]
similar_doc_ind [22, 64, 39]
similar_doc_ind [23, 99, 55]
similar_doc_ind [27, 42, 95]
similar_doc_ind [31, 96, 43]
similar_doc_ind [38, 76, 58]
similar_doc_ind [44, 57, 79]
similar_doc_ind [46, 93, 66]
similar_doc_ind [48, 61, 74]
similar_doc_ind [51, 53, 52]
similar_doc_ind [60, 67, 75]
similar_doc_ind [62, 81, 91]
similar_doc_ind [65, 97, 68]
similar_doc_ind [71, 77, 84]
Breaking! 	len(used_indexes) 99 	len(docs) 100 	len(d

Output examples

In [27]:
print_example([0, 32, 45])

Before:
textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.
recommend this seller: i recieved this book on time and in excellent condition. i'd definitely recommend this seller.
After:
i recieved this book on time and in excellent condition. i'd definitely recommend this seller.
i recieved this book on time and in excellent condition. i'd definitely recommend this seller.
i recieved this book on time and in excellent condition. i'd definitely recommend this seller.


In [28]:
print_example([1, 72, 73])

Before:
janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended. thank you.
good bike, bad packing: when i received the bike, the packing board was broken.the bike quality is good.
great purchase: installed in seconds. great performance. excellent to eliminate phone line dependency.
After:
janes all the worlds aircraft 1996-7: great deal with very quick delivery. when i received the bike, the packing board was broken.
janes all the worlds aircraft 1996-7: great deal with very quick delivery. when i received the bike, the packing board was broken.
janes all the worlds aircraft 1996-7: great deal with very quick delivery. when i received the bike, the packing board was broken.


In [29]:
print_example([19, 78, 87])

Before:
beach boys and the satan: a totally off and enjoyable movie. all brian wilson fans should see this movie.
great: one of adams best films along side the wedding singer and mr. deeds. funny movie. worth buying
solid follow up: very good movie. for children 7 and up...any younger they may not grasp the concept.
After:
beach boys and the satan: a totally off and enjoyable movie. one of adams best films along side the wedding singer
beach boys and the satan: a totally off and enjoyable movie. one of adams best films along side the wedding singer
beach boys and the satan: a totally off and enjoyable movie. one of adams best films along side the wedding singer


Test anonymity

In [30]:
#get_pesonal_docs(annon_sents)

## Running on 1000 examples

In [47]:
%%time
annon_sents = run_anonymization_on_txt(train_sentences[:501], k=3)

similar_doc_ind [0, 32, 178]
similar_doc_ind [1, 200, 487]
similar_doc_ind [2, 393, 158]
similar_doc_ind [3, 367, 234]
similar_doc_ind [4, 243, 285]
similar_doc_ind [5, 126, 332]
similar_doc_ind [6, 312, 403]
similar_doc_ind [7, 264, 274]
similar_doc_ind [8, 410, 395]
similar_doc_ind [9, 137, 136]
similar_doc_ind [10, 111, 326]
similar_doc_ind [11, 462, 106]
similar_doc_ind [12, 218, 186]
similar_doc_ind [13, 339, 88]
similar_doc_ind [14, 486, 224]
similar_doc_ind [15, 92, 157]
similar_doc_ind [16, 30, 335]
similar_doc_ind [17, 146, 180]
similar_doc_ind [18, 475, 63]
similar_doc_ind [19, 370, 470]
similar_doc_ind [20, 344, 153]
similar_doc_ind [21, 152, 352]
similar_doc_ind [22, 213, 302]
similar_doc_ind [23, 366, 210]
similar_doc_ind [24, 474, 305]
similar_doc_ind [25, 163, 286]
similar_doc_ind [26, 235, 303]
similar_doc_ind [27, 293, 471]
similar_doc_ind [28, 62, 70]
similar_doc_ind [29, 40, 328]
similar_doc_ind [31, 361, 368]
similar_doc_ind [33, 237, 427]
similar_doc_ind [34, 36, 2

In [32]:
# For running on 1000 documents
print_example([0, 546, 933])

Before:
textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
mathbook: excellent condition of book. description is true to the condition of the mathbook.shipped quickly excellent seller.
good deal!: used library book, but still in good condition. book came quickly and was cheap. overall good deal!
After:
used library book, but still in good condition. book came quickly and was cheap. overall good deal!
used library book, but still in good condition. book came quickly and was cheap. overall good deal!
used library book, but still in good condition. book came quickly and was cheap. overall good deal!


In [31]:
print_example([0, 32, 178])

Before:
textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.
great: i received this book promptly and at a good price. i definitely appreciate that kind of service.
After:
book shipped quickly and was in excellent condition as stated. easy transaction would buy again.
book shipped quickly and was in excellent condition as stated. easy transaction would buy again.
book shipped quickly and was in excellent condition as stated. easy transaction would buy again.


In [32]:
print_example([1, 200, 487])

Before:
janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended. thank you.
great service: great customer service and shipping was quick product in great condition. love the movie. thanks.
fantastic: great service, quick shipping, no hassles, item was in perfect condition. no complaints at all.
After:
janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended.
janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended.
janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended.


In [33]:
print_example([2, 393, 158])

Before:
edge of danger: 1 star - only because that's the minimum.this book proves the famous can publish anything.
the five star ratings don't lie on this one: an absolute masterpiece.... there's nothing more to be said
if ever a book deserved an 11...: absolutely wonderful...surely one of the greatest novels to come out from america.
After:
edge of danger is rated 1 star - only because that's the minimum. 5 star ratings don't lie on this one: an
edge of danger is rated 1 star - only because that's the minimum. 5 star ratings don't lie on this one: an
edge of danger is rated 1 star - only because that's the minimum. 5 star ratings don't lie on this one: an


### Another example for summary

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

In [27]:
# define the input sentences
input_text = "1: textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again. 2: recent purchase: the book i ordered was exactly as described and delivery was even faster than promised. 3: recommend this seller: i received this book on time and in excellent condition. i'd definitely recommend this seller."
#input_text = "disappointed: this is what happens when artists get too ambitious, ohh man i'm gonna miss the old nelly. curious, so i bought it: another trying hard wannabe dance diva. she should stick to singing ballads. tangerine dream is great.: i love tangerine dream, but this album is not my favorite. still worth owning though."
input_text = "1: janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended. thank you. 2: good bike, bad packing: when i received the bike, the packing board was broken.the bike quality is good. 3: great purchase: installed in seconds. great performance. excellent to eliminate phone line dependency."

# preprocess the input sentences
input_ids = tokenizer.encode("summarize the 3 documents:" + input_text, return_tensors="pt")

# generate the summary sentence
output_ids = model.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output)


janes all the worlds aircraft 1996-7: great deal with very quick delivery. when i received the bike, the packing board was broken.


## Test anonymity

### Define functions

In [34]:
# Credit: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial 

import spacy

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Defining the document
    doc = nlp(doc) 

    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    #if len(txt) > 2:
    #    return ' '.join(txt)
    clean_doc = ' '.join(txt)
    return clean_doc

In [35]:
import re

def clean_corpus(corpus):
    """ Cleans the corpus """
    brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in corpus)
    corpus_lemmas = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]
    return corpus_lemmas

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is defined only once
vectorizer = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')

def get_pesonal_docs(docs, min_k = None):
    """ If K not given, returns the minimal current k and the corresponding documents.
        If k is given, return the documents with k or less neighbohrs  """
    
    # Lemmatizing the documents
    ldocs = clean_corpus(docs)

    # Vectorizing
    count_data = vectorizer.fit_transform(ldocs)
    
    # Counting unique values
    uniq_arr, uniq_cnt = np.unique(count_data.toarray(), axis=0, return_counts=True)
    if not min_k:
        min_k = min(uniq_cnt)
    
    # All the unique vectors
    un_anon = uniq_arr[uniq_cnt <= min_k]

    # Getting the unique vectore indeces
    indeces_list = []
    for row in un_anon:
        # Get the similar rows
        similar_vals = np.where((count_data.toarray() == (row)).all(axis=1))
        indeces_list.append(similar_vals[0].tolist())

    return min_k, indeces_list

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is defined only once
vectorizer = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')

def get_anonym_degree(docs, min_k = None):
    """ If K not given, returns the minimal current k and the corresponding documents.
        If k is given, return the documents with k or less neighbohrs  """
    
    # Lemmatizing the documents
    ldocs = clean_corpus(docs)

    # Vectorizing
    count_data = vectorizer.fit_transform(ldocs)
    count_data = count_data.toarray()
    # Converting any number larger than 1 into 1
    count_data[count_data > 1] = 1
    # Counting unique values
    uniq_arr, uniq_cnt = np.unique(count_data, axis=0, return_counts=True)
    if not min_k:
        min_k = min(uniq_cnt)
        # All the unique vectors
        un_anon = uniq_arr[uniq_cnt <= min_k]
    else:
        # All the unique vectors
        un_anon = uniq_arr[uniq_cnt < min_k]
        min_k = min(uniq_cnt) # For the return value

    # Getting the unique vectore indeces
    indeces_list = []
    for row in un_anon:
        # Get the similar rows
        similar_vals = np.where((count_data == (row)).all(axis=1))
        indeces_list.append(similar_vals[0].tolist())

    return min_k, indeces_list

### Running

In [63]:
get_anonym_degree(annon_sents)

(1, [[420], [434]])

In [67]:
get_anonym_degree(annon_sents, min_k=3)

(1, [[210, 366], [48, 289], [420], [434]])

## GPT-Chat output

**Input**:   
* janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended. thank you.
* good bike, bad packing: when i received the bike, the packing board was broken.the bike quality is good.
* great purchase: installed in seconds. great performance. excellent to eliminate phone line dependency.

**Output**:   
"The reviews indicate positive experiences with quick delivery and great performance of the products, but with some issues in packaging."

____

**Input**:   
* textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
* recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.
* recommend this seller: i recieved this book on time and in excellent condition. i'd definitely recommend this seller.

**Output**:   
"The reviews are positive and recommend the seller for their fast shipping and excellent condition of the books."
____

**Input**:   
* beach boys and the satan: a totally off and enjoyable movie. all brian wilson fans should see this movie.
* great: one of adams best films along side the wedding singer and mr. deeds. funny movie. worth buying
* solid follow up: very good movie. for children 7 and up...any younger they may not grasp the concept.

**Output**:   
"The reviews are positive and recommend the movies for their enjoyable and entertaining qualities."		