# End-to-End LLM based solution
Using sentence semantic symilarity and summaring models

In [21]:
!pip install k-means-constrained -q


## Tests and Tries

In [22]:
import numpy as np

In [23]:
COLAB = True

In [24]:
if COLAB:
    from google.colab import drive
    from os.path import exists

    amazon_train_file = 'train.ft.txt.bz2'

    if exists(amazon_train_file):
        train_file = amazon_train_file
    else:
        drive.mount('/content/drive/')

        # For Hadas' drive
        my_dir = 'drive/MyDrive/Y-data/Intuit-K-anonimity/'

        # For Lior's drive
        #my_dir = 'drive/MyDrive/Y-data/Y-DATA_PROJECT/'

        train_file = my_dir + '/train.ft.txt.bz2'
else:
    train_file = '../data/' + 'train.ft.txt.bz2'


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [25]:
# Credit https://www.kaggle.com/code/anshulrai/cudnnlstm-implementation-93-7-accuracy

import bz2

# Readling the file to list of comments
train_file = bz2.BZ2File(train_file)
train_file_lines = train_file.readlines()

# Converting from raw binary strings to strings that can be parsed
train_file_lines = [x.decode('utf-8') for x in train_file_lines]

# Extracting the labels and sentences
#train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file_lines]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file_lines] # And converting to lower case

del(train_file_lines)  # Free RAM


In [26]:
len(train_sentences)

3600000

### Working with short sentences - temporarly

In [27]:
short_train_sentences = [x for x in train_sentences if len(x.split(' ')) < 20]
len(short_train_sentences)

28005

In [28]:
# train_sentences, all_sentences = short_train_sentences, train_sentences

In [29]:
from random import sample

train_sentences = sample(train_sentences, 100)

### Sentences similarity

In [30]:
!pip install sentence_transformers -q

In [31]:
from sentence_transformers import SentenceTransformer

Embedding

In [32]:
%%time
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v1')
embeddings = model.encode(train_sentences)  # Already sampled


CPU times: user 16.5 s, sys: 117 ms, total: 16.7 s
Wall time: 8.71 s


In [33]:
type(embeddings)

numpy.ndarray

## Using K-means with constrains

Defining the hyper-parameters

In [34]:
k = 2

In [35]:
num_clusters = len(train_sentences) // k

In [36]:
from k_means_constrained import KMeansConstrained

clf = KMeansConstrained(
     n_clusters=num_clusters,
     size_min=k,
     size_max=k,
     random_state=0
)
clf.fit_predict(embeddings)
print(clf.cluster_centers_)
print(clf.labels_)

[[-0.03709989  0.04431973  0.02570675 ... -0.05153409  0.00998777
   0.0389981 ]
 [-0.00331573  0.01557269 -0.00049056 ... -0.03938213 -0.0203734
   0.07345776]
 [-0.04112616 -0.02924925 -0.01480457 ... -0.03343952 -0.01853525
   0.08223258]
 ...
 [-0.03135222 -0.00972221 -0.00269012 ... -0.00746237  0.0257678
   0.01774795]
 [-0.03032733  0.03304454 -0.09535258 ...  0.04236696  0.0183112
   0.02095491]
 [-0.0166149   0.02031434  0.04433139 ... -0.03455397  0.02233887
   0.06630941]]
[ 5  5 32 31 44 36 27 18 46 35 18 11 46 19  9 39  8 23 16 43 16 21 45  0
 19 43 37 44  6 41 40 10 48  6 28 33 21  4 15  3 14 36 29 30 14 13 15 10
 35  0 13  3 25  8 25 38 23 17 48 28 17 49 11 45 41 26 22 42 40 42 24 31
 37 39 20 20 30 38  7  2 47  1  4 32 47 27 12 34  2 22 29 34 24  7 33 26
 12  1  9 49]


In [65]:
100 % 3

1

In [81]:
from k_means_constrained import KMeansConstrained

def ckmeans(data, k):
    """
    Runs k-means with constrains.
    Credit: https://towardsdatascience.com/advanced-k-means-controlling-groups-sizes-and-selecting-features-a998df7e6745
    """
    num_clusters = len(data) // k
    min_size = k
    max_size = k

    # For example, if k=3 and there are 100 sequences,
    # allow one cluster with k+1
    mod_data = len(data) % k
    if mod_data != 0:
        max_size += mod_data
    
    clf = KMeansConstrained(
     n_clusters=num_clusters,
     size_min=min_size,
     size_max=max_size,
     random_state=0
    )
    clf.fit_predict(data)
    pair_list = []
    for i in range(1, num_clusters):
        curr_pair = np.where(clf.labels_ == (i))[0].tolist()
        if curr_pair not in pair_list:
            pair_list.append(tuple(curr_pair))
        
    return pair_list

In [82]:
ckmeans(embeddings, k=3)

[(15, 31, 49),
 (17, 56, 63),
 (57, 60, 78),
 (7, 36, 59),
 (6, 28, 33),
 (1, 35, 94),
 (18, 20, 23),
 (37, 40, 58),
 (74, 75, 91),
 (34, 62, 66),
 (27, 85, 96),
 (3, 70, 92),
 (32, 73, 86),
 (39, 51, 84),
 (26, 44, 71),
 (41, 47, 54),
 (52, 55, 89),
 (25, 81, 97),
 (14, 22, 98),
 (42, 90, 93),
 (45, 50, 72),
 (5, 24, 79, 88),
 (0, 61, 99),
 (2, 38, 46),
 (29, 64, 87),
 (13, 19, 21),
 (8, 16, 69),
 (11, 43, 76),
 (12, 67, 83),
 (4, 53, 80),
 (30, 65, 95),
 (10, 68, 77)]

## Tries

Finding the most similar sentences to the first sentence

In [None]:
one_sent = embeddings[0]
sim = []
idx, max_idx = 1, 1
for idx, e in enumerate(embeddings[:]):
    cos_sim = np.dot(e, one_sent)/(np.linalg.norm(e)*np.linalg.norm(one_sent))
    sim.append((cos_sim, idx))

sim.sort(reverse=True)
sim[0:4]

[(1.0000001, 0), (0.6311436, 32), (0.55656093, 45), (0.46512488, 67)]

In [None]:
train_sentences[0]

'textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again'

In [None]:
train_sentences[32]

'recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.'

Finding K nearest neighbors using Annoy

In [None]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.2-cp310-cp310-linux_x86_64.whl size=582708 sha256=0d100be4232ad17affd6a5470d321a7174b1fef449fb5b83978971a121819b19
  Stored in directory: /root/.cache/pip/wheels/7a/d9/59/473fa56df8e39430eeda369500b4e7127f5b243ba24c3c4297
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.2


In [None]:
k = 3

In [None]:
from annoy import AnnoyIndex

# Build an Annoy index with 10 trees. angular = cosine similarity
annoy_index = AnnoyIndex(embeddings.shape[1], metric='angular')
for i, x in enumerate(embeddings):
    annoy_index.add_item(i, x)
annoy_index.build(10)

# Find the k nearest neighbors to the first sentence
nearest_neighbors = annoy_index.get_nns_by_vector(embeddings[0], k)

print('Nearest neighbors:', nearest_neighbors)


Nearest neighbors: [0, 32, 45]


In [None]:
embeddings

array([[-0.0036093 ,  0.05210925, -0.12970327, ..., -0.02006356,
        -0.04063172,  0.02199112],
       [-0.0052765 ,  0.06117417, -0.03607288, ..., -0.00327409,
         0.02298493,  0.00951664],
       [ 0.05052014,  0.05758742, -0.08671991, ..., -0.03691006,
         0.05248274,  0.05065772],
       ...,
       [-0.0529007 , -0.02020332,  0.01042636, ...,  0.01338089,
         0.01423867,  0.01405939],
       [ 0.04678291, -0.04269627,  0.12175517, ..., -0.01851123,
         0.04590768, -0.03081301],
       [-0.04308654, -0.05753473, -0.03191447, ..., -0.02193826,
         0.0352149 ,  0.03438053]], dtype=float32)

In [None]:
print(train_sentences[0])
print(train_sentences[32])
print(train_sentences[45])

textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.
recommend this seller: i recieved this book on time and in excellent condition. i'd definitely recommend this seller.


### Summaring multiple sentences into one

In [None]:
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# model_name = "snrspeaks/t5-one-line-summary"
# sum_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
#sum_text([train_sentences[0], train_sentences[32], train_sentences[45]], sum_model, tokenizer)

## Putting it all together - function definition

In [None]:
%%time
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v1')

CPU times: user 442 ms, sys: 39 ms, total: 481 ms
Wall time: 424 ms


In [None]:
def print_example(indexes, origina_docs, new_docs):
    print('Before:')
    for i in indexes:
        print(origina_docs[i])
    print('\nAfter:')
    for i in indexes:
        print(new_docs[i])
    

In [None]:
def find_k_nearesr_neighbors(emb, emb_list, k):
    """ Find the K nearest neighbors using Annoy package """

    # Build an Annoy index with 10 trees. angular = cosine similarity
    annoy_index = AnnoyIndex(emb_list.shape[1], metric='angular')
    for i, x in enumerate(emb_list):
        annoy_index.add_item(i, x)
    annoy_index.build(10)

    # Find the k nearest neighbors to the first sentence
    nearest_neighbors = annoy_index.get_nns_by_vector(emb, k)

    return nearest_neighbors

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

def sum_text(doc_list):
    # define the input sentences
    #input_text = '. '.join(doc_list)
    input_text = ''
    i = 1
    for doc in doc_list:
       input_text = f'{input_text}{i}: {doc}. ' 
       i += 1

    # preprocess the input sentences
    input_ids = tokenizer.encode(f'summarize:' + input_text, return_tensors="pt")

    # generate the summary sentence
    output_ids = model.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def run_anonymization_on_txt(docs, k):
    """ Finding K nearest neighbors and summarize them """
    annon_docs = docs.copy()
    used_indexes = set([])
    
    # Embedding
    docs_emb = emb_model.encode(docs)
    temp_docs_emb = docs_emb.copy()

    neighbor_list = []
    for i, d in enumerate(docs):
        #print('i:', i, '\t', used_indexes)
        # To prevent redandent
        if i not in used_indexes:
            #used_indexes.add(i)  # Adding to the used items
            similar_doc_ind = find_k_nearesr_neighbors(temp_docs_emb[i], temp_docs_emb, k)
            
            # Saving the neighbors
            neighbor_list.append(similar_doc_ind)

            curr_docs = []
            for sd in similar_doc_ind:
                if sd in used_indexes:
                    print('Error: the index', sd, 'was already used')
                # Adding the document to the similar doc list
                curr_docs.append(docs[sd])
                # Adding the index to the used items
                used_indexes.add(sd)  
                # Prevent repeating comparison by changing the vector
                temp_docs_emb[sd] = 1000 * np.random.randint(10, size=len(temp_docs_emb[sd]))
                #temp_docs_emb[sd] = [1000] * len(temp_docs_emb[sd])
            sum_doc = sum_text(curr_docs)
            print('similar_doc_ind', similar_doc_ind, '\tSummary:', sum_doc)
            #print('sum_doc:', sum_doc)
            for sd in similar_doc_ind:
                annon_docs[sd] = sum_doc
        if  len(used_indexes) > (len(docs) - k):
            print('Breaking! \tlen(used_indexes)', len(used_indexes), '\tlen(docs)', len(docs), '\tlen(docs)-k', (len(docs) - k))
            break
    return annon_docs, neighbor_list



## Running on 100 examples

In [None]:
%%time
annon_sents_100, neighbor_list_100 = run_anonymization_on_txt(train_sentences[:100], k=3)

similar_doc_ind [0, 32, 45] 	Summary: the book shipped quickly and was in excellent condition as stated. i'd definitely recommend this seller.
similar_doc_ind [1, 72, 73] 	Summary: janes all the worlds aircraft 1996-7: great deal, very quick delivery. when i received the bike, the packing board was broken.
similar_doc_ind [2, 63, 41] 	Summary: edge of danger: 1 star - only because that's the minimum. the write-up was not good. this is definitely a children
similar_doc_ind [3, 49, 56] 	Summary: maeve's forte is character development, dialogue, and interaction. there wasn't enough romance though. great performances by all of the
similar_doc_ind [4, 35, 47] 	Summary: frida's more attractive soundtrack sister: utterly amazing and unique sound. america the beautiful: this book works great for teaching the
similar_doc_ind [5, 6, 80] 	Summary: wishmaster is a fantastic album, that should grace everyone's music collection. innovating and always refreshing. truly a masterpiece.
similar_doc_ind 

Output examples

In [None]:
print_example([0, 32, 45], train_sentences, annon_sents_100)

Before:
textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.
recommend this seller: i recieved this book on time and in excellent condition. i'd definitely recommend this seller.

After:
the book shipped quickly and was in excellent condition as stated. i'd definitely recommend this seller.
the book shipped quickly and was in excellent condition as stated. i'd definitely recommend this seller.
the book shipped quickly and was in excellent condition as stated. i'd definitely recommend this seller.


In [None]:
print_example([1, 72, 73], train_sentences, annon_sents_100)

Before:
janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended. thank you.
good bike, bad packing: when i received the bike, the packing board was broken.the bike quality is good.
great purchase: installed in seconds. great performance. excellent to eliminate phone line dependency.

After:
janes all the worlds aircraft 1996-7: great deal, very quick delivery. when i received the bike, the packing board was broken.
janes all the worlds aircraft 1996-7: great deal, very quick delivery. when i received the bike, the packing board was broken.
janes all the worlds aircraft 1996-7: great deal, very quick delivery. when i received the bike, the packing board was broken.


In [None]:
print_example([19, 78, 87], train_sentences, annon_sents_100)

Before:
beach boys and the satan: a totally off and enjoyable movie. all brian wilson fans should see this movie.
great: one of adams best films along side the wedding singer and mr. deeds. funny movie. worth buying
solid follow up: very good movie. for children 7 and up...any younger they may not grasp the concept.

After:
beach boys and the satan: a totally off and enjoyable movie. one of adams best films along side the wedding singer
beach boys and the satan: a totally off and enjoyable movie. one of adams best films along side the wedding singer
beach boys and the satan: a totally off and enjoyable movie. one of adams best films along side the wedding singer


Test anonymity

In [None]:
#get_pesonal_docs(annon_sents)

## Running on 1000 examples

### K = 3

In [None]:
%%time
annon_sents_1000, neighbor_list_1000 = run_anonymization_on_txt(train_sentences[:1001], k=3)

similar_doc_ind [0, 546, 933] 	Summary: used library book, but still in good condition. book came quickly and was cheap. overall good deal!
similar_doc_ind [1, 200, 713] 	Summary: janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended.
similar_doc_ind [2, 526, 934] 	Summary: this claptrap would have never been published had the author not been black and female. the plot lines are distinctly implausible 
similar_doc_ind [3, 367, 989] 	Summary: the movie has a very weak storyline, loosely linked chapters, total disappointment from a good cast. it was definitely not a classic
similar_doc_ind [4, 243, 285] 	Summary: i love enya's work and this cd is exceptional. my son is thoroughly enjoying this audio version of the music found
similar_doc_ind [5, 518, 126] 	Summary: the things this man can do is amazing. this cd is incredible. exciting the whole way through.
similar_doc_ind [6, 312, 773] 	Summary: wishmaster is a fantastic album, that shoul

KeyboardInterrupt: ignored

### k = 2

In [None]:
annon_sents_1000_2, neighbor_list_1000_2 = run_anonymization_on_txt(train_sentences[:1001], k=2)

similar_doc_ind [17, 890] 	Summary: dr. eby claimed jesus did not return in his (eby's) lifetime. contrary to dr.
similar_doc_ind [18, 579] 	Summary: everyons should read m. mead's research. her research explains many issues that remain current topics of contention.
similar_doc_ind [19, 370] 	Summary: beach boys and the satan: a totally off and enjoyable movie. moves fast, good characters, sense of humor, and good music
similar_doc_ind [20, 809] 	Summary: this book is the second-best book on toefl after educational testing services' (ets) toefl book. these
similar_doc_ind [21, 534] 	Summary: sea runners will etch the coastline of british columbia in your mind. caribou hunting by larry bart
similar_doc_ind [22, 683] 	Summary: a must have if you were recently diagnosed with partial seizures..
similar_doc_ind [23, 366] 	Summary: under the tuscan sun was such an enjoyable movie. keanu reeves and charlize theron are
similar_doc_ind [24, 641] 	Summary: while the look is dated it is wonderful

KeyboardInterrupt: ignored

Saving output

In [None]:
# df_output_1000 = train_sentences[:1001].copy().to_frame()
# df_output_1000['annon_sents'] = annon_sents_1000
import pandas as pd

df_output_1000 = pd.DataFrame(train_sentences[:1001])
df_output_1000['annon_sents'] = annon_sents_1000
# Adding the neighbor list
df_output_1000['neigbors'] = None
for k_neighbors in neighbor_list_1000:
    for n in k_neighbors:
        df_output_1000.loc[n, 'neigbors'] = str(k_neighbors)

df_output_1000.to_csv('df_output_1000.csv', index=False)

## Output examples

### Examples after running

In [None]:
df_output_1000

In [None]:
# For running on 1000 documents
print_example([54, 750, 324], train_sentences, annon_sents_1000)

In [None]:
print_example([1, 619, 200], train_sentences, annon_sents_1000)

In [None]:
print_example([2, 526, 934], train_sentences, annon_sents_1000)

In [None]:
print_example([3, 367, 989], train_sentences, annon_sents_1000)

In [None]:
print_example([5, 518, 126], train_sentences, annon_sents_1000)

In [None]:
print_example([72, 712, 159], train_sentences, annon_sents_1000)

In [None]:
print_example([86, 349, 117], train_sentences, annon_sents_1000)

### Similar documents

#### k = 2

**[18, 579]**
* m. mead: everyons should read m. mead's research. her research explains many issues that remain current topics of contention.   
* good read very informitive: very good book , informitive content relates to the real world.buy it to believe it!   


**[20, 809]**
* second-best resource: this book is the second-best book on toefl after educational testing services' (ets) toefl book.
* extremely helpful!: these practice tests improved my score by 2-3 points. this is a great mcat resource!

**[26, 940]**
* the alchemist: sure this is an interesting book. unfortunately the copy we received was written in spanish!
* immensee: a beautiful story, but the translation from the original language (german) is unbelievably poor.

**[29, 992]**
* low budget: some good action and scenery, but otherwise pretty low budget movie. only recommended for die-hard horror fans.
* horrible movie!: please listen everybody this movie is terrible-don't bother ok!it is just dumb not scary!


**[35, 779]**
* america the beautiful: this book works great for teaching the song. the hardback cover is great for multiple uses.
* enjoyable music: bought this to use for our church's hawaiaan-harvest festival for children ages 4-12. great music!

**[44, 604]**
* invaluable: the authors have provided the quintessential study guide to the canterbury tales. this book is invaluable.
* magical: wonderful characters, stories within stories. this book offers beautiful writing, a deep spirituality and a happy ending!


#### k = 3

In [None]:
# [8, 410, 895]
print(train_sentences[8])
print(train_sentences[410])
print(train_sentences[895])

dont know!: the program looks fun but the software i recieved was corupt it wont load on any machine.
sad: initially i thought the software was difficult to use. then i realized that it was just unuseable.
out of this world: best product microsoft thas ever bought. superb. pity there isnt a linux version available!


**[0, 546, 933]**    
* textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
* mathbook: excellent condition of book. description is true to the condition of the mathbook.shipped quickly excellent seller.
* good deal!: used library book, but still in good condition. book came quickly and was cheap. overall good deal!

**[6, 312, 773]**
  * great: wishmaster is a fantastic album, that should grace everyone's music collection. innovating and always refreshing. truly a masterpiece.
  * fantastic debut: a great debut record from a band who deserve a lot of attention. buy this record twice.
  * great music, great arrangements and performance.: very satisfied with this cd. cem duruoz is a genius.

**[8, 410, 895]**
* dont know!: the program looks fun but the software i recieved was corupt it wont load on any machine.
* sad: initially i thought the software was difficult to use. then i realized that it was just unuseable.
* out of this world: best product microsoft thas ever bought. superb. pity there isnt a linux version available!

**[9, 136, 160]**
* bad cover condition: bad cover. half of the book is ruined because the spine has come unglued...false product description!
* beautiful books.: books arrived in good time and in excellent condition. the cover designs are wonderful.
* poor quality.: the seat covers started coming loose at the seams immediately upon putting them on the seats.

**[11, 802, 817]**
* action: lots of action, but totally unbelievable, just like watching wrestling. amusing enough to watch but not to own.
* 13 minutes of action crammed into 2 1/2 hours.....: formulaic. dull. insipid.one sore thumb down...(too much fast forwarding..)
* good adapation from best selling book.: thought this was an excellent movie. lots of action, adventure, and humor.

**[16, 30, 803]**
* expensive: 38.39 is a ridiculous price for 12 red bull. especially considering they cost 2.09 in a store.
* thomas the train tracks: this is a good product, but highly over-priced. $10 for four pieces of track ???????
* very dissapointing: these are soooo awful!!thin ,cheap and lightweight. not even worth the sale price.


Write a general sentence that best represents each of the following sentences and is true to all of them.

Sentences:
- the alchemist: sure this is an interesting book. unfortunately the copy we received was written in spanish!
- immensee: a beautiful story, but the translation from the original language (german) is unbelievably poor.

Result:
Good book, problem with the translation.

Sentences:
- low budget: some good action and scenery, but otherwise pretty low budget movie. only recommended for die-hard horror fans.
- horrible movie!: please listen everybody this movie is terrible-don't bother ok!it is just dumb not scary!

Result:
The movie was bad.

Sentences:
- america the beautiful: this book works great for teaching the song. the hardback cover is great for multiple uses.
- enjoyable music: bought this to use for our church's hawaiaan-harvest festival for children ages 4-12. great music!

Result:
Good and useful music.

Sentences:
 - invaluable: the authors have provided the quintessential study guide to the canterbury tales. this book is invaluable.
- magical: wonderful characters, stories within stories. this book offers beautiful writing, a deep spirituality and a happy ending!

Result:
Wonderful book and great writing.

Sentences:
- second-best resource: this book is the second-best book on toefl after educational testing services' (ets) toefl book.
- extremely helpful!: these practice tests improved my score by 2-3 points. this is a great mcat resource!

Result:
**Helpful resource, but not the best.**


m. mead: everyons should read m. mead's research. her research explains many issues that remain current topics of contention.
good read very informitive: very good book , informitive content relates to the real world.buy it to believe it!
**Informative book with relevant content.**








Sentences:
- textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
- mathbook: excellent condition of book. description is true to the condition of the mathbook.shipped quickly excellent seller.
- good deal!: used library book, but still in good condition. book came quickly and was cheap. overall good deal!

Result:
Good condition book, good delivery and good seller.

Sentences:
- great: wishmaster is a fantastic album, that should grace everyone's music collection. innovating and always refreshing. truly a masterpiece.
- fantastic debut: a great debut record from a band who deserve a lot of attention. buy this record twice.
- great music, great arrangements and performance.: very satisfied with this cd. cem duruoz is a genius.

Result:
**Great music, highly recommended.**

## Test anonymity

### Define functions

In [None]:
# Credit: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial 

import spacy

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Defining the document
    doc = nlp(doc) 

    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    #if len(txt) > 2:
    #    return ' '.join(txt)
    clean_doc = ' '.join(txt)
    return clean_doc

In [None]:
import re

def clean_corpus(corpus):
    """ Cleans the corpus """
    brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in corpus)
    corpus_lemmas = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]
    return corpus_lemmas

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is defined only once
vectorizer = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')

def get_anonym_degree(docs, min_k = None):
    """ If K not given, returns the minimal current k and the corresponding documents.
        If k is given, return the documents with k or less neighbohrs  """
    
    # Lemmatizing the documents
    ldocs = clean_corpus(docs)

    # Vectorizing
    count_data = vectorizer.fit_transform(ldocs)
    count_data = count_data.toarray()
    # Converting any number larger than 1 into 1
    count_data[count_data > 1] = 1
    # Counting unique values
    uniq_arr, uniq_cnt = np.unique(count_data, axis=0, return_counts=True)
    if not min_k:
        min_k = min(uniq_cnt)
        # All the unique vectors
        un_anon = uniq_arr[uniq_cnt <= min_k]
    else:
        # All the unique vectors
        un_anon = uniq_arr[uniq_cnt < min_k]
        min_k = min(uniq_cnt) # For the return value

    # Getting the unique vectore indeces
    indeces_list = []
    for row in un_anon:
        # Get the similar rows
        similar_vals = np.where((count_data == (row)).all(axis=1))
        indeces_list.append(similar_vals[0].tolist())

    return min_k, indeces_list

### Running

In [None]:
get_anonym_degree(annon_sents)

In [None]:
get_anonym_degree(annon_sents, min_k=3)

### Another example for summary

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

In [None]:
# define the input sentences
input_text = "1: textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again. 2: recent purchase: the book i ordered was exactly as described and delivery was even faster than promised. 3: recommend this seller: i received this book on time and in excellent condition. i'd definitely recommend this seller."
#input_text = "disappointed: this is what happens when artists get too ambitious, ohh man i'm gonna miss the old nelly. curious, so i bought it: another trying hard wannabe dance diva. she should stick to singing ballads. tangerine dream is great.: i love tangerine dream, but this album is not my favorite. still worth owning though."
input_text = "1: janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended. thank you. 2: good bike, bad packing: when i received the bike, the packing board was broken.the bike quality is good. 3: great purchase: installed in seconds. great performance. excellent to eliminate phone line dependency."

# preprocess the input sentences
input_ids = tokenizer.encode("summarize:" + input_text, return_tensors="pt")

# generate the summary sentence
output_ids = model.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output)


**Input**:   
* janes all the worlds aircraft 1996-7: great to deal with. very quick delivery. very highly recommended. thank you.
* good bike, bad packing: when i received the bike, the packing board was broken.the bike quality is good.
* great purchase: installed in seconds. great performance. excellent to eliminate phone line dependency.

**Output**:   
"The reviews indicate positive experiences with quick delivery and great performance of the products, but with some issues in packaging."

____

**Input**:   
* textbook: book shipped quickly and was in excellent condition as stated. easy transaction would buy again
* recent purchase: the book i ordered was exactly as described and delivery was even faster than promised.
* recommend this seller: i recieved this book on time and in excellent condition. i'd definitely recommend this seller.

**Output**:   
"The reviews are positive and recommend the seller for their fast shipping and excellent condition of the books."
____

**Input**:   
* beach boys and the satan: a totally off and enjoyable movie. all brian wilson fans should see this movie.
* great: one of adams best films along side the wedding singer and mr. deeds. funny movie. worth buying
* solid follow up: very good movie. for children 7 and up...any younger they may not grasp the concept.

**Output**:   
"The reviews are positive and recommend the movies for their enjoyable and entertaining qualities."		

## GPT-Chat output