In [1]:
!git clone https://github.com/facebookresearch/muss.git
%cd muss/
!pip install -e .   # Install package
!python -m spacy download en_core_web_md 

!pip uninstall fastbpe -y
!pip3 install fastbpe

!python -m spacy download en_core_web_md
!python -m spacy download en_core_web_sm

!pip install -U sentence-transformers

Cloning into 'muss'...
remote: Enumerating objects: 395, done.[K
remote: Counting objects: 100% (395/395), done.[K
remote: Compressing objects: 100% (238/238), done.[K
remote: Total 395 (delta 188), reused 354 (delta 155), pack-reused 0[K
Receiving objects: 100% (395/395), 5.38 MiB | 11.31 MiB/s, done.
Resolving deltas: 100% (188/188), done.
/kaggle/working/muss
Obtaining file:///kaggle/working/muss
  Preparing metadata (setup.py) ... [?25l- \ done
[?25hCollecting easse@ git+git://github.com/feralvam/easse.git
  Cloning git://github.com/feralvam/easse.git to /tmp/pip-install-04lask0n/easse_e5ec217b1ebc42bbac3b0a52138fdb56
  Running command git clone --filter=blob:none -q git://github.com/feralvam/easse.git /tmp/pip-install-04lask0n/easse_e5ec217b1ebc42bbac3b0a52138fdb56
  Resolved git://github.com/feralvam/easse.git to commit 9f3351917b751a1e51aeca9064046cda49feaf0a
  Preparing metadata (setup.py) ... [?25l- \ done
[?25hCollecting kenlm@ git+git://github.c

## Sentence simplification mining

<font size="4"> The sentence simplification procedure goes over three main steps</font> 
                                                                       
    
1. Tokenizing sentences using NLTK (separating sentence).
2. Filtering out: very long sentences, very short sentences, sentences with a lot of punctuation and poorly formatted sentences. This will reduce the noise in the dataset fed to the model. 

3. Embedding the sentences using SBERT models (https://www.sbert.net/). SBERT models are BERT model trained to generate sentence embeddings that are close when two sentences are close in meaning. It was trained using a Siamese network ([Wikipedia](https://en.wikipedia.org/wiki/Siamese_neural_network)).

4. Used FAISS (https://github.com/facebookresearch/faiss) facebook library to get the top_k closest sentences to each sentence. FAISS is a library developed by facebook for this type of tasks: optimized similarity search using embeddings. It is written in C++, and designed to work even if the tensors do no fit into ram.
    
5. Filtering out distant sentences (sentences with a l2 distance), and pairing up sentences that are close. The threshold for this is `12`. Note that this threshold is dependent mainly on the embedding model (`paraphrase-MiniLM-L6-v2` for this case), and if the model changes, you'll need to find a good new threshold value.
    
6. Some sentences were paired because they were duplicates, some others were paired because they overlaped (one sentences is contained in the other one). For that reason, we removed duplicates and overlapping sentences. We also used the [levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) (minmum number of changes to get from one sentence to the other) to filter really similar sentences.
    
7. Exporting the sentences as csv.
    
                                                                                                                                                          

In [2]:
import faiss
import numpy as np
from string import punctuation

import pandas as pd

from muss.mining.filtering import (
    is_contained,
    is_overlapping,
    is_different_enough
)

from muss.mining.filtering import SimplicityScorer

## The dataset

<font size="4">Blog posts corpus containing over 600k blog posts scraped from the internet in 2004. We are using it to mine for simplified paraphrases.</font>
<font size="4">More on the dataset: https://www.kaggle.com/rtatman/blog-authorship-corpus </font>

In [3]:
#Choose any articles as you want too
df = pd.read_csv("/kaggle/input/blog-authorship-corpus/blogtext.csv").iloc[:25000]
text_dump = ". ".join(df.text.tolist())

In [4]:
#The sentence embedding model: https://www.sbert.net/docs/pretrained_models.html
embedding_model_name = "paraphrase-MiniLM-L6-v2"

#The threshold that we're using to eliminate semantically distant sentences
similarity_threshold = 13
language = "en"
top_k = 8

In [5]:
def has_too_much_punctuation(text):
    """
    Return True if the sentence has more than 10% punctuation 
    """
    characters = text.replace(' ', '')
    punctuation_count = len([c for c in characters if c in punctuation])
    return punctuation_count / len(characters) > 0.1

## Tokenization and and filtering

In [6]:
import nltk
from tqdm import tqdm

#Separate into sentence, clean sentences, tokenize (using NLTK tokenizer), then filter out short sentences, or sentences with too much punctuation

sentences = []

for ids, article in tqdm(df[["id", "text"]].values):
    
    sent = nltk.sent_tokenize(article.strip())
    sent = [i.strip() for i in sent if i != "Poorly formatted and corrupted."]
    
    sent = list(filter(lambda sent: len(sent) >= 30, sent))
    sent = list(filter(lambda sent: len(sent) < 300, sent))
    sent = list(filter(lambda sent: not has_too_much_punctuation(sent), sent))
    
    sentences.extend([(ids, i) for i in sent])

100%|██████████| 25000/25000 [00:15<00:00, 1615.79it/s]


## Embedding sentences

In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(embedding_model_name)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
embeddings = model.encode([i[1] for i in sentences])

Batches:   0%|          | 0/7010 [00:00<?, ?it/s]

## Get nearest top-k sentences

In [9]:
def get_nearest_sentence_ids(embeddings, top_k):
    """
    Use FAISS to get the top-k the nearest sentences
    """
    
    index = faiss.IndexFlatL2(embeddings.shape[1]) 
    index.add(embeddings)                 

    all_distances, all_sentence_ids = index.search(embeddings, top_k)


    if np.all(np.diff(all_distances) <= 0):
            # This is taylored for transforming cosine similarity into a pseudo-distance: the maximum cosine similarity is 1 (vectors are equal).
            # Hence distance = 1 - cosine will always be positive and will be be equal to 0 when vectors are equal.
            all_distances = 1 - all_distances

    return all_distances, all_sentence_ids.astype(int)

In [10]:
distances, ids = get_nearest_sentence_ids(embeddings, top_k)
sentences = np.array(sentences)

## Filtering out distant sentences pairs

In [11]:
pairs = []

for i, distance in enumerate(distances):
    
    if (distance[1:] < similarity_threshold).sum() > 0:  
        
        close_sentences = sentences[ids[i]]
        j = 1
        
        for close_sentence in close_sentences[1:]:
            
            if close_sentences[0][0] != close_sentence[0]:
                if ids[i][0] > ids[i][j]:
                    pairs.append((close_sentences[0][1].lower(), close_sentences[j][1].lower()))
                else:
                    pairs.append((close_sentences[j][1].lower(), close_sentences[0][1].lower()))
                    
            j += 1
            
            
#Removing duplicate pairs
pairs = list(set(pairs))

## Removing duplicates, overlapping sentences, and sentences with a small levenshtein ratio

In [12]:
pairs = list(filter(lambda x: not is_contained(x[0], x[1]), pairs))
pairs = list(filter(lambda x: not is_overlapping(x[0], x[1]), pairs))
pairs = list(filter(lambda x: is_different_enough(x[0], x[1]), pairs))

In [13]:
pairs = pd.DataFrame(pairs)

In [14]:
pairs

Unnamed: 0,0,1
0,"today she was online, and we chatted for a whi...",i talked to her for quite a while online.
1,so now i'll just leave you in suspence forever.,"if you want to save me, i'll be endebted to yo..."
2,"as to what exactly happened, i'll just keep it...",perhaps i don't really need to be fixed or cur...
3,"it saddens me, though, that the general public...","as it will turn out, trying to understand the ..."
4,lol well thats pretty much all that happened y...,"anyway, that was about it fer today!"
...,...,...
51421,i've added a blogroll over to the side with so...,"oh, by the way, i have added several links to ..."
51422,you are only coming through in waves.,"really weak waves with miserable rides, but yo..."
51423,i don't really feel that i have anything i wan...,"but since i have this blog, i feel the overwhe..."
51424,"other than this, my day was pretty uneventful.",today was a rather uneventful.


In [15]:
pairs.to_csv("pairs.csv", index=False)