In [4]:
import numpy as np
import pandas as pd
import os               
import json

https://towardsdatascience.com/text-tiling-done-right-building-solid-foundations-for-your-personal-llm-e70947779ac1

https://medium.com/msackiit/what-is-text-similarity-and-how-to-implement-it-c74c8b641883

In [5]:
# Relative path
path_to_data = 'rssDevData/'

# Names of 5 blog categories to import 
blogs = ['DavidWalsh','DeveloperDotCom','DZone','GeeksForGeeks','SCAND','SDTimes']

# Identify key to blog text in JSON file
key = 'text'

# Initialize empty data frame
text_data = pd.DataFrame()

#Initialize counter to keep track of files that fail
files_not_read = 0

category_sizes = [] # To store number of blog articles by category
labels = [] # To store "true" labels ->  [0, 1, 2, 3, 4]
label = 0

for blog in blogs :
    current_blog_category_count = 0
    
    path_to_blogs = path_to_data + blog 
    
    for root, dir, files in os.walk(path_to_blogs) :
        # get list of only json files
        json_files = [pos_json for pos_json in files if pos_json.endswith('.json')]

        for j in json_files :
            with open(root + '/' + j, 'r') as f:
                try:
                    data = json.load(f)
                    if len(data[key]) > 200 :
                        try : # fails when there is more than one json object in the file 
                            text_data = pd.concat([text_data, pd.DataFrame(data, index=[0])], ignore_index=True) 
                            current_blog_category_count += 1 
                            labels.append(label) 
                        except : 
                            files_not_read += 1
                except:
                    print(j)
print('Files not read in: ', str(files_not_read))
print('Files read in: ', str(len(text_data)))
print(text_data.head())

5-web-design-trends-.json
locate-empty-directo.json
overview-of-spiral-s.json
13-code-quality-metr.json
can-value-stream-man.json
challenges-and-check.json
competition-of-the-m.json
connect-memphis-as-a.json
correlations-made-ea.json
creating-crap-faster.json
Files not read in:  0
Files read in:  1050
                                     header_title  \
0   Convert Fahrenheit to Celsius with JavaScript   
1     Create a Thumbnail From a Video with ffmpeg   
2                      CSS ::file-selector-button   
3                Customizing HTML Form Validation   
4  Detect Browser Bars Visibility with JavaScript   

                              date  \
0  Wed, 26 Oct 2022 10:19:49 +0000   
1  Tue, 25 Oct 2022 09:28:58 +0000   
2  Mon, 20 Feb 2023 09:50:57 +0000   
3  Mon, 09 Jan 2023 10:57:00 +0000   
4  Fri, 30 Dec 2022 01:36:35 +0000   

                                                text  \
0  The United States is one of the last bodies th...   
1  Creating a thumbnail to represent 

https://spotintelligence.com/2022/12/19/text-similarity-python/

## 1. Text similarity with NLTK (Lexical)

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity_NLTK(text):
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Tokenize and lemmatize the sentences
    lemmatizer = WordNetLemmatizer()
    tokenized_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokenized_sentences.append(tokens)

    # Remove stopwords
    stop_words = stopwords.words('english')
    filtered_sentences = []
    for tokens in tokenized_sentences:
        filtered_sentence = [token for token in tokens if token not in stop_words]
        filtered_sentences.append(filtered_sentence)

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_vectors = vectorizer.fit_transform([' '.join(sentence) for sentence in filtered_sentences])

    # Calculate the cosine similarity for each pair of sentences
    similarity_scores = cosine_similarity(tfidf_vectors)

    # Store the similarity scores along with the sentence indices
    similarity_results = []
    num_sentences = len(sentences)
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity_score = similarity_scores[i][j]
            similarity_results.append((i, j, round(similarity_score,3)))

    return similarity_results


In [7]:
example_text = "This is an example text. It consists of multiple sentences. The Text Tiling method is used to segment it into coherent sections. Each section represents a different topic. The boundaries are identified based on shifts in vocabulary and lexical scores."
sentence_similarity_NLTK(example_text)

[(0, 1, 0.0),
 (0, 2, 0.152),
 (0, 3, 0.0),
 (0, 4, 0.0),
 (1, 2, 0.0),
 (1, 3, 0.0),
 (1, 4, 0.0),
 (2, 3, 0.114),
 (2, 4, 0.089),
 (3, 4, 0.0)]

## NLTK Modified

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity_NLTK_m(text, max_df=1.0, min_df=1, ngram_range=(1, 1), use_idf=True, smooth_idf=True, sublinear_tf=False):
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Tokenize and lemmatize the sentences
    lemmatizer = WordNetLemmatizer()
    tokenized_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokenized_sentences.append(tokens)

    # Remove stopwords
    stop_words = stopwords.words('english')
    filtered_sentences = []
    for tokens in tokenized_sentences:
        filtered_sentence = [token for token in tokens if token not in stop_words]
        filtered_sentences.append(filtered_sentence)

    # Create the TF-IDF vectors with custom hyperparameters
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    tfidf_vectors = vectorizer.fit_transform([' '.join(sentence) for sentence in filtered_sentences])

    # Calculate the cosine similarity for each pair of sentences
    similarity_scores = cosine_similarity(tfidf_vectors)

    # Store the similarity scores along with the sentence indices
    similarity_results = []
    num_sentences = len(sentences)
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity_score = similarity_scores[i][j]
            similarity_results.append((i, j, round(similarity_score, 3)))

    return similarity_results


In [9]:
sentence_similarity_NLTK_m(example_text)

[(0, 1, 0.0),
 (0, 2, 0.152),
 (0, 3, 0.0),
 (0, 4, 0.0),
 (1, 2, 0.0),
 (1, 3, 0.0),
 (1, 4, 0.0),
 (2, 3, 0.114),
 (2, 4, 0.089),
 (3, 4, 0.0)]

## 2. Text similarity with Scikit-Learn (Lexical)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def sentence_similarity_Scikit_Learn(text):
    # Split the text into sentences
    sentences = re.split(r'[.!?]', text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip() != ""]

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_vectors = vectorizer.fit_transform(sentences)

    # Calculate the cosine similarity for each pair of sentences
    similarity_scores = cosine_similarity(tfidf_vectors)

#     return similarity_scores

    # Store the similarity scores along with the sentence indices
    similarity_results = []
    num_sentences = len(sentences)
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity_score = similarity_scores[i][j]
            similarity_results.append((i, j, round(similarity_score,3)))

    return similarity_results


In [21]:
sentence_similarity_Scikit_Learn(example_text)

[(0, 1, 0.0),
 (0, 2, 0.196),
 (0, 3, 0.0),
 (0, 4, 0.0),
 (1, 2, 0.0),
 (1, 3, 0.0),
 (1, 4, 0.0),
 (2, 3, 0.0),
 (2, 4, 0.0),
 (3, 4, 0.0)]

In [22]:
# sentence_similarity_Scikit_Learn(text_data['text'][0])

## 3.Text similarity with BERT (Semantic)

In [23]:
import transformers
import numpy as np
import torch

def sentence_similarity_BERT(text):
    # Load the BERT model and tokenizer
    model = transformers.BertModel.from_pretrained('bert-base-uncased')
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    # Split the text into sentences
    sentences = text.split('.')

    # Remove empty sentences
    sentences = [sentence.strip() for sentence in sentences if sentence.strip() != ""]

    # Encode each sentence to obtain embeddings
    embeddings = []
    for sentence in sentences:
        tokens = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
        with torch.no_grad():
            output = model(**tokens)  # Pass the tokens as a dictionary
            encoding = output.last_hidden_state.mean(dim=1).squeeze(0)  # Calculate sentence embeddings using the mean pooling
        embeddings.append(encoding)

    # Calculate the cosine similarity for each pair of sentences
    similarity_scores = np.zeros((len(embeddings), len(embeddings)))
    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            similarity_scores[i][j] = np.dot(embeddings[i], embeddings[j]) / (np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[j]))
            similarity_scores[j][i] = similarity_scores[i][j]  # As cosine similarity is symmetric

    #     return similarity_scores

    # Store the similarity scores along with the sentence indices
    similarity_results = []
    num_sentences = len(sentences)
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity_score = similarity_scores[i][j]
            similarity_results.append((i, j, round(similarity_score,3)))

    return similarity_results

In [24]:
sentence_similarity_BERT(example_text)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[(0, 1, 0.735),
 (0, 2, 0.767),
 (0, 3, 0.792),
 (0, 4, 0.676),
 (1, 2, 0.845),
 (1, 3, 0.847),
 (1, 4, 0.818),
 (2, 3, 0.844),
 (2, 4, 0.838),
 (3, 4, 0.822)]

In [25]:
# sentence_similarity_BERT(text_data['text'][0])

## 4. Text similarity with RoBERTa (Semantic)

In [26]:
import transformers
import numpy as np
import torch

def sentence_similarity_RoBERTa(text):
    # Load the RoBERTa model and tokenizer
    model = transformers.RobertaModel.from_pretrained('roberta-base')
    tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')

    # Split the text into sentences
    sentences = text.split('.')

    # Remove empty sentences
    sentences = [sentence.strip() for sentence in sentences if sentence.strip() != ""]

    # Encode each sentence to obtain embeddings
    embeddings = []
    for sentence in sentences:
        tokens = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
        with torch.no_grad():
            output = model(**tokens)  # Pass the tokens as a dictionary
            encoding = output.last_hidden_state.mean(dim=1).squeeze(0)  # Calculate sentence embeddings using the mean pooling
        embeddings.append(encoding)

    # Calculate the cosine similarity for each pair of sentences
    similarity_scores = np.zeros((len(embeddings), len(embeddings)))
    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            similarity_scores[i][j] = np.dot(embeddings[i], embeddings[j]) / (np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[j]))
            similarity_scores[j][i] = similarity_scores[i][j]  # As cosine similarity is symmetric

#     return similarity_scores

    # Store the similarity scores along with the sentence indices
    similarity_results = []
    num_sentences = len(sentences)
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity_score = similarity_scores[i][j]
            similarity_results.append((i, j, round(similarity_score,3)))

    return similarity_results




In [27]:
sentence_similarity_RoBERTa(example_text)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[(0, 1, 0.969),
 (0, 2, 0.791),
 (0, 3, 0.886),
 (0, 4, 0.846),
 (1, 2, 0.811),
 (1, 3, 0.919),
 (1, 4, 0.882),
 (2, 3, 0.94),
 (2, 4, 0.95),
 (3, 4, 0.986)]

In [28]:
# sentence_similarity_RoBERTa(text_data['text'][0])

In [29]:
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# sentences = [
#     'the person wear red T-shirt',
#     'this person is walking',
#     'the boy wear red T-shirt'
#     ]
# sentence_embeddings = model.encode(sentences)

# for sentence, embedding in zip(sentences, sentence_embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

## 5. Text similarity with Sentence Transformers (Semantic)

https://www.sbert.net/docs/usage/semantic_textual_similarity.html

https://stackoverflow.com/questions/65199011/is-there-a-way-to-check-similarity-between-two-full-sentences-in-python

In [33]:
import nltk
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Download the nltk sentence tokenizer data
nltk.download('punkt')

def sentence_similarities_Transformers(text):
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Initialize the SentenceTransformer model
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')

    # Encode the sentences into embeddings
    sentence_embeddings = model.encode(sentences)

    # Calculate the cosine similarity for each pair of sentences
    similarity_results = []
    num_sentences = len(sentences)
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity = cosine_similarity(sentence_embeddings[i].reshape(1, -1), sentence_embeddings[j].reshape(1, -1))
            similarity_score = round(similarity[0][0], 3)
            similarity_results.append((i, j, similarity_score))

    return similarity_results

sentence_similarities_Transformers(example_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\msalehi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[(0, 1, 0.668),
 (0, 2, 0.604),
 (0, 3, 0.627),
 (0, 4, 0.482),
 (1, 2, 0.662),
 (1, 3, 0.784),
 (1, 4, 0.649),
 (2, 3, 0.595),
 (2, 4, 0.635),
 (3, 4, 0.632)]

In [None]:
# sentence_similarities_Transformers(text_data['text'][0])

## 6. Text similarity with TFHub Universal Sentence Encoder (Semantic)

https://tfhub.dev/google/universal-sentence-encoder/4

In [34]:
import tensorflow_hub as hub
from scipy.spatial.distance import cosine

def sentence_similarities_USE(text):
    # Load the Universal Sentence Encoder model
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    # Tokenize the text into sentences
    sentences = text.split('.')  # Split the text by periods to get sentences

    # Remove empty sentences resulting from the split
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    # Get the sentence embeddings
    embeddings = embed(sentences)

    # Calculate the similarity for each pair of sentences
    similarity_results = []
    num_sentences = len(sentences)
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity = 1.0 - cosine(embeddings[i], embeddings[j])
            similarity_results.append((i, j, round(similarity, 3)))

    return similarity_results

In [37]:
# Example usage
sentence_similarities_USE(example_text)


[(0, 1, 0.341),
 (0, 2, 0.257),
 (0, 3, 0.233),
 (0, 4, 0.104),
 (1, 2, 0.177),
 (1, 3, 0.29),
 (1, 4, 0.177),
 (2, 3, 0.177),
 (2, 4, 0.222),
 (3, 4, -0.032)]

In [38]:
# sentence_similarities_USE(text_data['text'][0])