In [3]:
!pip install flair



In [4]:
from flair.data import Sentence
from flair.models import SequenceTagger

# POS-tag an input sentence with flair's pretrained English model.
def get_Pos_tags_from_sentence(input_sentence):
  """Return the POS tag of every token in ``input_sentence``.

  Args:
      input_sentence: Raw sentence text; it is wrapped in a flair ``Sentence``.

  Returns:
      A list with one tag per token, in token order.
  """
  # The "flair/pos-english" tagger is loaded on every call.
  pos_model = SequenceTagger.load("flair/pos-english")

  # Wrap the raw text and let the tagger annotate it in place.
  tagged_sentence = Sentence(input_sentence)
  pos_model.predict(tagged_sentence)

  # Iterating the sentence yields its tokens; keep only each token's tag.
  return [tok.tag for tok in tagged_sentence]

In [5]:
# `nltk` must be imported in this cell: the notebook's main import cell comes
# later, so on a fresh kernel this cell would otherwise fail with NameError.
import nltk

# Download NLTK data:
#   - 'wordnet': provides the synsets used to look up synonyms of a word
#   - 'punkt': tokenizer data (presumably what nltk.word_tokenize relies on)
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import wordnet
from itertools import product
import random
import math
import tensorflow as tf
import tensorflow_hub as hub
from heapq import nlargest
from tqdm import tqdm
import numpy as np

# Fraction of all possible synonym combinations that is sampled during generation
percent_random = 0.4
# Upper limit on how many combinations are sampled (i.e. paraphrases generated)
upper_bound = 4000
# Number of top-scoring paraphrases to return
top_k_results = 10


# Pick the indices of the combinations that will be turned into paraphrases.
def generate_unique_random_numbers(n, fraction=None, max_count=None):
    """Draw distinct random indices into a collection of ``n`` items.

    Args:
        n: Number of items available; valid indices are ``0 .. n - 1``.
        fraction: Share of ``n`` to draw. Defaults to the module-level
            ``percent_random`` when omitted.
        max_count: Hard cap on how many indices are drawn. Defaults to the
            module-level ``upper_bound`` when omitted.

    Returns:
        A list of distinct integers from ``range(n)`` in random order. Its
        length is ``ceil(fraction * n)``, capped by ``max_count`` and by ``n``.
        The list is empty when ``n`` is 0.

    Raises:
        ValueError: Propagated from ``random.sample`` if the resulting count
            is negative (e.g. a negative ``n``).
    """
    if fraction is None:
        fraction = percent_random
    if max_count is None:
        max_count = upper_bound

    # ceil() so a small, non-empty input still yields at least one index.
    # The extra cap at n keeps random.sample valid if fraction exceeds 1.
    count = min(math.ceil(fraction * n), max_count, n)

    # Sample from range(n), not range(n + 1): index n does not exist, so
    # drawing it would silently waste one of the selected slots.
    return random.sample(range(n), count)

# Collect the WordNet synonyms of a single word.
def get_synonyms(word):
    """Return the distinct lemma names found in the WordNet synsets of ``word``.

    Lemma names that are exactly equal to ``word`` (case-sensitive comparison)
    are left out. Duplicates are removed through a set, so the order of the
    returned list is not specified.
    """
    lemma_names = {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
        if lemma.name() != word
    }
    return list(lemma_names)

# Build paraphrase candidates by substituting synonyms at the chosen token positions.
def generate_paraphrases(sentence,indices_of_choosen_tags):
    """Generate a random sample of synonym-substituted variants of ``sentence``.

    The sentence is tokenized with ``nltk.word_tokenize``. Every token whose
    position is in ``indices_of_choosen_tags`` and that has at least one
    synonym is replaced by its synonym list; all other tokens stay fixed.
    A random subset of the cartesian product of those per-token options
    (chosen by ``generate_unique_random_numbers``) is joined back into
    space-separated sentences.

    Args:
        sentence: The input sentence text.
        indices_of_choosen_tags: Iterable of token positions eligible for
            synonym substitution.
            NOTE(review): positions are matched against the
            ``nltk.word_tokenize`` tokens of ``sentence``; callers should make
            sure their indices refer to the same tokenization - confirm.

    Returns:
        A list of paraphrase strings, in cartesian-product order.
    """
    tokens = nltk.word_tokenize(sentence)
    print(tokens)

    # Look synonyms up once per token, and only for the chosen positions;
    # a token without usable synonyms falls back to itself.
    chosen_positions = set(indices_of_choosen_tags)
    synonyms_per_word = []
    for index, word in enumerate(tokens):
        candidates = get_synonyms(word) if index in chosen_positions else []
        synonyms_per_word.append(candidates or [word])
    print("Synonyms : ",synonyms_per_word)

    # Every option list is non-empty, so the size of the cartesian product is
    # the product of the list lengths - no need to materialize it to count it.
    total_combintions = 1
    for options in synonyms_per_word:
        total_combintions *= len(options)
    print("Total Combinations : ",total_combintions)

    unique_random_numbers = generate_unique_random_numbers(total_combintions)
    # Set membership keeps the per-combination check O(1).
    selected_indices = set(unique_random_numbers)

    print(f"Considering {len(unique_random_numbers)} peraphrases out of {total_combintions}")

    paraphrases = []
    # The loop walks all combinations, so that is the progress bar's total.
    for index, combination in tqdm(enumerate(product(*synonyms_per_word)), total=total_combintions, desc="Peraphrase Generation : "):
        # Keep the combination only if its index was randomly selected.
        if index in selected_indices:
            paraphrases.append(' '.join(combination))
    return paraphrases

# Rank paraphrases by their semantic similarity to the input sentence.
def compare_to_input(sentence, paraphrases):
    """Return the paraphrases most similar to ``sentence``.

    Each paraphrase is scored with ``semantic_similarity(sentence, paraphrase)``
    and the ``top_k_results`` highest-scoring ones are returned.

    Args:
        sentence: The original input sentence.
        paraphrases: Sized collection of candidate paraphrase strings.

    Returns:
        A list of at most ``top_k_results`` paraphrase strings, best first.
    """
    paraphrase_scores = []

    # The similarity works on the raw strings, so no tokenization is needed.
    for paraphrase in tqdm(paraphrases,total = len(paraphrases) , desc = "Comparision"):
        similarity_score = semantic_similarity(sentence, paraphrase)
        paraphrase_scores.append((paraphrase, similarity_score))

    # Get the top k paraphrases based on similarity score
    top_paraphrases = nlargest(top_k_results, paraphrase_scores, key=lambda x: x[1])

    return [paraphrase for paraphrase, _score in top_paraphrases]


# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub once, when
# this cell runs; semantic_similarity() calls `embed` to embed sentence pairs.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def semantic_similarity(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Both sentences are embedded together in one call to the module-level
    ``embed`` model; the result is the dot product of the two embeddings
    divided by the product of their norms.
    """
    # Embed both sentences in a single call
    vectors = embed([sentence1, sentence2])
    first_vec = vectors[0]
    second_vec = vectors[1]

    # Cosine similarity = dot product / product of the vector norms
    dot_product = np.dot(first_vec, second_vec)
    norm_product = np.linalg.norm(first_vec) * np.linalg.norm(second_vec)

    return dot_product / norm_product

# Select the token positions whose POS tag is eligible for synonym lookup.
def indices_of_choosen_pos(inp_tag_list):
  """Return the positions in ``inp_tag_list`` that carry a chosen POS tag.

  Args:
      inp_tag_list: Sequence of POS tag strings, one per token.

  Returns:
      A list of the indices, in ascending order, whose tag is one of the
      chosen noun, verb, adjective or interjection tags listed below.
  """
  choosen_tags = (
      "NN", "NNS", "NNP", "NNPS",
      "VB", "VBD", "VBG", "VBN", "VBP",
      "JJ", "JJR", "JJS",
      "UH",
  )

  return [position for position, tag in enumerate(inp_tag_list) if tag in choosen_tags]

In [27]:
# Read the sentence to paraphrase from the user.
input_sentence = input("Enter a sentence: ")
# POS-tag the sentence and show the tags.
pos_tags_of_sentence = get_Pos_tags_from_sentence(input_sentence)
print(pos_tags_of_sentence)
# Positions whose tag is eligible for synonym substitution.
indices_of_choosen_tags = indices_of_choosen_pos(pos_tags_of_sentence)
# Generate candidate paraphrases, then keep those most similar to the input.
paraphrases = generate_paraphrases(input_sentence,indices_of_choosen_tags)
best_paraphrase = compare_to_input(input_sentence, paraphrases)
print()
print("Top Peraphrases : ")
# Print the selected paraphrases, one per line, in ranked order.
for pera in best_paraphrase:
  print(pera)

Enter a sentence: Work hard study hard and push your limits
2024-04-04 20:27:20,530 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD
['VB', 'RB', 'VB', 'RB', 'CC', 'VB', 'PRP$', 'NNS']
['Work', 'hard', 'study', 'hard', 'and', 'push', 'your', 'limits']
Synonyms :  [['oeuvre', 'do_work', 'piece_of_work', 'forge', 'make', 'solve', 'form', 'exploit', 'study', 'act', 'ferment', 'exercise', 'play', 'wreak', 'go', 'cultivate', 'function', 'work', 'make_for', 'puzzle_out', 'run', 'workplace', 'body_of_work', 'work_out', 'employment', 'mould', 'turn', 'sour', 'influence', 'act_upon', 'figure_out', 'process', 'work_on', 'crop', 'bring', 'knead', 'mold', 'operate', 'shape', 'put_to_work', 'lick'], ['hard'], ['field', 'report', 'consider', 'analyse', 'subject_field', 'cog

Peraphrase Generation : : 452025it [00:21, 21013.97it/s]
Comparision: 100%|██████████| 4000/4000 [00:26<00:00, 150.21it/s]


Top Peraphrases : 
study hard work hard and pushing your limit
act hard work hard and pushing your limit
work hard field_of_study hard and force your limit
study hard work hard and thrust your demarcation_line
work hard take hard and drive your specify
work hard consider hard and fight your limitation
study hard take hard and pushing your restrict
work hard work hard and force your point_of_accumulation
puzzle_out hard work hard and drive your limit
work hard hit_the_books hard and drive your limit



