In [None]:
#pip install py_thesaurus

In [None]:
import os
import spacy
import random
import nltk 
from nltk.corpus import wordnet 
import en_core_web_sm
import re

##Synonym Replacement

Synonym replacement is a technique in which we replace a word by one of its synonyms. 

The 'syn_replacement' function uses NLTK library and checks the level of similarity (value between 0..1) between a word and its synonyms. Selection of a new word would be done across maximum similarity value(s).

2 Types of POS (Part of Speech) are used: Noun and Verb. POS takes in input as ’n’ or ‘v’ where n stands for noun and v for verbs. Noun and Verb are sorted out using Spacy. If the output of ‘wup_similarity’ is NULL then it passes on to Spacy for comparison of words.

In [13]:

def syn_replacement(word,synonyms,POS):
    """
    - Main aim of this function is to return the most similar word to the given input word.
    - Arguments:
    - word: specific word as a string (present in input data)
    - synonyms: List of all synonyms of a word.
    - (POS) Part of Speech: Noun or Verb.
    """
    max_temp = -1
    flag = 0
    for i in synonyms:
        try:
            # n denotes noun and v denotes verb
            w1 = wordnet.synset(word+'.'+POS+'.01') 
            w2 = wordnet.synset(i+'.'+POS+'.01')
            # check for the highest synomym similarity
            if(max_temp<w1.wup_similarity(w2)):
                max_temp=w1.wup_similarity(w2)
                temp_name = i
                flag =1
        except:
            f = 0
            
    if flag == 0: # If the output of wup_similarity is NULL then use Spacy for comparison of words
        max1 = -1.
        value = ' '
        nlp = en_core_web_sm.load()
        for i in synonyms:
            j=i.replace(' ', '')
            tokens = nlp(u''+j)
            token_main = nlp(u''+word)
            for token1 in token_main:
                if max1<float(token1.similarity(tokens)):
                    max1 = token1.similarity(tokens)
                    value = i
        max1 = -1.
        return value 
    else:
        return temp_name

###I use WordNet, a large linguistic database, to identify relevant synonyms

In [None]:
import nltk
nltk.download('wordnet')

def get_synonyms(word):
    """
    Get synonyms of a word
    """
    synonyms = set()
    
    # Synset from NLTK to look up words in WordNet
    # In WordNet, similar words are grouped into a set known as a Synset (short for Synonym-set)
    # The words in a Synset are known as Lemmas.
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    
    if word in synonyms:
        synonyms.remove(word)
    
    return list(synonyms)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


The main code steps: 
- Read text file from Google Colab drive folder and create a list of Text Files
- For each text file and each line in the text file, process with the following steps:


- Split the line (sentence) into words
- Count the total number of unique words and make a list of unique words
- Remove 1 and 2 letters word & numbers
- Make a list of Noun and Verb using spacy from the above word list
- Loop through all words
- Find out all the synonyms using the ‘get synonyms’ function and make a list
- Checking If a selected word is Noun or Verb and then passing to ‘syn_replacement’ function
- Replacing the word with the new most similar word
- Storing the new sentences in output file in Google Colab drive


In [16]:
import warnings
warnings.filterwarnings('ignore')
  
# Read text file from Google Colab drive folder and create a list of Text Files 
all_files = os.listdir("/content/drive/MyDrive/data/")
txt_files = filter(lambda x: x[-4:] == '.txt', all_files)

# For each text file and each line in the text file, process with the following steps

#...per file
for i in txt_files:
    textfile = i
    print("Input File: "+ textfile)
    print(" ")
    path = '/content/drive/MyDrive/data/'+textfile
    exists = os.path.isfile(path)
    if exists: 
        file_open = open(path,"r")
        text_lines = file_open.readlines()
        #...per line in a file
        for text in text_lines:
          output_text = text
          print("Sentence: "+text)
          # Split the line (sentence) into words
          words = text.split()

          # Count the total number of unique words and make a list of unique words
          counts = {}
          for word in words:
              if word not in counts:
                  counts[word] = 0
              counts[word] += 1
          
          # Remove 1 and 2 letters word & numbers.
          one_word = []
          for key, value in counts.items():
              if value == 1 and key.isalpha() and len(key)>2:
                  one_word.append(key)
          
          # Make a list of Noun and Verb using spacy from the above word list
          noun = []
          verb = []
          # nlp refers to the language model loaded by en_core_web_sm.
          nlp = spacy.load('en_core_web_sm')
          doc = nlp(u''+' '.join(one_word))
          for token in doc:
              if  token.pos_ == 'VERB':
                  verb.append(token.text)
              if  token.pos_ == 'NOUN':
                  noun.append(token.text)
              
          all_main =verb + noun
          len_all = len(noun)+len(verb)
          f_output = open('/content/drive/MyDrive/data/'+'output_'+textfile+'file', "a")
          for i in range(len_all):
            word_str = all_main[i]
            
            # Find out all the synonyms of a word
            synonyms = get_synonyms(word_str)
            
            # Replacing the word with the new most similar words
            if i<len(verb):
                change_word=syn_replacement(word_str,synonyms,'v')
                try:
                    search_word = re.search(r'\b('+word_str+r')\b', output_text)
                    Loc = search_word.start()
                    output_text = output_text[:int(Loc)] + change_word + output_text[int(Loc) + len(word_str):]
                except:
                    f=0

            else:
                change_word=syn_replacement(word_str,synonyms,'n')
                try:
                    search_word = re.search(r'\b('+word_str+r')\b', output_text)
                    Loc = search_word.start()
                    output_text = output_text[:int(Loc)] + change_word + output_text[int(Loc) + len(word_str):]
                except:
                    f=0
            
            # Storing the new sentences in output file
            f_output.write(str(output_text))
            print(output_text)

Input File: sample2.txt
 
Sentence: data can leave customer premises

data displace leave customer premises

data displace lead customer premises

information displace lead customer premises

information displace lead client premises

information displace lead client premiss

Sentence: data has to stay in customer premises

data has to remain in customer premises

information has to remain in customer premises

information has to remain in client premises

information has to remain in client premiss

Sentence: It is not allowed to export the data

It is not let to export the data

It is not let to exportation the data

It is not let to exportation the information

Sentence: exported data should be first be encrypted

export data should be first be encrypted

export data   be first be encrypted

export data   be first be write in code

export information   be first be write in code

Sentence: exported data should be encrypted at rest and in transit

export data should be encrypted at re