In [34]:
import re
import json
import numpy as np
import pandas as pd
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [35]:
corpus = []
with open('corpus.json', 'r') as f:
    corpus = json.load(f)

In [36]:
corpus[0]

'Groundwater    Groundwater is the word used to describe water that saturates the ground, filling all the available spaces. By far the most abundant type of groundwater is meteoric water; this is the groundwater that circulates as part of the water cycle. Ordinary meteoric water is water that has soaked into the ground from the surface, from precipitation (rain and snow) and from lakes and streams. There it remains, sometimes for long periods, before emerging at the surface again. At first thought it seems incredible that there can be enough space in the “solid” ground underfoot to hold all this water.  The necessary space is there, however, in many forms. The commonest spaces are those among the particles—sand grains and tiny pebbles—of loose, unconsolidated sand and gravel. Beds of this material, out of sight beneath the soil, are common. They are found wherever fast rivers carrying loads of coarse sediment once flowed. For example, as the great ice sheets that covered North America 

In [37]:
def prepare_text(raw_text):
    punctuation_removed = raw_text.translate(str.maketrans(punctuation, ' ' * len(punctuation), '')).strip()
    lowered = punctuation_removed.lower()
    no_number = re.sub(r'\d+', '', lowered)
    return no_number

In [38]:
normal_corpus = list(map(prepare_text, corpus))

In [39]:
normal_corpus[0]

'groundwater    groundwater is the word used to describe water that saturates the ground  filling all the available spaces  by far the most abundant type of groundwater is meteoric water  this is the groundwater that circulates as part of the water cycle  ordinary meteoric water is water that has soaked into the ground from the surface  from precipitation  rain and snow  and from lakes and streams  there it remains  sometimes for long periods  before emerging at the surface again  at first thought it seems incredible that there can be enough space in the “solid” ground underfoot to hold all this water   the necessary space is there  however  in many forms  the commonest spaces are those among the particles—sand grains and tiny pebbles—of loose  unconsolidated sand and gravel  beds of this material  out of sight beneath the soil  are common  they are found wherever fast rivers carrying loads of coarse sediment once flowed  for example  as the great ice sheets that covered north america 

In [40]:
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS

In [41]:
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(normal_corpus)

In [42]:
vectorizer.get_feature_names()

['abandon',
 'abandoned',
 'abandoning',
 'abbasid',
 'abbreviated',
 'abbreviation',
 'abbé',
 'abilities',
 'ability',
 'ablaze',
 'able',
 'aboard',
 'abolished',
 'abolishing',
 'abominable',
 'aboriginal',
 'aborigines',
 'abortive',
 'abound',
 'abounded',
 'aboveground',
 'abrabian',
 'abrasion',
 'abroad',
 'abrogating',
 'abrupt',
 'abruptly',
 'absence',
 'absences',
 'absent',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'absorption',
 'abstract',
 'abstraction',
 'abstractions',
 'absurd',
 'abu',
 'abundance',
 'abundances',
 'abundant',
 'abundantly',
 'abuse',
 'abyss',
 'acacia',
 'academic',
 'academically',
 'accelerated',
 'accelerates',
 'accelerating',
 'accentuated',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'access',
 'accessible',
 'accessory',
 'accident',
 'accidental',
 'accidentally',
 'accidents',
 'acclimated',
 'acclimatization',
 'accommodate',
 'accommodation',
 'accompanied',
 'accompanies',


In [49]:
weight_sum = vectorizer.idf_.sum()
p = list([w / weight_sum for w in vectorizer.idf_])

In [54]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /Users/parsa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
from googletrans import Translator

In [64]:
translator = Translator()

In [82]:
word = np.random.choice(vectorizer.get_feature_names(), 1, p=p)[0]
print(word)
print(translator.translate(word, dest='fa').text)
synonyms = []
for syn in wordnet.synsets(word):
    for lm in syn.lemmas():
        synonyms.append(lm.name())
print(set(synonyms))

antonyms = []
for syn in wordnet.synsets(word):
    for lm in syn.lemmas():
        if lm.antonyms():
            antonyms.append(lm.antonyms()[0].name())
print(set(antonyms))

colder
سردتر
{'stale', 'cold-blooded', 'cold', 'inhuman', 'frigid', 'insensate', 'dusty', 'moth-eaten'}
{'hot'}
