In [101]:
import re
import nltk as nltk
from word2number import w2n
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from spacy import displacy
from collections import Counter
# Load the small English spaCy pipeline; `nlp` is used by the entity-quoting step.
# spaCy raises OSError when the model package is not installed, so fetch it
# on demand and retry instead of issuing a shell command as a Python statement.
# NOTE(review): after a fresh install a kernel restart may be needed before the
# model can be imported - confirm on a clean environment.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download as download_spacy_model
    download_spacy_model("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# WordNet corpus data is needed by WordNetLemmatizer.
# NOTE(review): nltk.word_tokenize (used by replace_text_number) also needs NLTK
# tokenizer data that is not downloaded here - confirm it is installed.
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Number treatment

In [51]:
# Sample inputs for the number-treatment demo below; only `question` carries
# content, `schema` and `sql` are empty placeholders.
schema = ""
question = "How many seven billion people have a salary greater than one hundred ?"
sql = ""

In [97]:
def replace_text_number(text):
    """Replace spelled-out English numbers in ``text`` with digits.

    The lower-cased text is tokenised with NLTK, every token that is one of the
    known number words is tagged ``CD``, and each run of consecutive ``CD``
    tokens is converted with ``w2n.word_to_num``.  The digits are substituted
    back into the original text in a single pass.

    Args:
        text (str): Sentence that may contain number words such as
            ``"seven billion"`` or ``"one hundred"``.

    Returns:
        str: ``text`` with each convertible number phrase replaced by its digit
        form.  A phrase for which ``w2n.word_to_num`` raises ``ValueError`` is
        left unchanged.
    """
    tagged_number_words = 'billion/CD million/CD ten/CD thousand/CD nine/CD hundred/CD ninety/CD eight/CD seven/CD six/CD five/CD four/CD three/CD two/CD one/CD eighty/CD seventy/CD sixty/CD fifty/CD forty/CD thirty/CD twenty/CD nineteen/CD eighteen/CD seventeen/CD sixteen/CD fifteen/CD fourteen/CD thirteen/CD twelve/CD eleven/CD zero/CD'
    number_word_tuples = [nltk.tag.str2tuple(t) for t in tagged_number_words.split()]
    # Anything that is not a known number word falls back to the 'IGNORE' tag.
    my_tagger = nltk.UnigramTagger([number_word_tuples], backoff=nltk.DefaultTagger('IGNORE'))

    parser = nltk.RegexpParser('NumberWord: {<CD>+}')
    parsed = parser.parse(my_tagger.tag(nltk.word_tokenize(text.lower())))

    # Distinct number phrases, each as a tuple of lower-cased words.
    phrases = {
        tuple(nltk.untag(tree.leaves()))
        for tree in parsed.subtrees()
        if tree.label() == 'NumberWord'
    }
    if not phrases:
        return text

    # Convert every phrase up front; unconvertible ones get no entry and are
    # therefore echoed back unchanged by the substitution callback.
    replacements = {}
    for words in phrases:
        phrase = ' '.join(words)
        try:
            replacements[phrase] = str(w2n.word_to_num(phrase))
        except ValueError:
            continue

    # One alternation, longest phrase first, so "one hundred" wins over "one".
    # Words may be separated by any whitespace in the original text.
    alternatives = '|'.join(
        r'\s+'.join(re.escape(word) for word in words)
        for words in sorted(phrases, key=len, reverse=True)
    )
    # The lookarounds keep a phrase from matching inside a larger word or a
    # hyphenated compound (e.g. the "ten" in "often", the "one" in "one-way").
    pattern = re.compile(r'(?<![\w-])(?:' + alternatives + r')(?![\w-])', re.IGNORECASE)

    def _to_digits(match):
        # Normalise case and spacing so the match can be looked up by phrase.
        key = ' '.join(match.group(0).lower().split())
        return replacements.get(key, match.group(0))

    # A single pass means already-substituted digits are never re-scanned.
    return pattern.sub(_to_digits, text)

def number_treatment_process(schema = "", question = "", sql = ""):
    """Run the number-word replacement on ``question`` only.

    Args:
        schema: Passed through untouched.
        question: Text handed to ``replace_text_number``.
        sql: Passed through untouched.

    Returns:
        tuple: ``(schema, processed_question, sql)``.
    """
    return schema, replace_text_number(question), sql

In [98]:
# Demo: the number words in `question` come back as digits in the second tuple slot.
print(number_treatment_process(question = question))

('', 'How many 7000000000 people have a salary greater than 100 ?', '')


## Stemming (verb lemmatization)

In [54]:
def stemming_process(schema = "", question = "", sql = ""):
    """Lemmatise every whitespace-separated word of ``question`` as a verb.

    Despite the name, this uses ``WordNetLemmatizer`` with part of speech
    ``'v'`` rather than a stemmer.  Because the question is split on whitespace
    and re-joined with single spaces, runs of whitespace are collapsed.

    Args:
        schema: Passed through untouched.
        question (str): Text whose words are lemmatised.
        sql: Passed through untouched.

    Returns:
        tuple: ``(schema, processed_question, sql)``.
    """
    # Build the lemmatizer once instead of once per word.
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in question.split():
        try:
            lemmas.append(str(lemmatizer.lemmatize(word, 'v')))
        except ValueError:
            # Keep the word as written if it cannot be lemmatised.
            lemmas.append(word)

    question_process = ' '.join(lemmas)
    return schema, question_process, sql

In [61]:
# Demo input: inflected verb forms to exercise the lemmatisation.
test = "How worked working"
print(stemming_process(question = test))

('', 'How work work', '')


## Surrounding Entity

In [102]:
def surrounding_entity_process(schema = "",question = "", sql = ""):
    """Wrap every named entity found in ``question`` in single quotes.

    Entities come from the module-level spaCy pipeline ``nlp``.  Each entity is
    quoted at its own character offsets, so an entity that occurs several
    times is quoted exactly once per occurrence and text that merely contains
    the same characters elsewhere is not touched.

    Args:
        schema: Passed through untouched.
        question (str): Text to scan for entities.
        sql: Passed through untouched.

    Returns:
        tuple: ``(schema, processed_question, sql)``.
    """
    doc = nlp(question)
    pieces = []
    cursor = 0
    # Assumes doc.ents is in document order and non-overlapping, and that the
    # offsets index into `question` - see the spaCy Doc/Span documentation.
    for ent in doc.ents:
        pieces.append(question[cursor:ent.start_char])
        pieces.append("'" + question[ent.start_char:ent.end_char] + "'")
        cursor = ent.end_char
    pieces.append(question[cursor:])

    question_process = ''.join(pieces)
    return schema, question_process, sql

In [103]:
# Demo input containing several proper names for the entity step to quote.
test = "Asean and Twitter and New York"
print(surrounding_entity_process(question = test))

('', "'Asean' and 'Twitter' and 'New York'", '')


In [69]:
# Scratch check: prepend a single quote to a string.
# NOTE(review): looks like a leftover experiment - consider removing this cell.
a = "Hao"
print("'"+a)

'Hao


In [1]:
def remove_redundant_field_words(schema, question, sql):
    """Delete every ``'_FIELD'`` occurrence from ``schema`` and ``sql``.

    Args:
        schema (str): Schema text to clean.
        question: Returned unchanged.
        sql (str): SQL text to clean.

    Returns:
        tuple: ``(cleaned_schema, question, cleaned_sql)``.
    """
    marker = '_FIELD'
    cleaned_schema, cleaned_sql = (part.replace(marker, '') for part in (schema, sql))
    return cleaned_schema, question, cleaned_sql

In [2]:
# Demo: strip the '_FIELD' marker from a sample schema and SQL query.
# NOTE(review): this rebinds the notebook-level `schema`, `question` and `sql`
# names that the number-treatment demo cell also uses - re-running that cell
# afterwards will see these values instead.
schema = 'DIRECTION_FIELD : TEXT, MANTRA_FIELD : TEXT'
question = ''
sql = "SELECT NAME_FIELD FROM TABLE WHERE NAME_OF_THE_LAKE_FIELD = 'LAGO DI LUZZONE'"

schema, question, sql = remove_redundant_field_words(schema, question, sql)

print(schema)
print(sql)

DIRECTION : TEXT, MANTRA : TEXT
SELECT NAME FROM TABLE WHERE NAME_OF_THE_LAKE = 'LAGO DI LUZZONE'
