In [1]:
import time
import spacy
import nltk
import yake


def remove_duplicates(ele_list):
    new_list = []
    for ele in ele_list:
        if ele not in new_list:
            new_list.append(ele)
    return new_list


def append_ngrams(noun_list, n_size=2):
    new_list = list()
    for sentence in noun_list:
        new_list.append(','.join([x+'_'+y for x,y in list(nltk.ngrams(sentence.split(','), n_size))]))
    new_list = [noun_list[i] + ',' + new_list[i] for i in range(len(noun_list))]
    return new_list


def parse_doc(nlp, text):
    return nlp(text)


def get_noun(doc):
    noun_list = []
    for word in doc:
        if word.pos_ in ['PROPN', 'NOUN']:
            noun_list.append(word.text)
    noun_list = remove_duplicates(noun_list)
    return ",".join(noun_list)


def get_verb(doc):
    verb_list = []
    for word in doc:
        if word.pos_ in ['VERB']:
            verb_list.append(word.text)
    verb_list = remove_duplicates(verb_list)
    return ",".join(verb_list)


def get_ner(doc):
    ner_list = []
    for ent in doc.ents:
        ner_list.append(ent.text)
    ner_list = remove_duplicates(ner_list)
    return ",".join(ner_list)


def get_keywords(docs):
    language = "en"
    max_ngram_size = 3
    deduplication_thresold = 0.9
    deduplication_algo = 'seqm'
    numOfKeywords = 1000
    
    list_of_keys = list()
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, top=numOfKeywords, features=None)
    
    for loc, each_article in enumerate(docs):
        keywords = custom_kw_extractor.extract_keywords(each_article)
        temp1 = list()
        for i, j in keywords:
            temp1.append(j)
        list_of_keys.append(", ".join(temp1))
    return list_of_keys


def get_features(docs, stages):
    default_stages = {
        'nouns': True,
        'verbs': True,
        'noun_phrases': False,
        'keywords': False,
        'ner': True,    
    }
    default_stages = default_stages.update(stages)
    
    t = time.time()
    noun_chunks = list()
    # verbs_list = list()
    nlp = spacy.load('en_core_web_sm')
    # ners_list = list()
    # nouns_list = list()

#    for text in docs:
#         doc = parse_doc(nlp, text)
        # verbs_list.append(get_verb(doc))
        # ners_list.append(get_ner(doc))
        # nouns_list.append(get_noun(doc))
#        noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))
    
    for doc in nlp.pipe(docs, disable=['ner']):
        # nouns_list.append(get_noun(doc))
        # verbs_list.append(get_verb(doc))
        noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))

    # keywords = [",".join(filter(None, [x, y])) for x, y in zip(noun_chunks, verbs_list)]
    # keywords = append_ngrams(nouns_list, 2)
    # keywords = noun_chunks
    yake_keywords = get_keywords(docs)
    keywords = [",".join(filter(None, [x, y])) for x, y in zip(noun_chunks, yake_keywords)]
    keywords = [','.join(set(x.split(','))) for x in keywords]
    print("Time elapsed for Keyword Extraction:", time.time() - t)
    return keywords

In [5]:
docs = ['Time elapsed for Keywords Extraction would be defined here in this space', 'Also, let me know what you think of this']
get_features(docs, {})

Time elapsed for Keyword Extraction: 0.4613072872161865


['Time,this space,Keywords Extraction, elapsed for keywords, time elapsed, keywords, time, elapsed, extraction, space,keywords extraction, defined',
 'you,what,me']

In [9]:
nlp = spacy.load('en_core_web_sm')
for doc in docs:
    da = nlp(doc)
    for token in da:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Time time NOUN NN nsubjpass Xxxx True False
elapsed elapse VERB VBN acl xxxx True False
for for ADP IN prep xxx True True
Keywords Keywords PROPN NNP compound Xxxxx True False
Extraction Extraction PROPN NNP pobj Xxxxx True False
would would VERB MD aux xxxx True True
be be AUX VB auxpass xx True True
defined define VERB VBN ROOT xxxx True False
here here ADV RB advmod xxxx True True
in in ADP IN prep xx True True
this this DET DT det xxxx True True
space space NOUN NN pobj xxxx True False
Also also ADV RB advmod Xxxx True True
, , PUNCT , punct , False False
let let VERB VB ROOT xxx True False
me -PRON- PRON PRP nsubj xx True True
know know VERB VB ccomp xxxx True False
what what PRON WP dobj xxxx True True
you -PRON- PRON PRP nsubj xxx True True
think think VERB VBP ccomp xxxx True False
of of ADP IN prep xx True True
this this DET DT pobj xxxx True True


### Updates 

* Feature extractions like Nouns, Verbs, Adjectives, Numbers, Noun Phrases, NERs, Keywords
* Vectorization tools like TF-IDF, GloVe, Word2Vec, Bag of Words