In [154]:
import pandas as pd
import numpy as np
import spacy
import random
from spacy.util import minibatch, compounding

nlp = spacy.load('en_core_web_lg')

In [155]:
def remove_whitespace_entities(doc):
    doc.ents = [e for e in doc.ents if not e.text.isspace()]
    return doc

nlp.add_pipe(remove_whitespace_entities, after='ner')
doc = nlp(u'Hello\nNew York')
print(doc.ents)

(New York,)


In [156]:
df = pd.read_csv("../datawe/raw/Email_Classification/email_entity_cleansed.csv")
entityDf = df[df.apply(lambda x: x["text"][x["start_char"]:x["end_char"]] == x["name"], axis=1)]

In [157]:
entityTrainData = []
for text, item in  entityDf['text'].value_counts().items():
#     print(item, text)
    mulItems = entityDf[entityDf['text'] == text]
    multipleEntities = []
    for dta in mulItems.values:
#         print(dta)
        multipleEntities.append((dta[4], dta[1], dta[2]))
#          entityTrainData.append((dta[5], ))
    entityTrainData.append((text, {'entities':multipleEntities}))
    
TRAIN_DATA = entityTrainData

In [158]:
def main_train(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
#     if model is not None:
#         nlp = spacy.load(model)  # load existing spaCy model
#         print("Loaded model '%s'" % model)
#     else:
#         nlp = spacy.blank('en')  # create blank Language class
#         print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print('Losses', losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
#         print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
#         print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
#         print("Saved model to", output_dir)

        # test the saved model
#         print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
#             print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
#             print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    return nlp



In [139]:
trainedModel = main_train('en_core_web_sm')

Losses {'ner': 11.04036768617255}
Losses {'ner': 8.994827101208504}
Losses {'ner': 7.431158193883179}
Losses {'ner': 7.665321009587325}
Losses {'ner': 6.139390302466461}
Losses {'ner': 5.7883339323906196}
Losses {'ner': 5.10750174003755}
Losses {'ner': 3.8834545573568042}
Losses {'ner': 4.258766549192355}
Losses {'ner': 4.235054012136147}
Losses {'ner': 3.860838005761605}
Losses {'ner': 3.0977877675606114}
Losses {'ner': 3.543331276498473}
Losses {'ner': 2.723645766153609}
Losses {'ner': 2.381960735122874}
Losses {'ner': 2.202290919946724}
Losses {'ner': 2.7878237475888192}
Losses {'ner': 1.801100408717077}
Losses {'ner': 1.636139536098428}
Losses {'ner': 1.877713051378941}
Losses {'ner': 1.595699860457334}
Losses {'ner': 1.2172807294578367}
Losses {'ner': 0.9811632195135973}
Losses {'ner': 1.4473658214348963}
Losses {'ner': 1.6787941906053279}
Losses {'ner': 1.294053436717511}
Losses {'ner': 0.5250647165754453}
Losses {'ner': 0.6002084558009897}
Losses {'ner': 0.7634135523698271}
Loss

In [140]:
doc = trainedModel(u''+ " ".join(entityDf["text"]))
trn_data = []
for ent in doc.ents:
#     trn_data.append({Entity, end_char,label,start_char, text})
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

lake palace 22 33 FAC
1st January 2019 85 101 DATE
lake palace 124 135 FAC
1st 187 190 DATE
lake palace 226 237 FAC
1st 289 292 DATE
lake palace 328 339 FAC
1st January 2019 391 407 DATE
lake palace 430 441 FAC
1st January 2019 493 509 DATE
lake palace 532 543 FAC
1st January 2019 595 611 DATE
Taj Gateway Ganges 710 728 ORG
24/26 December 2018 734 753 DATE
Taj Gateway Ganges 870 888 ORG
24/26 December 2018 894 913 DATE
17/12/2018 1066 1076 DATE
TEHP/18/12809 1580 1593 CARDINAL
Arun Gadamshetty P 1601 1619 PERSON
153021 1626 1632 CARDINAL
Spares Manufacturing Department 1646 1677 ORG
Vivanta 1710 1717 GPE
Taj - Madikeri Coorg 1721 1741 ORG
02:00 PM 1809 1817 TIME
12:00 PM 1852 1860 TIME
1 1875 1876 CARDINAL
Arun Gadamshetty   1952 1970 PERSON
34 years 2 1993 2003 DATE
SHWETA GADAMSHETTY 2013 2031 PERSON
28 years 2058 2066 DATE
TEHP/18/12809 2556 2569 CARDINAL
Arun Gadamshetty P 2577 2595 PERSON
153021 2602 2608 CARDINAL
Spares Manufacturing Department 2622 2653 ORG
Vivanta 2686 2693 GPE

Monday 26 November to Thursday 29 November 19098 19140 DATE
1 19146 19147 CARDINAL
Airport Limousine 19174 19191 ORG
Bengaluru Airport 19206 19223 FAC
61 428 856 843 19245 19259 CARDINAL
Dubai 19437 19442 GPE
1 19447 19448 CARDINAL
28 November – 01 December 2018 19459 19489 DATE
3 19495 19496 QUANTITY
Dubai 19616 19621 GPE
1 19626 19627 CARDINAL
28 November – 01 December 2018 19638 19668 DATE
3 19674 19675 QUANTITY
Dubai 19795 19800 GPE
1 19805 19806 CARDINAL
28 November – 01 December 2018 19817 19847 DATE
3 19853 19854 QUANTITY
Dubai 19974 19979 GPE
1 19984 19985 CARDINAL
28 November – 01 December 2018 19996 20026 DATE
3 20032 20033 QUANTITY
4 20093 20094 CARDINAL
12th Jan, 2019 20100 20114 DATE
15th Jan, 2019 20118 20132 ORDINAL
2 20134 20135 CARDINAL
2 20147 20148 CARDINAL
16 20155 20157 QUANTITY
4 20309 20310 CARDINAL
12th Jan, 2019 20316 20330 DATE
15th Jan, 2019 20334 20348 ORDINAL
2 20350 20351 CARDINAL
2 20363 20364 CARDINAL
16 20371 20373 QUANTITY
4 20525 20526 CARDINAL
12th J

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [161]:
doc = trainedModel(u''+ """i want to book a room at Lake palace Check in date is - 20th December 2018 check out - 1st January 2019""")
trn_data = []
for ent in doc.ents:
#     trn_data.append({Entity, end_char,label,start_char, text})
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Lake palace 25 36 FAC
20th December 2018 56 74 DATE
1st January 2019 87 103 DATE


In [91]:
mulItems

Unnamed: 0.1,Unnamed: 0,end_char,label,name,start_char,text
27,27,83,ORG,Star,79,"Dear Team, Please cancel the attached bookin..."


In [151]:
sentence = """i want to book a room at Lake palace 
Check in date is - 31st December 2018 
check out - 1st January 2019"""

In [152]:
doc = trainedModel(u''+ sentence.replace('\n', ' '))
trn_data = []
for ent in doc.ents:
#     trn_data.append({Entity, end_char,label,start_char, text})
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Lake palace 25 36 FAC
31st December 2018 57 75 DATE
1st January 2019 89 105 DATE


In [122]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father." 
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]
doc_clean = [doc.split() for doc in doc_complete]

import gensim
import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.  
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above. 
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

# Results 
print(ldamodel.print_topics())

ModuleNotFoundError: No module named 'corpus'

In [121]:
!pip install corpus



In [125]:
import corpora

ModuleNotFoundError: No module named 'corpus'