# Test NLP utils

In [289]:
import pandas as pd
import numpy as np
import sklearn
from nltk.corpus import brown
from nltk import chunk
import nltk
import spacy
import warnings

import en_core_web_sm

nlp = en_core_web_sm.load()
# nlp = spacy.load("en")

In [74]:
test_example = "Send a message to Brain Williams via Linkedin"

In [75]:
# doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# doc = nlp("Increase followers on Linkedin")
doc = nlp(test_example)

tokens = []
for token in doc:
    token_char = [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.ent_type_]
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.ent_type_)
    tokens.append(token_char)
pd.DataFrame(tokens, columns = "Text Lemma POS Tag Dep Entity".split(" ")).T

Unnamed: 0,0,1,2,3,4,5,6,7
Text,Send,a,message,to,Brain,Williams,via,Linkedin
Lemma,send,a,message,to,Brain,Williams,via,Linkedin
POS,VERB,DET,NOUN,ADP,PROPN,PROPN,ADP,PROPN
Tag,VB,DT,NN,IN,NNP,NNP,IN,NNP
Dep,ROOT,det,dobj,dative,compound,pobj,prep,pobj
Entity,,,,,PERSON,PERSON,,PERSON


In [76]:
spacy.displacy.render(doc, jupyter=True, style='dep')

In [77]:
spacy.displacy.render(doc, jupyter=True, style='ent')

In [78]:
def return_results(doc):
    tokens = []
    for token in doc:
        token_char = [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.ent_type_]
    #     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.ent_type_)
        tokens.append(token_char)
    tokens = pd.DataFrame(tokens, columns = "Text Lemma POS Tag Dep Entity".split(" "))
    action = tokens[tokens["Dep"]=="ROOT"]["Lemma"][0]
    
    keys = tokens[tokens["Entity"]!=""][["Lemma", "Entity"]]
    keys_dic = keys.groupby("Entity")["Lemma"].apply(list).to_dict()
    return tokens, action, keys_dic

tokens, _, keys = return_results(doc)

# Train model

In [79]:
ner = nlp.get_pipe("ner")

In [80]:
TRAIN_DATA = [
              ("Check Brian's posts on Linkedin.", {"entities": [(23, 31, "ORG")]}),
#               ("Send a message to John on Linkedin.", {"entities": [(26, 34, "ORG")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]})]

In [81]:
# Adding labels to the `ner`

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [82]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [83]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
    if iteration%4==0:
        print(iteration, " Losses", losses)

  gold = GoldParse(doc, **gold)


0  Losses {'ner': 5.759613461312256}
4  Losses {'ner': 8.59147003479302}
8  Losses {'ner': 4.370278474219958}
12  Losses {'ner': 2.011332681685417}
16  Losses {'ner': 0.5031419273364008}
20  Losses {'ner': 0.9644296541810036}
24  Losses {'ner': 4.688607818763558}
28  Losses {'ner': 0.6370923449212569}


In [84]:
spacy.displacy.render(nlp("Send a message to Brain Williams via Linkedin."), jupyter=True, style='ent')

In [85]:
return_results(nlp("Send a message to Brain Williams via Linkedin."))

(       Text     Lemma    POS  Tag       Dep  Entity
 0      Send      send   VERB   VB      ROOT        
 1         a         a    DET   DT       det        
 2   message   message   NOUN   NN      dobj        
 3        to        to    ADP   IN    dative        
 4     Brain     Brain  PROPN  NNP  compound  PERSON
 5  Williams  Williams  PROPN  NNP      pobj  PERSON
 6       via       via    ADP   IN      prep        
 7  Linkedin  Linkedin  PROPN  NNP      pobj     ORG
 8         .         .  PUNCT    .     punct        ,
 'send',
 {'ORG': ['Linkedin'], 'PERSON': ['Brain', 'Williams']})

# Check related data

In [60]:
# return_results(nlp("Increase followers on Linkedin"))

In [65]:
df = pd.read_excel("AUTOMATION REQUESTS editted.xlsx")
requests = df[~df["REQUEST"].isna()].apply(lambda x: x.str.split("\n"))["REQUEST"].to_list()
requests = [item for sublist in requests for item in sublist if item !='']
requests

['1. Connect with marketing professionals',
 '2. Connect with marketing specialists',
 '3. Connect with specialists on Linkedin',
 '4. Grow my network on Linkedin',
 '5. Invite people to connect with me on Linkedin',
 '6. Invite people who commented to connect',
 '7. Connect with people who posted comments to my post',
 '8. Connect with people who posted comments',
 '9. Accept invitations from <specific people> only',
 '10. Do not accept all invitations to connect',
 "11. Don't accept all the invitations on Linkedin",
 '12. Ask to approve before accepting invitations',
 '13. Send invitations to follow the company page (name)',
 '14. Invite users to like the page',
 '15. Invite users to follow the page',
 '16. Get more followers',
 '17. Increase followers on Linkedin',
 '18. Send congratulations on the new job',
 '19. Send best wishes for new job',
 '20. Send job congratulations for the new job',
 '21. Send new position messages',
 '22. Send congratulations on special international days