In [None]:
import os

# Azure Text Analytics credentials and endpoint.
# The key is taken from the TEXT_ANALYTICS_SUBSCRIPTION_KEY environment variable when it
# is set, so a real secret never has to be saved inside the notebook; otherwise the
# placeholder string below must be replaced by hand before running the API cells.
subscription_key = os.environ.get(
    'TEXT_ANALYTICS_SUBSCRIPTION_KEY',
    'Azure Cognitive Service Text Analysis Subscription Key',
)
# Base URL of the Text Analytics v2.1 REST API. Endpoint names are appended directly
# to this string, so the trailing slash is required.
text_analytics_base_url = "https://eastasia.api.cognitive.microsoft.com/text/analytics/v2.1/"
# Path of the input CSV file.
csv_file_path = 'data.csv'

In [None]:
import requests
import csv
import pickle
import time
import json
import sys
from bs4 import BeautifulSoup, NavigableString

def strip_html(src):
    """Return the text content of an HTML fragment with the markup removed.

    Every text node found in ``src`` is collected in document order and the
    nodes are joined with a single space.

    Args:
        src: HTML markup as a string.

    Returns:
        The text nodes of ``src`` concatenated into one string.
    """
    # Name the parser explicitly: without it BeautifulSoup uses whichever parser
    # happens to be installed (and emits a warning), so the extracted text could
    # differ between machines. "html.parser" ships with Python.
    soup = BeautifulSoup(src, "html.parser")
    # `find_all(string=...)` is the current spelling of `findAll(text=...)`.
    text_nodes = soup.find_all(string=lambda node: isinstance(node, NavigableString))
    return u" ".join(text_nodes)

# Endpoint used for key-phrase extraction.
key_phrase_api_url = "{}keyPhrases".format(text_analytics_base_url)
print(key_phrase_api_url)

# Endpoint used for entity lookups.
entity_linking_api_url = "{}entities".format(text_analytics_base_url)
print(entity_linking_api_url)

# HTTP headers passed to both API calls; carries the subscription key.
headers = {
    "Ocp-Apim-Subscription-Key": subscription_key,
}

def load_entities(texts):
    """Send ``texts`` to the entity endpoint and return the distinct entity names.

    Args:
        texts: iterable of strings; each one is posted as a separate document
            in a single request.

    Returns:
        List of entity names in order of first appearance, without duplicates.
        The list is empty when the response JSON has no ``documents`` key.
    """
    # Each text becomes one document; ids are consecutive integers starting at 2,
    # the same numbering this function has always sent.
    documents = {
        'documents': [
            {"id": doc_id, "text": text}
            for doc_id, text in enumerate(texts, start=2)
        ]
    }
    response = requests.post(entity_linking_api_url, headers=headers, json=documents)
    payload = response.json()

    res_entities = []
    seen = set()
    if 'documents' in payload:
        for doc in payload['documents']:
            for ent in doc['entities']:
                name = ent['name']
                # De-duplicate against the names collected so far. The previous
                # check looked the name up in the response dict, which never
                # matched, so repeated entities were all kept.
                if name not in seen:
                    seen.add(name)
                    res_entities.append(name)
    return res_entities

def load_keyphrases(texts):
    """Send ``texts`` to the key-phrase endpoint and return all phrases found.

    Args:
        texts: iterable of strings; each one is posted as a separate document
            in a single request.

    Returns:
        Flat list of the ``keyPhrases`` of every returned document, in response
        order (duplicates are kept). Empty when the response JSON has no
        ``documents`` key.
    """
    # Ids are consecutive integers starting at 2.
    payload = {
        'documents': [
            {"id": number, "text": body}
            for number, body in enumerate(texts, start=2)
        ]
    }
    reply = requests.post(key_phrase_api_url, headers=headers, json=payload).json()
    if 'documents' not in reply:
        return []
    return [
        phrase
        for document in reply['documents']
        for phrase in document['keyPhrases']
    ]

In [2]:
# Build `result`: one record per CSV id holding the title, the plain-text
# description, and the entities / key phrases returned by the API helpers.
result = {}
# csv has three columns: id, title, description
with open(csv_file_path, mode='r', encoding="utf8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        # Blank or truncated lines would raise IndexError on row[2]; skip them.
        if len(row) < 3:
            if row:
                print(row[0] + " skipped: expected 3 columns, got " + str(len(row)))
            continue
        row_id, title, description_html = row[0], row[1], row[2]
        # Only the first row seen for a given id is processed.
        if row_id in result:
            continue
        description = strip_html(description_html)
        # One API document for the title plus one per line of the description.
        text_list = [title] + description.split('\n')
        try:
            entities = load_entities(text_list)
            keyphrases = load_keyphrases(text_list)
        except Exception as exc:
            # Best effort: one failing row must not abort the whole run, but the
            # reason is reported (and Ctrl-C is no longer swallowed, as it was
            # by the former bare `except:`).
            print(row_id + " not loaded! (" + repr(exc) + ")")
            continue
        result[row_id] = {
            "title": title,
            "description": description,
            "entities": entities,
            "keyphrases": keyphrases,
        }
        print(row_id + " loaded.")

In [6]:
# Save the collected records to disk. The `with` block closes (and flushes) the
# file even if pickling fails; the bare open() used before was never closed.
with open('./data_entities_keyphrases', 'wb') as pickle_file:
    pickle.dump(result, pickle_file)

In [6]:
# Optional step: extend each record's entities with similar entities from Wikipedia2Vec.
from wikipedia2vec import Wikipedia2Vec
from wikipedia2vec.dictionary import Dictionary, Item, Word, Entity
# Load the Wikipedia2Vec model file; the path is relative to the working directory.
wiki2vec = Wikipedia2Vec.load('enwiki_20180420_100d.pkl')
import threading
# Lock held by load_entity while it writes into the shared `result` dict.
mylock = threading.Lock() 

def load_entity(id, entities):
    """Store ``entities`` plus similar Wikipedia2Vec entities in ``result[id]``.

    For every name in ``entities`` the 10 most similar items are looked up in
    ``wiki2vec``; those that are ``Entity`` items contribute their title. The
    combined, duplicate-free list is written to
    ``result[id]['extended_entities']`` while holding ``mylock``.

    Args:
        id: key of the record in ``result`` to update.
        entities: list of entity names. It is not modified.

    Returns:
        None.
    """
    # Work on a copy: the caller passes result[id]['entities'], and appending to
    # that list in place would also add the neighbours to the original entities.
    extended = list(entities)
    known = set(extended)
    for ent in entities:
        wiki_entity = wiki2vec.get_entity(ent)
        if wiki_entity is None:
            # The name has no entry in the model's dictionary; nothing to look up.
            continue
        for similar in wiki2vec.most_similar(wiki_entity, 10):
            candidate = similar[0]
            # Results can also contain words; only entities are kept.
            if isinstance(candidate, Entity) and candidate.title not in known:
                known.add(candidate.title)
                extended.append(candidate.title)

    # `with` releases the lock even if the assignment raises.
    with mylock:
        result[id]['extended_entities'] = extended
        
import concurrent.futures

# Run load_entity for every record on a pool of 5 worker threads.
# (The former `count+=1` referenced a name that is never defined at module level
# and raised NameError on a fresh kernel; it has been removed.)
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as e:
    futures = {
        e.submit(load_entity, doc_id, result[doc_id]['entities']): doc_id
        for doc_id in result
    }
    # Exceptions raised inside a worker are stored on its future and would
    # otherwise vanish silently, leaving the record without 'extended_entities'.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(str(futures[future]) + " not extended! (" + repr(error) + ")")

In [8]:
# Save the records again after the entity-extension step. The `with` block
# closes (and flushes) the file even if pickling fails; the bare open() used
# before was never closed.
with open('./data_entities_keyphrases_allentities', 'wb') as pickle_file:
    pickle.dump(result, pickle_file)

In [110]:
# Categorisation rules, keyed by category name. A rule may carry any of:
#   'entities'   - list of entity names to look for in a record's entities
#   'keyphrases' - list of phrases to look for in a record's key phrases
#   'conditions' - list of dicts pairing an '*_a' list with a 'keyphrases_b' list
# NOTE(review): 'Asset Management' under 'entities' is title-cased while the other
# entity names use sentence case - confirm it matches the spelling the API returns.
rules = {
    'Predictive Maintenance': {
        'entities': [
            'Predictive maintenance',
            'Anomaly detection',
        ],
        'keyphrases': [
            'Anomaly',
            'abnormal',
        ],
    },
    'Connected Factory': {
        'entities': [
            'SCADA',
            'OPC Unified Architecture',
            'Overall equipment effectiveness',
            'Remote terminal unit',
            'Programmable logic controller',
            'Manufacturing execution system',
        ],
        'keyphrases': [
            'Connected Robotics',
        ],
    },
    'Cognitive Quality': {
        'conditions': [
            {
                'keyphrases_a': ['quality'],
                'keyphrases_b': ['computer vision', 'cognitive', 'cv', 'deep learning'],
            },
        ],
    },
    'Ambient Intelligence': {
        'conditions': [
            {
                'keyphrases_a': ['safety'],
                'keyphrases_b': ['factory', 'plant', 'logistics'],
            },
        ],
        'entities': [
            'Safety',
        ],
    },
    'Service Bot': {
        'entities': [
            'Speech recognition',
            'Chatbot',
        ],
        'keyphrases': [
            'LUIS',
        ],
    },
    'Asset Management': {
        'entities': [
            'Asset Management',
            'Traceability',
            'Radio-frequency identification',
        ],
        'keyphrases': [
            'Traceability',
        ],
    },
    'Mixed Reality': {
        'entities': [
            'Mixed reality',
        ],
    },
    'Process Optimization': {
        'keyphrases': [
            'optimization',
            'Optimizer',
        ],
    },
}

In [None]:
def _contains_any(needles, haystack):
    """Return True when any needle, lower-cased, is a substring of ``haystack``."""
    return any(needle.lower() in haystack for needle in needles)


def _matches_rule(rule_spec, record):
    """Return True when ``record`` satisfies at least one clause of ``rule_spec``.

    Clauses: an exact (case-sensitive) entity match, a case-insensitive
    key-phrase substring match, or a condition whose '*_a' part and
    'keyphrases_b' part both match.
    """
    record_entities = record['entities']
    # Key phrases are matched as substrings of the space-joined, lower-cased phrases.
    phrase_text = ' '.join(record['keyphrases']).lower()

    if any(entity in record_entities for entity in rule_spec.get('entities', [])):
        return True
    if _contains_any(rule_spec.get('keyphrases', []), phrase_text):
        return True
    for condition in rule_spec.get('conditions', []):
        if 'keyphrases_b' not in condition:
            continue
        # 'keyphrases_a' takes precedence over 'entities_a' when both are present.
        if 'keyphrases_a' in condition:
            first_part = _contains_any(condition['keyphrases_a'], phrase_text)
        elif 'entities_a' in condition:
            first_part = any(entity in record_entities for entity in condition['entities_a'])
        else:
            continue
        if first_part and _contains_any(condition['keyphrases_b'], phrase_text):
            return True
    return False


# Map every category name to the ids of the records that match its rule.
# Each id appears at most once per category, in the iteration order of `result`.
cate_result = {}
for rule in rules:
    cate_result[rule] = [
        doc_id for doc_id in result if _matches_rule(rules[rule], result[doc_id])
    ]

# Report: category name and match count, followed by the matching titles.
# The loop variable must not be called `result`: that would replace the data
# dict with a category string and break this lookup and every later cell.
for category in cate_result:
    print(category + " " + str(len(cate_result[category])))
    for doc_id in cate_result[category]:
        print(result[doc_id]['title'])
    print('\n')

In [108]:

# Export one (id, entity) line per entity of every record in `result`,
# preceded by a header line.
with open('./result_entities.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['id', 'entity'])
    writer.writerows(
        [record_id, entity_name]
        for record_id in result
        for entity_name in result[record_id]['entities']
    )

    

In [109]:

# Export one (id, category) line per categorised record, preceded by a header line.
with open('./result_categories.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['id', 'category'])
    # The loop variable is deliberately not named `result`: rebinding that name
    # would replace the record dict with a category string for any cell run later.
    for category in cate_result:
        for doc_id in cate_result[category]:
            writer.writerow([doc_id, category])