In [14]:
import spacy
from spacy_conll import ConllFormatter

from pprint import pprint
from collections import defaultdict, OrderedDict

## 例句

In [15]:
# Sample job-requirement sentences used as inputs throughout the notebook.
# NOTE(review): the fourth sentence separates its adjectives with the
# ideographic comma '、' instead of ','; it is kept verbatim as input data.
test_list = [
    'PhD degree in Material Science, Polymer Science, Chemical Engineering or Chemistry Discipline.',
    'Strong knowledge in computer vision, including image registration, segmentation, classification, object detection.',
    '5 years plus experience in project management, six sigma and DOE is preferred.',
    'Experience working with and extracting value from large、 disconnected、 unstructured datasets',
    'Develop a test plan including functional QA, integration testing, string testing, ICAT, ECAT, etc.',
    'Perform data analysis including data mapping, report analysis, interface definitions',
    'Develop functional specifications in a team environment, as well as derive use cases where appropriate',
]

# Named handles so a single sentence can be picked out individually.
(string1, string2, string3, string4,
 string5, string6, string7) = test_list

In [16]:
# Select the sentence that the parsing cells below operate on.
string = string2

## 依存句法分析

In [17]:
# Load the small English spaCy model, attach the CoNLL formatter right after
# the dependency parser, parse `string`, and print the CoNLL rendering.
# NOTE(review): passing a component object to add_pipe looks like the
# spaCy 2.x calling convention — confirm against the installed spaCy and
# spacy_conll versions before upgrading.
nlp = spacy.load("en_core_web_sm")
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, after='parser')
doc = nlp( string )  # `doc` is reused by the noun-chunk cell further down
conll = doc._.conll  # not referenced again in the visible cells
print(doc._.conll_str)

1	Strong	strong	ADJ	JJ	Degree=pos	2	amod	_	_
2	knowledge	knowledge	NOUN	NN	Number=sing	0	ROOT	_	_
3	in	in	ADP	IN	_	2	prep	_	_
4	computer	computer	NOUN	NN	Number=sing	5	compound	_	_
5	vision	vision	NOUN	NN	Number=sing	3	pobj	_	_
6	,	,	PUNCT	,	PunctType=comm	5	punct	_	_
7	including	include	VERB	VBG	VerbForm=part|Tense=pres|Aspect=prog	5	prep	_	_
8	image	image	NOUN	NN	Number=sing	9	compound	_	_
9	registration	registration	NOUN	NN	Number=sing	7	pobj	_	_
10	,	,	PUNCT	,	PunctType=comm	9	punct	_	_
11	segmentation	segmentation	NOUN	NN	Number=sing	9	conj	_	_
12	,	,	PUNCT	,	PunctType=comm	11	punct	_	_
13	classification	classification	NOUN	NN	Number=sing	11	conj	_	_
14	,	,	PUNCT	,	PunctType=comm	13	punct	_	_
15	object	object	NOUN	NN	Number=sing	16	compound	_	_
16	detection	detection	NOUN	NN	Number=sing	13	conj	_	_
17	.	.	PUNCT	.	PunctType=peri	2	punct	_	_



In [18]:
# Parse the same sentence with stat_parser and print the resulting bracketed tree.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so the dependencies are visible in one place.
from stat_parser import Parser
parser = Parser()
print( parser.parse(string) )

(SINV
  (S
    (NP
      (NP (JJ strong) (NN knowledge))
      (PP (IN in) (NP (NN computer) (NN vision))))
    (VP
      (, ,)
      (S+VP (VBG including) (NP (NN image) (NN registration)))))
  (, ,)
  (VP (VBZ segmentation))
  (, ,)
  (VP (VBZ classification))
  (, ,)
  (NP (JJ object) (NN detection))
  (. .))


## 基于 开源工具 的关键词识别

In [19]:
# Collect the root token of every noun chunk, together with a lookup from
# that root token back to the full chunk.
my_chunks_list = []
roor_chunks_dict = {}

for noun_chunk in doc.noun_chunks:
    head_token = noun_chunk.root
    roor_chunks_dict[head_token] = noun_chunk
    my_chunks_list.append(head_token)

# Show the chunks in the order they were collected.
print(list(map(roor_chunks_dict.__getitem__, my_chunks_list)))

[Strong knowledge, computer vision, image registration, segmentation, classification, object detection]


## 基于 句法分析器 的层次结构生成

In [20]:
# For each chunk root, climb the dependency tree up to the sentence root and
# record every ancestor that is itself a chunk root. An ancestor reached
# over a 'conj' arc is skipped, so coordinated chunks end up as siblings of
# each other instead of being nested.
Total_List = []

for chunk_head in my_chunks_list:
    ancestors = [chunk_head]
    node = chunk_head

    # The sentence root is the token that is its own head.
    while node != node.head:
        arc_label = node.dep_
        node = node.head
        if arc_label == 'conj':
            continue
        if node in my_chunks_list:
            ancestors.append(node)

    # Collected leaf-first; store root-first.
    ancestors.reverse()
    Total_List.append(ancestors)

for path in Total_List:
    print(' -> '.join(roor_chunks_dict[tok].text for tok in path))

Strong knowledge
Strong knowledge -> computer vision
Strong knowledge -> computer vision -> image registration
Strong knowledge -> computer vision -> segmentation
Strong knowledge -> computer vision -> classification
Strong knowledge -> computer vision -> object detection


## 输出结果转树状格式

In [21]:
# Fold the root-first paths into a single nested {'name', 'children'} tree.
# processed_dict indexes every node (by chunk-root token, plus the synthetic
# 'ROOT' key), so a node is created and attached to its parent only the
# first time it is encountered.
processed_dict = {'ROOT': {'name': 'ROOT', 'children': []}}

for path in Total_List:
    parent_key = 'ROOT'
    for tok in path:
        if tok not in processed_dict:
            node = {'name': roor_chunks_dict[tok].text, 'children': []}
            processed_dict[tok] = node
            processed_dict[parent_key]['children'].append(node)
        parent_key = tok

pprint(processed_dict['ROOT'])

{'children': [{'children': [{'children': [{'children': [],
                                           'name': 'image registration'},
                                          {'children': [],
                                           'name': 'segmentation'},
                                          {'children': [],
                                           'name': 'classification'},
                                          {'children': [],
                                           'name': 'object detection'}],
                             'name': 'computer vision'}],
               'name': 'Strong knowledge'}],
 'name': 'ROOT'}


## 结果可视化

In [22]:
from unittest.mock import patch
from nose.tools import assert_equal, assert_in
from pyecharts import options as opts
from pyecharts.charts import Tree

In [23]:
# Wrap the ROOT node in a list; it is passed as `data` to the Tree chart below.
TEST_DATA = [processed_dict['ROOT']]

In [24]:
# Build a pyecharts Tree chart from TEST_DATA and display it in the notebook.
c = Tree().add(
        series_name="技能树",
        data=TEST_DATA,
        symbol='emptyCircle',
        symbol_size =18,
        initial_tree_depth=10,
        label_opts=opts.LabelOpts(),
        leaves_label_opts=opts.LabelOpts(),
#         NOTE(review): a title_opts=opts.TitleOpts(...) argument was disabled
#         here; presumably chart titles belong in set_global_opts — confirm.
    )
c.render_notebook()

# ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■
# ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■

# 测试

In [25]:
def prcess(string, nlp=None):
    """Print the noun-chunk hierarchy of ``string`` as ``' -> '``-joined paths.

    Every noun chunk produces one printed line: the chunk itself, preceded by
    each ancestor chunk found while climbing the dependency tree from the
    chunk's root token up to the sentence root. An ancestor reached over a
    ``conj`` arc is skipped, so coordinated chunks are printed as siblings
    under the same parent rather than nested inside one another.

    Args:
        string: Sentence to analyse.
        nlp: Optional callable that turns text into a parsed document exposing
            ``noun_chunks`` (for example an already loaded spaCy pipeline).
            When omitted, ``en_core_web_sm`` is loaded on the first call and
            reused afterwards instead of being reloaded for every sentence.

    Returns:
        None. The paths are written to standard output, one per line.
    """
    if nlp is None:
        # Loading the model is the expensive step, so keep the default
        # pipeline on the function object and reuse it across calls.
        nlp = getattr(prcess, '_default_nlp', None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            prcess._default_nlp = nlp

    doc = nlp(string)

    # Root token of every noun chunk (in iteration order) and a lookup from
    # that root token back to the chunk it belongs to.
    chunk_roots = []
    root_to_chunk = {}
    for chunk in doc.noun_chunks:
        chunk_roots.append(chunk.root)
        root_to_chunk[chunk.root] = chunk

    paths = []
    for start in chunk_roots:
        path = [start]
        word = start

        # The sentence root is the token that is its own head.
        while word != word.head:
            relation = word.dep_
            word = word.head

            # Dict membership avoids rescanning the list of roots on every
            # step of the climb.
            if relation != 'conj' and word in root_to_chunk:
                path.append(word)

        # Collected leaf-first; report root-first.
        paths.append(path[::-1])

    for path in paths:
        print(' -> '.join(root_to_chunk[w].text for w in path))

In [26]:
# Run every sample sentence through prcess, framing each run with its input
# sentence and a separator line.
separator = '=' * 100

for sentence in test_list:
    print('例句:', sentence)
    print('输出:')
    prcess(sentence)
    print(separator)

例句: PhD degree in Material Science, Polymer Science, Chemical Engineering or Chemistry Discipline.
输出:
PhD degree
PhD degree -> Material Science
PhD degree -> Polymer Science
PhD degree -> Chemical Engineering
PhD degree -> Chemistry Discipline
例句: Strong knowledge in computer vision, including image registration, segmentation, classification, object detection.
输出:
Strong knowledge
Strong knowledge -> computer vision
Strong knowledge -> computer vision -> image registration
Strong knowledge -> computer vision -> segmentation
Strong knowledge -> computer vision -> classification
Strong knowledge -> computer vision -> object detection
例句: 5 years plus experience in project management, six sigma and DOE is preferred.
输出:
5 years
experience
5 years -> project management
DOE
例句: Experience working with and extracting value from large、 disconnected、 unstructured datasets
输出:
Experience
Experience -> value
Experience -> large、 disconnected、 unstructured datasets
例句: Develop a test plan includ