In [10]:
import tensorflow as tf
import pandas as pd
from pprint import pprint

from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import *
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy
from keras.preprocessing.sequence import pad_sequences
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import Tokenizer
import matplotlib.pyplot as plt

In [11]:
class bert_bilstm_crf:
    def __init__(self, max_seq_length, batch_size, epochs, lstm_dim):
        self.label = {}
        self._label = {}
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.lstmDim = lstm_dim
        self.label_path = r"uncased_L-2_H-128_A-2\tag_dict.txt"
        self.vocab_path = r"uncased_L-2_H-128_A-2\vocab.txt"
        self.model_path = r"uncased_L-2_H-128_A-2/"
        
        self.LoadLabel()
        self.model = self.Model()
        self.model.load_weights('NER_model/my_NER_model')
        
    ##############################################
    def LoadLabel(self):
        #label
        f_label = open(self.label_path, 'r+', encoding='utf-8')
        for line in f_label:
            content = line.strip().split()
            self.label[content[0].strip()] = content[1].strip()
            self._label[content[1].strip()] = content[0].strip()
            
        #dict
        self.vocab = {}
        with open(self.vocab_path, 'r+', encoding='utf-8') as f_vocab:
            for line in f_vocab.readlines():
                self.vocab[line.strip()] = len(self.vocab)

    def Model(self):
        bert = load_trained_model_from_checkpoint(
            self.model_path + "bert_config.json",
            self.model_path + "bert_model.ckpt",
            seq_len=self.max_seq_length
            )
        #make bert layer trainable
        for layer in bert.layers:
            layer.trainable = True
        x1 = Input(shape=(None,))
        x2 = Input(shape=(None,))
        bert_out = bert([x1, x2])
        lstm_out = Bidirectional(LSTM(self.lstmDim,
                                         return_sequences=True,
                                         dropout=0.2,
                                         recurrent_dropout=0.2))(bert_out)
        crf_out = CRF(len(self.label), sparse_target=True)(lstm_out)
        model = Model([x1, x2], crf_out)
        model.summary()
        
        model.compile(
            optimizer=Adam(1e-4),
            loss=crf_loss,
            metrics=[crf_accuracy]
        )
        
        return model

    def PreProcessInputData(self, text):
        word_labels = []
        seq_types = []
        
        for sequence in text:
            len_text = len(sequence)
            
            ###########################################
            temp_word_labels = []
            
            temp_word_labels.append( 101 )            
            for w in sequence:
                temp_word_labels.append( self.vocab.get(w,1) )
            temp_word_labels.append( 102 )
            
            ###########################################
            temp_seq_types = [1] * len(temp_word_labels) +  [0] * (self.max_seq_length - len( temp_word_labels ))
            temp_word_labels = temp_word_labels + [0] * (self.max_seq_length - len( temp_word_labels ))
            
            word_labels.append( temp_word_labels )
            seq_types.append( temp_seq_types )
            
        return word_labels, seq_types


    def PreProcessOutputData(self, text):
        tags = []
        for line in text:
            tag = [0]
            for item in line:
                tag.append(int(self.label[item.strip()]))
            tag.append(0)
            tags.append(tag)

        pad_tags = pad_sequences(tags, maxlen=self.max_seq_length, padding="post", truncating="post")
        result_tags = np.expand_dims(pad_tags, 2)
        return result_tags

    def TrainModel(self, train_data):
        input_train, result_train = train_data
        input_test, result_test = test_data
        
        #训练集
        input_train_labels, input_train_types = self.PreProcessInputData(input_train)
        result_train = self.PreProcessOutputData(result_train)
        
        #测试集
#         input_test_labels, input_test_types = self.PreProcessInputData(input_test)
#         result_test = self.PreProcessOutputData(result_test)
        
        history = self.model.fit(x=[input_train_labels, input_train_types],
                       y=result_train,
                       validation_split=0.2,
                       batch_size=self.batch_size,
                       epochs=self.epochs,
                       shuffle=True,
                       verbose=1,
                       class_weight = 'auto')
        return

    def Id2Label(self, ids):
        result = []
        for id in ids:
            result.append(self._label[str(id)])
        return result

    def Vector2Id(self, tags):
        result = []
        for tag in tags:
            result.append(np.argmax(tag))
        return result

    def ModelPredict(self, sentence):
        labels, types = self.PreProcessInputData([sentence])
        tags = self.model.predict([labels, types])[0]
        
        result = []
        for i in range(1, len(sentence) + 1):
            result.append(tags[i])
        result = self.Vector2Id(result)
        tag = self.Id2Label(result)
        return tag

    def EvalModel(self, valid_data):
        input_valid, result_valid = valid_data
        #训练集
        input_valid_labels, input_valid_types = self.PreProcessInputData(input_valid)
        result_valid = self.PreProcessOutputData(result_valid)
        
        res = ( self.model.evaluate(x=[input_valid_labels, input_valid_types],
                           y=result_valid,batch_size=self.batch_size) )
        print(res)
        return

In [12]:
max_seq_length = 128
batch_size = 16
epochs = 20
lstmDim = 64

model = bert_bilstm_crf( max_seq_length, batch_size, epochs, lstmDim )

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_8 (Model)                 multiple             4320256     input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, None, 128)    98816       model_8[1][0]                    
__________

In [13]:
import spacy
from bs4 import BeautifulSoup

nlp = spacy.load("en_core_web_sm")

In [22]:
def print_predict( model, tem):
    predict_list = model.ModelPredict( [w.lower() for w in tem] )

    result_list = []
    tem_skill = ''
    
#     print()
#     print( '【Extract Result】', end='' )
    i = 0
    
    pre = ''
    for i in range( len( predict_list ) ):
        if predict_list[i] == 'B':
            if tem_skill != '':
                result_list.append( tem_skill )
                tem_skill = ''
                
#             print( '|',tem[i],'',end='' )
            tem_skill = tem[i]
            
        if predict_list[i] == 'I':
#             print( tem[i],'',end='' )
            tem_skill += ' ' + tem[i]
            
        if predict_list[i] == '0':
            if pre == 'B' or pre == 'I':
                result_list.append( tem_skill )
                tem_skill = ''
            tem_skill = ''
            
        pre = predict_list[i]
        
    if tem_skill != '':
        result_list.append( tem_skill )
        
    result_list = [w.strip() for w in result_list if w.strip() != '']
    print()
    return result_list

In [23]:
JD_DF = pd.read_csv(r'../Data/JD.csv')
JD_DF = JD_DF[JD_DF['Query'] == 'Java Developer']

In [24]:
text = JD_DF.sample(1).iloc[0]['Description']
soup = BeautifulSoup(text)
text = soup.get_text()
text



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


'Qualifications: The successful candidate will have Object-Oriented experience in these areas: • Spring Framework • Hibernate/JPA • Web Services in java • Java Swing Knowledge The successful candidate will also have a good working knowledge of the following: • C, C++, Make, Makefiles • Java Core API’s • Using Subversion • Web Logic • Unix Scripts knowledge a plus (.ksh,. bsh,. csh) • COBOL knowledge is a plus • Working knowledge of Perl Scripts • Excellent knowledge of Oracle SQL, must be able to read, write and debug Oracle scripts. This position requires the ability to multi-task in a high-paced environment.\xa0\xa0\\r\\rResponsibilities: Build an application with persistent domain objects using Dependency Injection, AOP (AspectJ) and Spring Object Relational Mapping (ORM with JPA/Hibernate) and with Ant build files.'

In [28]:
print(text)
print('='*120)

total_skill = []
for sentence in text.replace('\\t',' ').replace('\\r','\\n').split('\\n'):
    if len(sentence.strip()) > 0:
        print('-'*120)
        
        print( sentence.strip() )
        tem_list = print_predict( model, [str(w) for w in list(nlp(sentence.strip()))[:120] ] ) 
        print( tem_list )
        
        total_skill += tem_list

print('='*120)
print( list(set(total_skill)) )

Qualifications: The successful candidate will have Object-Oriented experience in these areas: • Spring Framework • Hibernate/JPA • Web Services in java • Java Swing Knowledge The successful candidate will also have a good working knowledge of the following: • C, C++, Make, Makefiles • Java Core API’s • Using Subversion • Web Logic • Unix Scripts knowledge a plus (.ksh,. bsh,. csh) • COBOL knowledge is a plus • Working knowledge of Perl Scripts • Excellent knowledge of Oracle SQL, must be able to read, write and debug Oracle scripts. This position requires the ability to multi-task in a high-paced environment.  \r\rResponsibilities: Build an application with persistent domain objects using Dependency Injection, AOP (AspectJ) and Spring Object Relational Mapping (ORM with JPA/Hibernate) and with Ant build files.
------------------------------------------------------------------------------------------------------------------------
Qualifications: The successful candidate will have Object