In [1]:
import warnings
from pprint import pprint

import numpy as np
import pandas as pd
from IPython.core.display import display, HTML

warnings.filterwarnings('ignore')

# import data

In [2]:
import json
def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, "r") as handle:
        return json.load(handle)


# X holds the token lists, Y the matching per-token tag lists.
X = _read_json("NER_dataset/tran_X.json")
Y = _read_json("NER_dataset/tran_Y.json")

# building training set

In [3]:
# Empty containers for the train / test split (sentences and their labels).
train_x, train_y = [], []
test_x, test_y = [], []

In [4]:
# Hold out every 7th example (indices 0, 7, 14, ...) as the test set and
# use everything else for training.
#
# The four lists are rebuilt from scratch here so that re-running this cell
# cannot append the same examples a second time.
train_x, train_y = [], []
test_x, test_y = [], []

for ind, sentence in enumerate(X):
    if ind % 7 == 0:
        target_x, target_y = test_x, test_y
    else:
        target_x, target_y = train_x, train_y

    target_x.append(sentence)
    target_y.append(Y[ind])

In [5]:
# Keep only examples with more than MIN_LEN_EXCLUSIVE and fewer than
# MAX_LEN_EXCLUSIVE items.
MIN_LEN_EXCLUSIVE = 2
MAX_LEN_EXCLUSIVE = 120


def _filter_pairs(sentences, labels):
    """Drop (sentence, labels) pairs whose length is out of range.

    The two lists are filtered together, pair by pair, so a sentence can never
    end up aligned with the labels of a different sentence. A pair is kept only
    when both its sentence and its label list are inside the length bounds.

    Returns:
        (kept_sentences, kept_labels) as two new lists of equal length.
    """
    kept = [
        (sentence, tags)
        for sentence, tags in zip(sentences, labels)
        if MIN_LEN_EXCLUSIVE < len(sentence) < MAX_LEN_EXCLUSIVE
        and MIN_LEN_EXCLUSIVE < len(tags) < MAX_LEN_EXCLUSIVE
    ]
    return [sentence for sentence, _ in kept], [tags for _, tags in kept]


train_x, train_y = _filter_pairs(train_x, train_y)
test_x, test_y = _filter_pairs(test_x, test_y)

# sample data

In [6]:
# Show the first training sentence together with its label sequence.
print('sentence:', train_x[0], 'label:', train_y[0], sep='\n')

sentence:
['Experience', 'working', 'on', 'front', '-', 'back', '-', 'end', ',', 'or', 'full', '-', 'stack', 'web', 'development', 'projects', '.']
label:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O']


In [7]:
# Report how many sentences ended up in each split.
for caption, split in (('trainset_size:', train_x), ('testset_size:', test_x)):
    print(caption, len(split))

trainset_size: 525
testset_size: 88


# building NER model

In [8]:
import tensorflow as tf
from pprint import pprint

In [9]:
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import *
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy
from keras.preprocessing.sequence import pad_sequences
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import Tokenizer
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [10]:
# Display the installed TensorFlow version as this cell's output.
tf.__version__

'1.15.0'

In [11]:
class bert_bilstm_crf:
    """Sequence tagger: BERT encoder -> bidirectional LSTM -> CRF output layer.

    The constructor reads the tag dictionary and the vocabulary from the
    ``uncased_L-2_H-128_A-2/`` directory and builds the compiled Keras model.

    Attributes:
        label: maps a tag name to its id (stored as a string, as read from file).
        _label: reverse mapping, id string -> tag name.
        vocab: maps a vocabulary entry to its line index in ``vocab.txt``.
        model: the compiled Keras model.
        history: object returned by the most recent ``fit`` call, or None.
    """

    # File the trained model is written to by TrainModel and read back from
    # by ModelPredict.
    WEIGHTS_PATH = 'NER_model/my_NER_model'

    def __init__(self, max_seq_length, batch_size, epochs, lstm_dim):
        """Store the hyper-parameters, load label/vocab files and build the model.

        Args:
            max_seq_length: fixed length every input sequence is padded to.
            batch_size: batch size used for fit / evaluate.
            epochs: number of training epochs.
            lstm_dim: number of units of each LSTM direction.
        """
        self.label = {}
        self._label = {}
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.lstmDim = lstm_dim
        self.history = None
        # True once the in-memory weights are known to match WEIGHTS_PATH
        # (either just saved by TrainModel or loaded by ModelPredict).
        self._weights_loaded = False
        self.LoadLabel()
        self.model = self.Model()

    ##############################################
    def LoadLabel(self):
        """Populate ``self.label`` / ``self._label`` and ``self.vocab`` from disk.

        Each non-blank line of ``tag_dict.txt`` is split on whitespace; the
        first field is the tag name and the second its id. Lines with fewer
        than two fields are skipped. ``vocab.txt`` is read line by line and
        every stripped line is mapped to the current size of the dictionary.
        """
        # label
        label_path = r"uncased_L-2_H-128_A-2/tag_dict.txt"
        # Read-only access is enough; the context manager closes the file.
        with open(label_path, 'r', encoding='utf-8') as f_label:
            for line in f_label:
                content = line.split()
                if len(content) < 2:
                    continue
                self.label[content[0]] = content[1]
                self._label[content[1]] = content[0]

        # dict
        self.vocab = {}
        vocab_path = r"uncased_L-2_H-128_A-2/vocab.txt"
        with open(vocab_path, 'r', encoding='utf-8') as f_vocab:
            for line in f_vocab:
                self.vocab[line.strip()] = len(self.vocab)

    def Model(self):
        """Build and compile the BERT + BiLSTM + CRF Keras model.

        Returns:
            The compiled model (Adam with learning rate 1e-4, CRF loss,
            CRF accuracy metric). The model summary is printed as a side effect.
        """
        model_path = r"uncased_L-2_H-128_A-2/"
        bert = load_trained_model_from_checkpoint(
            model_path + "bert_config.json",
            model_path + "bert_model.ckpt",
            seq_len=self.max_seq_length
            )
        # make bert layer trainable
        for layer in bert.layers:
            layer.trainable = True
        x1 = Input(shape=(None,))
        x2 = Input(shape=(None,))
        bert_out = bert([x1, x2])
        lstm_out = Bidirectional(LSTM(self.lstmDim,
                                         return_sequences=True,
                                         dropout=0.2,
                                         recurrent_dropout=0.2))(bert_out)
        crf_out = CRF(len(self.label), sparse_target=True)(lstm_out)
        model = Model([x1, x2], crf_out)
        model.summary()

        model.compile(
            optimizer=Adam(1e-4),
            loss=crf_loss,
            metrics=[crf_accuracy]
        )

        return model

    def PreProcessInputData(self, text):
        """Convert token lists into the two fixed-length model inputs.

        Every sequence becomes ``[101] + token ids + [102]`` padded with 0 up
        to ``max_seq_length``. Tokens missing from ``self.vocab`` get id 1.
        Sequences with more than ``max_seq_length - 2`` tokens are truncated so
        that every row has exactly ``max_seq_length`` entries.

        Args:
            text: iterable of token lists.

        Returns:
            (word_labels, seq_types): two lists of lists. ``seq_types`` holds 1
            for every non-padding position and 0 for padding.
        """
        # NOTE(review): 101 / 102 are presumably the [CLS] / [SEP] ids of the
        # BERT vocabulary and 1 the fallback for unknown tokens - verify
        # against vocab.txt.
        # NOTE(review): the vocabulary lookup is case-sensitive; the notebook
        # lowercases tokens before prediction but not before training - confirm
        # that this is intended.
        # NOTE(review): seq_types is fed to the model's second input; confirm
        # a 1-for-token / 0-for-padding pattern is what that input expects.
        word_labels = []
        seq_types = []
        max_tokens = max(self.max_seq_length - 2, 0)

        for sequence in text:
            token_ids = [101]
            token_ids.extend(self.vocab.get(w, 1) for w in list(sequence)[:max_tokens])
            token_ids.append(102)

            padding = [0] * (self.max_seq_length - len(token_ids))
            seq_types.append([1] * len(token_ids) + padding)
            word_labels.append(token_ids + padding)

        return word_labels, seq_types


    def PreProcessOutputData(self, text):
        """Convert tag-name lists into a padded integer tag array.

        Each label list is mapped through ``self.label`` and wrapped in a
        leading and trailing 0 (the positions of the two extra input ids), then
        post-padded with 0 to ``max_seq_length``. Label lists longer than
        ``max_seq_length - 2`` are truncated the same way as the inputs so the
        two stay position-aligned.

        Args:
            text: iterable of tag-name lists.

        Returns:
            Array of shape (number of sequences, max_seq_length, 1).
        """
        max_tokens = max(self.max_seq_length - 2, 0)
        tags = []
        for line in text:
            tag = [0]
            tag.extend(int(self.label[item.strip()]) for item in list(line)[:max_tokens])
            tag.append(0)
            tags.append(tag)

        pad_tags = pad_sequences(tags, maxlen=self.max_seq_length, padding="post", truncating="post")
        result_tags = np.expand_dims(pad_tags, 2)
        return result_tags

    def TrainModel(self, train_data):
        """Fit the model on ``train_data`` and save it to ``WEIGHTS_PATH``.

        20% of the training data is used as validation split. The object
        returned by ``fit`` is kept in ``self.history``.

        Args:
            train_data: tuple ``(sentences, labels)`` of token lists and
                tag-name lists.
        """
        import os

        input_train, result_train = train_data

        # training set
        input_train_labels, input_train_types = self.PreProcessInputData(input_train)
        result_train = self.PreProcessOutputData(result_train)

        # NOTE(review): class_weight='auto' is passed through unchanged from
        # the original code; confirm the installed Keras version accepts a
        # string here and what it does with it.
        self.history = self.model.fit(x=[input_train_labels, input_train_types],
                       y=result_train,
                       validation_split=0.2,
                       batch_size=self.batch_size,
                       epochs=self.epochs,
                       shuffle=True,
                       verbose=1,
                       class_weight = 'auto')

        # Make sure the target directory exists so a finished training run is
        # not lost to a missing folder.
        save_dir = os.path.dirname(self.WEIGHTS_PATH)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        self.model.save(self.WEIGHTS_PATH)
        # The in-memory weights are exactly what was just written.
        self._weights_loaded = True
        return

    def Id2Label(self, ids):
        """Map a sequence of tag ids to tag names via ``self._label``."""
        return [self._label[str(tag_id)] for tag_id in ids]

    def Vector2Id(self, tags):
        """Return the argmax index of every score vector in ``tags``."""
        return [int(np.argmax(tag)) for tag in tags]

    def ModelPredict(self, sentence):
        """Predict one tag name per token of ``sentence``.

        The saved weights are loaded from ``WEIGHTS_PATH`` the first time this
        is called on an instance that has not just been trained; later calls
        reuse the weights already in memory.

        Args:
            sentence: list of tokens.

        Returns:
            List of tag names. It has one entry per token, except that tokens
            beyond ``max_seq_length - 2`` are truncated and get no tag.
        """
        labels, types = self.PreProcessInputData([sentence])
        if not getattr(self, '_weights_loaded', False):
            self.model.load_weights(self.WEIGHTS_PATH)
            self._weights_loaded = True
        tags = self.model.predict([labels, types])[0]

        # Position 0 holds the leading special id, so token i sits at i + 1.
        n_tokens = min(len(sentence), max(self.max_seq_length - 2, 0))
        result = self.Vector2Id(tags[1:n_tokens + 1])
        return self.Id2Label(result)

    def EvalModel(self, valid_data):
        """Evaluate the model on ``valid_data`` and print the result.

        Args:
            valid_data: tuple ``(sentences, labels)`` in the same format as the
                training data.
        """
        input_valid, result_valid = valid_data
        # validation set
        input_valid_labels, input_valid_types = self.PreProcessInputData(input_valid)
        result_valid = self.PreProcessOutputData(result_valid)

        res = ( self.model.evaluate(x=[input_valid_labels, input_valid_types],
                           y=result_valid,batch_size=self.batch_size) )
        print(res)
        return

# training

In [12]:
# Pair each split's sentences with its labels.
train_data, test_data = (train_x, train_y), (test_x, test_y)

# model hyper-parameters
max_seq_length, batch_size, epochs, lstmDim = 128, 16, 20, 64

model = bert_bilstm_crf(max_seq_length, batch_size, epochs, lstmDim)
model.TrainModel(train_data)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 multiple             4320256     input_1[0][0]                    
                                                                 input_2[0][0]         

Epoch 19/20
Epoch 20/20


# testing

In [13]:
def print_predict( model, tem):
    """Print the spans the model tags in the token list ``tem``.

    The tokens are lowercased before being passed to ``model.ModelPredict``.
    A token tagged 'B' is printed as ``| token `` and a token tagged 'I' as
    ``token ``; all other tokens are skipped. The output starts with an empty
    line followed by the '【Extract Result】' prefix and ends with a newline.
    """
    predicted_tags = model.ModelPredict([token.lower() for token in tem])

    pieces = ['\n【Extract Result】']
    for position, tag in enumerate(predicted_tags):
        if tag == 'B':
            pieces.append('| ' + str(tem[position]) + ' ')
        elif tag == 'I':
            pieces.append(str(tem[position]) + ' ')

    print(''.join(pieces))

In [14]:
# For every test sentence print the text, the annotated spans and the
# spans predicted by the model, followed by a separator line.
for sample_no, tem in enumerate(test_x):
    print(' '.join(tem))
    print()
    print('【Real Result】', end='')

    gold_tags = test_y[sample_no]
    for pos in range(len(tem)):
        gold = gold_tags[pos]
        if gold == 'B':
            print('|', tem[pos], '', end='')
        elif gold == 'I':
            print(tem[pos], '', end='')

    print_predict(model, tem)
    print('=' * 120)

Experience in commonly used for data analysis such as Python , R , Julia , or SAS .

【Real Result】| data analysis | Python | R | Julia | SAS 
【Extract Result】| data analysis | Python | R | Julia | SAS 
Design solutions by mapping client business processes and challenges to an end - to - end solution on the platform utilizing data analytics , machine learning , and artificial intelligence to predict outcomes and prescribe actions

【Real Result】| data analytics | machine learning | artificial intelligence 
【Extract Result】| data analytics | machine learning | artificial intelligence 
Research and keep track of industry trends in Data Analytics / Statistic Modeling / Predictive Modeling / AI / Algorithms to ensure that the department is evaluating new techniques .

【Real Result】| Data Analytics | Statistic Modeling | Predictive Modeling | AI | Algorithms 
【Extract Result】| Data Analytics | Statistic | Modeling / | Predictive Modeling AI Algorithms 
Data Science experience   should have ex

【Real Result】| data acquisition tools | SQL | Apache Spark | large datasets | Hadoop | data mining 
【Extract Result】| data acquisition tools SQL | Apache Spark | Hadoop | data mining 
Possess basic understanding of system requirements for the deployment of the latest versions of R and Python and Scripting

【Real Result】| R | Python | Scripting 
【Extract Result】| Python | Scripting 
Develops and validates statistical forecasting models and tools .

【Real Result】| statistical forecasting models 
【Extract Result】| statistical forecasting models 
1 + year experience with Tableau or Power BI

【Real Result】| Tableau | Power BI 
【Extract Result】| Tableau BI 
Develop and deliver advanced statistical and mathematical models to support fact - based decision making within the organization

【Real Result】| mathematical models 
【Extract Result】| advanced | statistical | mathematical models decision 
RDBMS SQL Development .

【Real Result】| RDBMS SQL 
【Extract Result】| SQL Development 
Experience with

【Real Result】| SAS | QlikView | SQL | PL / SQL | MATLAB 
【Extract Result】| SAS | QlikView | SQL | PL / | SQL | MATLAB | statistical tools 
Software programming proficiency with Java , C , R , Python , and/or MATLAB

【Real Result】| Java | C | R | Python | MATLAB 
【Extract Result】| Software programming | Java | C | R | Python | and/or MATLAB 
We are seeking a Data Scientist to assist in analyzing and implementing data driven solutions to problems specific to risk analysis projects and programs .

【Real Result】| Data Scientist | risk analysis 
【Extract Result】| Data Scientist | analyzing | data 
As this Data Scientist , you will use established programmatic and quantitative methods to find patterns and relationships in large data sets .

【Real Result】| quantitative methods | find patterns 
【Extract Result】| Data Scientist | programmatic quantitative methods 
MS degree in Electrical Engineering , Computer Engineering , or Computer Science ; and at least 6 years of related experience resear

【Real Result】| software development | Java | Linux | Ruby | Python | .NET | C # | C++ 
【Extract Result】| software development enterprise software development stack | Java | Linux | Ruby / | Python | Windows development stack | .NET | C # | C++ 
Real world experience using Hadoop and the related query engines ( Hive / Impala )

【Real Result】| Hadoop | query engines | Hive | Impala 
【Extract Result】| Hadoop | query engines | Hive / | Impala 
Proficiency in designing & solving classification / prediction problems using open source libraries such as Scikit learn .

【Real Result】| classification | prediction | Scikit learn 
【Extract Result】| designing | classification / | prediction problems | open source | Scikit learn 
Deep understanding of and experience of modern machine learning techniques such as classification , recommendation systems , and other shallow learning techniques , data analytics , and statistical models .

【Real Result】| modern machine learning techniques | classification

# input a random JD, output the extracted skills

In [18]:
import spacy
from bs4 import BeautifulSoup

# Load the spaCy "en_core_web_sm" pipeline; `nlp` is used further down to
# split job-description text into tokens.
nlp = spacy.load("en_core_web_sm")



    Only loading the 'en' tokenizer.



In [19]:
def process( model, tem, raw_tokens=('<br>',) ):
    """Tag the tokens in ``tem`` and return them as HTML fragments.

    The tokens are lowercased before being passed to ``model.ModelPredict``.
    Every token tagged 'B' or 'I' is wrapped in a yellow-background ``<span>``;
    all other tokens are returned as plain text.

    Token text comes from external job descriptions and ends up inside an HTML
    document, so it is HTML-escaped. Tokens listed in ``raw_tokens`` are
    emitted verbatim so that callers can still inject markup such as the
    ``'<br>'`` line-break marker.

    Args:
        model: object exposing ``ModelPredict(tokens) -> list of tag names``.
        tem: list of tokens.
        raw_tokens: tokens that must not be escaped.

    Returns:
        List of HTML strings, one per predicted tag.
    """
    import html

    predict_list = model.ModelPredict( [w.lower() for w in tem] )

    temp_list = []
    for index, tag in enumerate(predict_list):
        token = str(tem[index])
        rendered = token if token in raw_tokens else html.escape(token)

        if tag == 'B' or tag == 'I':
            rendered = "<span style='background:yellow'>" + rendered + '</span>'

        temp_list.append(rendered)

    return temp_list

In [21]:
# Read the job-description table and keep only the rows whose Query column
# equals 'Java Developer'; the filtered frame is shown as the cell output.
JD_DF = pd.read_csv(r'../Data/JD.csv').loc[lambda frame: frame['Query'] == 'Java Developer']
JD_DF

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Query,Description
6,388,388,Java Developer,<P><STRONG>As a member of the Web and Portal D...
7,395,395,Java Developer,<BR>\r<TABLE border=0 cellSpacing=0 cellPaddin...
11,713,713,Java Developer,<strong>Application Developer-Senior-Java<br>\...
118,4631,4631,Java Developer,<b>Responsibilities:</b> Kforce is seeking a m...
132,4873,4873,Java Developer,<b>Responsibilities:</b> Our client is looking...
...,...,...,...,...
71866,1114580,1114580,Java Developer,Energize Global Services CJSC is looking for J...
71871,1114616,1114616,Java Developer,Workfront is a technology company that needs a...
71875,1114710,1114710,Java Developer,"EPAM Systems, Inc. is seeking Java Developers ..."
71889,1114949,1114949,Java Developer,Monitis is looking for a Java Developer who wi...


# get a random JD

In [22]:
# Pick one job description at random and strip its HTML markup.
# The parser is named explicitly so the extracted text does not depend on
# which optional HTML parsers happen to be installed.
text = JD_DF.sample(1).iloc[0]['Description']
soup = BeautifulSoup(text, 'html.parser')
text = soup.get_text()

# The text is split on the two-character sequences backslash-r / backslash-n.
for sentence in text.replace('\\r','\\n').split('\\n'):
    print(sentence)

Infosys is currently searching for a Java Developer to join their team in Bentonville, AR.  Infosys is the business critical technology partner for the world’s most successful organizations. As a global leader in Business Transformation, Infosys provides strategic business consulting, technology, engineering and outsourcing services to help clients leverage technology and create impactful and measurable business value for every IT investment.  
  
THIS POSITION IS LOCATED IN BENTONVILLE, AR.  RELOCATION TO BENTONVILLE, AR, IS REQUIRED.  RELOCATION ASSISTANCE IS AVAILABLE FOR QUALIFIED CANDIDATES.
  
POSITION RESPONSIBILITIES:

    Participate in estimation, staffing analysis and solutioning in order to provide inputs for preparing solution delivery for the proposal.
    Participate in discussions with customers to gather scope information and perform analysis of scope information in order to provide inputs for project scope documentation
    Participate in requirement elicitation proce

# output the NER result

In [23]:
# Split the description on the two-character sequences backslash-r /
# backslash-n, after turning backslash-t into a space.
sentences = text.replace('\\t',' ').replace('\\r','\\n').split('\\n')

# Tag each line (at most its first 120 tokens) and append a '<br>' marker
# so the lines stay separated in the rendered HTML.
total_text_list = []
for sent in sentences:
    line_tokens = [str(tok) for tok in list(nlp(sent))[:120]]
    total_text_list.extend(process(model, line_tokens + ['<br>']))

display(HTML('<html>' + ' '.join(total_text_list) + '</html>'))