In [1]:
import re
import pandas as pd
import numpy as np

import tensorflow.keras as keras
from tensorflow.keras.models import load_model
import tensorflow as tf

from flask import Flask, render_template, request

# 导入词表

In [2]:
import json
with open(r'dataset/word2idx.json', "r") as f:
    word2idx = json.load(f)

In [3]:
max_word_length = 500
max_seq_length = 512

# 导入模型

In [4]:
textCNN_model = load_model(r'model_save/model.h5')

layer_output = textCNN_model.get_layer('concatenate').output
intermediate_model = tf.keras.models.Model(inputs=textCNN_model.input,outputs=layer_output)

# 导入数据

In [5]:
data = pd.read_csv(r'dataset/JD_dataset.csv',usecols = ['Query','Description'])
data.sample(10)

Unnamed: 0,Query,Description
55701,Receptionist,<strong>Classification: </strong> Receptionis...
62628,Retail Sales Associate,<p>We are currently seeking Part Time Retail S...
29347,Business Analyst,<B>Job Classification: </B> Direct Hire \r\n\r...
58949,Retail Wireless Sales Consultant,"<p style=""text-align: center""><strong>GOWIRELE..."
19564,Inside Sales Representative,<DIV>\r<P><B><SPAN>An exciting opportunity now...
13006,Store Manager,"<p align=""justify""><strong>Check 'n Go </stron..."
69206,Sales / Franchise,<p>A <strong>Snap-on Tools franchise </strong>...
49431,Retail Sales Associate,"<div align=""left""><strong>&nbsp;&nbsp; </stron..."
23635,Restaurant Manager,"<p>Join a team that is all about sports, great..."
38091,Executive Assistant,<div>\r<p><span>A well-known utilities company...


# 文本预处理

In [6]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

def clean_text(text):
    text = text.replace(r'\\n', ' ').replace(r'\\r', ' ').replace(r'\r', ' ').replace(r'\n', ' ')
    text = cleanr.sub(' ', text)
    
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 

    text = ' '.join(word for word in text.split()) # remove stopwors from text
    return text

def string_process(l):
    return l.split()[:max_word_length]

data['Description'] = data['Description'].apply(clean_text)
data['word_list'] = data['Description'].apply( string_process )
data.sample(10)

Unnamed: 0,Query,Description,word_list
56770,Mobile Tool Sales / Franchise Distributor,do you enjoy outside sales interacting with pe...,"[do, you, enjoy, outside, sales, interacting, ..."
18334,Customer Service - Sales Representative - Part...,want to get started with a rapidly growing and...,"[want, to, get, started, with, a, rapidly, gro..."
49418,Store Manager,looking for a better opportunity looking for a...,"[looking, for, a, better, opportunity, looking..."
72176,Business Analyst,junior business analyst gather document manage...,"[junior, business, analyst, gather, document, ..."
46035,Business Analyst,a business analyst ba has a minimum of five 5 ...,"[a, business, analyst, ba, has, a, minimum, of..."
35472,Business Analyst,a pitney bowes business analyst for distributi...,"[a, pitney, bowes, business, analyst, for, dis..."
4156,Store Manager,candidates must have strong leadership skills ...,"[candidates, must, have, strong, leadership, s..."
26992,Customer Service Representative,classification customer service compensation 1...,"[classification, customer, service, compensati..."
32985,Sales Representative,,[]
57633,Staff Accountant,t staff accountant tnorthern va ttemp to hire ...,"[t, staff, accountant, tnorthern, va, ttemp, t..."


In [7]:
def PreProcessInputData( text ):
    word_labels = []

    for sequence in text:
        len_text = len(sequence)

        ###########################################
        temp_word_labels = []
        for w in sequence:
            temp_word_labels.append( word2idx.get( str(w).lower(),1 ) )

        ###########################################
        temp_word_labels = temp_word_labels + [0] * ( max_seq_length - len_text )
        word_labels.append( temp_word_labels )

    return word_labels

# 简历转向量矩阵

In [8]:
XX = np.array( PreProcessInputData( data['word_list'] ) )
intermediate_prediction = intermediate_model.predict( XX )

JD_Vector_List = []
for i in range(0,len(intermediate_prediction)):
    JD_Vector_List.append( intermediate_prediction[i][0][0] )

# 用 FAISS 对向量矩阵做索引

In [9]:
import numpy as np
import faiss                   # make faiss available

# 构造数据
import time
d = 500                           # dimension
nb = len(JD_Vector_List)                      # database size

np.random.seed(1234)             # make reproducible
xb = np.array( JD_Vector_List ).astype('float32')

In [10]:
# %time index = faiss.IndexFlatL2(d)   # build the index
index = faiss.IndexFlatL2(d)   # build the index

print(index.is_trained)
index.add(xb)                  # add vectors to the index
print(index.ntotal)

True
72292


# 启动 Flask

In [None]:
app = Flask(__name__)

@app.route('/get_simmilar/', methods=['POST'])
# 返回最接近的向量
def get_simmilar():
    print( 'get_simmilar' )
    
    number = 5
    my_clean_text = clean_text( eval(request.get_data())['CV'] )
    print( my_clean_text )
    
    D, I = index.search( intermediate_model.predict( np.array( PreProcessInputData( [  my_clean_text.split()[:max_word_length] ] ) ) )[0][0] , number)
    
    # 返回最接近的职位描述
    response = {}
    for ind in I[0]:
        print('-'*120)
        print( data['Query'].iloc[ind] )
        print( data['Description'].iloc[ind] )
        response[str(ind)] = data['Description'].iloc[ind]
    
    return str(response)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=3335)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


I0507 23:44:57.047859 139789835323200 _internal.py:122]  * Running on http://0.0.0.0:3335/ (Press CTRL+C to quit)
I0507 23:46:12.340965 139788311398144 _internal.py:122] 127.0.0.1 - - [07/May/2020 23:46:12] "[37mPOST /get_simmilar/ HTTP/1.1[0m" 200 -


get_simmilar
business analyst singapore graduated with a degree in economics from the university of queensland i have experience working in an office environment and team setting from my time in national service and during internships i am a diligent and a fast learner and i place utmost importance in ensuring quality and professionalism in my workwork experiencebusiness analystmaybank singaporejune 2018 to presentuat coordinator to liaise between users and it to rectify defects and report progress of testing to project director extraction and reporting of uat progression and results from hp alm end to end testing for payment and settlement related projects capturing and extracting relevant data and compilation of test scripts liaise with it to ensure all backend screens as400 correctly shows transactions from user testsinterncimb bank singapore commercial banking compliance and operational riskjanuary 2017 to february 2017 administrative paperwork checking and organising of audit docu