In [1]:
# Imports
from bs4 import BeautifulSoup
import json 
import ast
import pandas as pd
import numpy as np
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from abc import ABCMeta, abstractmethod
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text
from tqdm import tqdm
import re,os,requests
import yake
from math import log
from flask import Flask, jsonify
from flask_restful import Resource, Api, reqparse
from flask_cors import CORS

# Load Encoder Model 
class TFEncoder(metaclass = ABCMeta):
    """Base encoder to be used for all encoders."""
    def __init__(self, model_path:str):
        self.model = hub.load(model_path)
 
    @abstractmethod
    def encode(self, text:list):
        """Encodes text.
        Text: should be a list of strings to encode
        """
 
class USE(TFEncoder):
    """Universal sentence encoder"""
    def __init__(self, model_path):
        super().__init__(model_path)
 
    def encode(self, text):
        return self.model(text).numpy()
 
class USEQA(TFEncoder):
    """Universal sentence encoder trained on Question Answer pairs"""
    def __init__(self, model_path):
        super().__init__(model_path)
 
    def encode(self, text):
        return self.model.signatures['question_encoder'](tf.constant(s))['outputs'].numpy()
 
class BERT():
    """BERT models"""
    def __init__(self, model_name, layers="-2", pooling_operation="mean"):
        self.embeddings = BertEmbeddings(model_name, layers=layers, pooling_operation=pooling_operation)
 
        self.document_embeddings = DocumentPoolEmbeddings([self.embeddings], fine_tune_mode='nonlinear')
 
    def encode(self, text):
        sentence = Sentence(text)
        self.document_embeddings.embed(sentence)
        return sentence.embedding.detach().numpy().reshape(1, -1)
 
model_path = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
 
encoder = USE(model_path)

app = Flask(__name__)
CORS(app)
api = Api(app)

In [2]:
df_SID_KB = pd.read_csv("SID_KB.csv")
df_KB_embedding = pd.read_csv("SID_Embedding.csv")

In [3]:
def search(test_keypoints):
    test_keypoints = [test_keypoints] if isinstance(test_keypoints, str) else test_keypoints
    from scipy import spatial
    NUM_TO_RERANK = 10
    
    # Get Test Embedding 
    test_descriptors = []
    for i,v in enumerate(tqdm(test_keypoints)):
        try:
            test_descriptors.append(encoder.encode([v]))
        except:
            print("problem in ", test_keypoints[i])
    
    test_descriptors = np.array(test_descriptors).reshape(len(test_keypoints),512)
    test_keypoints = np.array(test_keypoints).reshape(len(test_keypoints),1)
    
    # define test identifiers and descriptors
    test_ids, test_embeddings = test_keypoints, test_descriptors
    
    # define train identifiers and descriptors
    train_ids_kb, train_embeddings_kb =  df_SID_KB['ArticleNumber'].values.reshape(len(df_SID_KB),1) , df_KB_embedding.values
    #train_ids_case, train_embeddings_case =  df_SID_SFDC['CaseNumber'].values.reshape(len(df_SID_SFDC),1) , df__SFDC_embedding.values

    for test_index in range(len(test_embeddings)):
        distances_kb = spatial.distance.cdist(test_embeddings[np.newaxis, test_index, :], train_embeddings_kb,'cosine')[0]
        partition_kb = np.argpartition(distances_kb, NUM_TO_RERANK)[:NUM_TO_RERANK]
        nearest_kb = sorted([train_ids_kb[p] for p in partition_kb])
        print(nearest_kb)
        
        """
        distances_case = spatial.distance.cdist(test_embeddings[np.newaxis, test_index, :], train_embeddings_case,'cosine')[0]
        partition_case = np.argpartition(distances_case, NUM_TO_RERANK)[:NUM_TO_RERANK]
        nearest_case = sorted([train_ids_case[p] for p in partition_case])
        """
        
    df_Results_KB = df_SID_KB[df_SID_KB['ArticleNumber'].isin(nearest_kb)]
    #df_Results_Case = df_SID_SFDC[df_SID_SFDC['CaseNumber'].isin(nearest_case)]['CaseNumber','Subject']
    print(df_Results_KB[['ArticleNumber', 'Title']].to_json(orient="records"))
    return df_Results_KB[['ArticleNumber', 'Title']].to_json(orient="records")

In [4]:
class Articles(Resource):
        def get(self):
            parser = reqparse.RequestParser()  # initialize
            parser.add_argument('msg', required=True)  # add args
            args = parser.parse_args()  # parse arguments to dictionary
            response = app.response_class(
                response=json.dumps(search(args['msg'])),
                status=200,
                mimetype='application/json'
            )
            return response  # return data and 200 OK code
api.add_resource(Articles, '/getArticles')  # '/users' is our entry point

In [None]:
if __name__ == '__main__':
    app.run(port=80)  # run our Flask app

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:80/ (Press CTRL+C to quit)
100%|██████████| 1/1 [00:04<00:00,  4.68s/it]
INFO:werkzeug:127.0.0.1 - - [02/Feb/2021 17:18:16] "[37mGET /getArticles?msg=ki HTTP/1.1[0m" 200 -


[array([17049], dtype=int64), array([18137], dtype=int64), array([20379], dtype=int64), array([24493], dtype=int64), array([25766], dtype=int64), array([30150], dtype=int64), array([30952], dtype=int64), array([33269], dtype=int64), array([35297], dtype=int64), array([36972], dtype=int64)]
[{"ArticleNumber":24493,"Title":"A custom query to show assigned or unassigned software tokens"}]


INFO:werkzeug:127.0.0.1 - - [02/Feb/2021 17:19:13] "[37mOPTIONS /getArticles?msg=server HTTP/1.1[0m" 200 -
100%|██████████| 1/1 [00:00<00:00, 25.00it/s]
INFO:werkzeug:127.0.0.1 - - [02/Feb/2021 17:19:13] "[37mGET /getArticles?msg=server HTTP/1.1[0m" 200 -


[array([13474], dtype=int64), array([16446], dtype=int64), array([17401], dtype=int64), array([18524], dtype=int64), array([28060], dtype=int64), array([29654], dtype=int64), array([30659], dtype=int64), array([36059], dtype=int64), array([37589], dtype=int64), array([38196], dtype=int64)]
[{"ArticleNumber":17401,"Title":"RSA Authentication Manager 8.x shows replication status as \"Instance Offline\""},{"ArticleNumber":38196,"Title":"RSA Replication Service fails to start on RSA Authentication Manager 8.x primary"}]


INFO:werkzeug:127.0.0.1 - - [02/Feb/2021 17:20:39] "[37mOPTIONS /getArticles?msg=server HTTP/1.1[0m" 200 -
100%|██████████| 1/1 [00:00<00:00, 21.74it/s]
INFO:werkzeug:127.0.0.1 - - [02/Feb/2021 17:20:39] "[37mGET /getArticles?msg=server HTTP/1.1[0m" 200 -


[array([13474], dtype=int64), array([16446], dtype=int64), array([17401], dtype=int64), array([18524], dtype=int64), array([28060], dtype=int64), array([29654], dtype=int64), array([30659], dtype=int64), array([36059], dtype=int64), array([37589], dtype=int64), array([38196], dtype=int64)]
[{"ArticleNumber":17401,"Title":"RSA Authentication Manager 8.x shows replication status as \"Instance Offline\""},{"ArticleNumber":38196,"Title":"RSA Replication Service fails to start on RSA Authentication Manager 8.x primary"}]


INFO:werkzeug:127.0.0.1 - - [02/Feb/2021 17:20:45] "[37mOPTIONS /getArticles?msg=server%20is%20bad HTTP/1.1[0m" 200 -
100%|██████████| 1/1 [00:00<00:00, 19.23it/s]
INFO:werkzeug:127.0.0.1 - - [02/Feb/2021 17:20:45] "[37mGET /getArticles?msg=server%20is%20bad HTTP/1.1[0m" 200 -


[array([17401], dtype=int64), array([18524], dtype=int64), array([28060], dtype=int64), array([30237], dtype=int64), array([30659], dtype=int64), array([32456], dtype=int64), array([33442], dtype=int64), array([35701], dtype=int64), array([37012], dtype=int64), array([37920], dtype=int64)]
[{"ArticleNumber":18524,"Title":"SecurID Radius Server Partners"},{"ArticleNumber":37920,"Title":"NTP error in RSA SecurID: Crtical event notification NTP out of sync"},{"ArticleNumber":30659,"Title":"RSA Authentication Manager 8.x services do not start after activating a new console certificate"}]
