In [1]:
import re
import string
import nltk
import gensim.downloader as api
import numpy as np
import timeit
from itertools import groupby

from pymilvus import Collection
from pymilvus import connections
from flask import Flask, render_template, request
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from joblib import load
from statistics import mean

In [50]:
print('Init server variables...')

# NLP helpers shared by the preprocessing functions:
# the WordNet lemmatizer and the Punkt English sentence tokenizer.
wl = WordNetLemmatizer()
sentences_splitter = nltk.data.load('tokenizers/punkt/english.pickle')

# Scaler object used later via `scaler.transform(...)` on sentence vectors.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
scaler = load('models/scaler.joblib')

Init server variables...


In [3]:
print('Init MilvusDB...')

# Register the "default" connection alias against the local Milvus server.
milvus_endpoint = dict(host='localhost', port='19530')
connections.connect(alias="default", **milvus_endpoint)

# Open the 'documents' collection and load it before it is searched below.
collection = Collection('documents')
collection.load()

Init MilvusDB...


In [4]:
print('Loading model...')
start = timeit.default_timer()

# Word vectors fetched through gensim's downloader API (the recorded run took ~35 s).
wv = api.load('word2vec-google-news-300')

# Vocabulary as a set, used for the `word in index` membership test
# inside avg_feature_vector.
index2word_set = set(wv.index_to_key)

stop = timeit.default_timer()
print(f'Model loaded in {stop - start} seconds!')

Loading model...
Model loaded in 34.58765059999132 seconds!


In [39]:
# Cleaning
def cleaning(s):
    """Normalise raw text into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase and trim the input;
      2. drop HTML-like tags (``<...>`` on a single line);
      3. replace every ASCII punctuation character with a space;
      4. delete any remaining character that is neither a word character
         nor whitespace (e.g. non-ASCII punctuation);
      5. replace digits with spaces;
      6. collapse whitespace runs to one space and trim the result.

    Note that URLs and hashtags are not removed as units: their punctuation
    is replaced by spaces and the word parts are kept.

    Args:
        s: Text to clean (``str``).

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    s = s.lower().strip()
    # Strip markup tags before punctuation removal destroys the angle brackets.
    s = re.sub(r'<.*?>', '', s)
    # ASCII punctuation becomes a space so "a-b" yields two tokens, not "ab".
    # This also neutralises bracketed references such as "[12]": the brackets
    # turn into spaces here and the digits are blanked below.
    s = re.sub('[%s]' % re.escape(string.punctuation), ' ', s)
    # Anything left that is not a word character or whitespace is dropped outright.
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r'\d', ' ', s)
    # Raw-string pattern (no invalid "\s" escape); the final strip removes the
    # padding that blanked-out digits or punctuation leave at either end.
    return re.sub(r'\s+', ' ', s).strip()


# Remove stopword
def stopword(s):
    """Remove English stopwords from a whitespace-separated string.

    Args:
        s: Text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    # Evaluate the stopword list once per call (not once per token) and keep it
    # in a set so each membership test is a hash lookup instead of a list scan.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(token for token in s.split() if token not in english_stopwords)


# Helper that maps NLTK part-of-speech tags to WordNet POS constants
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unknown tags are treated as nouns.
    return wordnet.NOUN


# Lemmatize each token of the sentence using its part-of-speech tag
def lemmatizer(s):
    """Lemmatize every token of ``s`` and return them joined by spaces.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token is then lemmatized by ``wl`` using the
    WordNet POS that ``get_wordnet_pos`` derives from its tag.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(word_tokenize(s)):
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)


# Preprocessing
def preprocess(s):
    """Run the full text pipeline: cleaning, then stopword removal, then lemmatization."""
    return lemmatizer(stopword(cleaning(s)))


# Feature extraction
def avg_feature_vector(sentence, model, num_features, index):
    """Average the vectors of the sentence's known words.

    Args:
        sentence: Whitespace-separated text.
        model: Mapping-like object; ``model[word]`` yields that word's vector.
        num_features: Length of the returned vector.
        index: Container of known words; only tokens found in it are used.

    Returns:
        The element-wise mean of the vectors of all known tokens, or an
        all-zero float32 vector when no token is known.
    """
    known_tokens = [token for token in sentence.split() if token in index]

    total = np.zeros((num_features,), dtype='float32')
    for token in known_tokens:
        total = total + model[token]

    if not known_tokens:
        # Nothing to average: return the zero vector unchanged.
        return total
    return total / len(known_tokens)


In [53]:
# Sample input. `texts` ends up holding the list of sentences found in it.
raw_text = "Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data."
texts = sentences_splitter.tokenize(raw_text)

clean_texts = [preprocess(text) for text in texts]

# Embed every preprocessed sentence (one row per sentence) instead of a
# hard-coded copy of the expected preprocessing output.
vectors = np.array([
    avg_feature_vector(text, model=wv, num_features=300, index=index2word_set)
    for text in clean_texts
])
# `scaler` is the object loaded in the "Init server variables" cell; it is not
# reloaded from disk here.
vectors = scaler.transform(vectors)
# Scale each row to unit L2 norm.
# NOTE(review): a row that is all zeros after scaling would produce NaNs here.
vectors = np.array([vector / np.linalg.norm(vector) for vector in vectors])
vectors

array([[ 1.81738064e-02, -5.25945686e-02,  5.41344695e-02,
         2.36477982e-02, -7.01663718e-02,  2.99715064e-02,
        -2.35252338e-03,  6.56126812e-02,  8.41437206e-02,
        -3.24970931e-02, -2.57467590e-02, -5.67510650e-02,
         7.94855319e-03,  4.59639393e-02,  5.71175814e-02,
        -3.75173613e-02, -7.08127990e-02,  2.49954164e-02,
        -1.97182372e-02, -1.25621453e-01,  1.01160118e-02,
         1.36857545e-02, -3.66127342e-02,  6.53620511e-02,
        -1.99537426e-02, -1.98749676e-02,  7.50762671e-02,
         2.08483525e-02, -8.18253905e-02,  4.27742861e-02,
         7.33862221e-02, -8.67684409e-02,  4.29857299e-02,
         1.17880385e-02, -5.15621752e-02, -5.59849432e-03,
         4.91640754e-02, -3.36822756e-02,  5.77653982e-02,
        -6.21243753e-02,  8.22758377e-02,  6.57420531e-02,
        -7.51300901e-02,  7.50541082e-03,  5.87132014e-02,
        -7.31290728e-02, -1.92129798e-02, -1.73566933e-03,
        -4.20353524e-02, -1.34568494e-02,  2.61305291e-0

In [54]:
clean_texts

['predictive model allow subject specific inference analyze disease related alteration neuroimaging data']

In [56]:
# A hit only counts as a match when its reported distance exceeds this value.
SIMILARITY_THRESHOLD = 0.85

# One query per row of `vectors`; only the single best hit is requested.
query_result = collection.search(
    data=vectors,
    anns_field='vector',
    param=dict(metric_type='IP', params=dict(nprobe=1)),
    limit=1,
    output_fields=['title', 'text', 'hash']
)

results = []   # per query: id of the matched document, or None when nothing matched
matches = []   # one dict per matched query
penalty = 0    # sum of (1 - distance) for matches, plus 1 for every non-match
for i, hits in enumerate(query_result):
    best = hits[0] if len(hits) > 0 else None
    if best is not None and best.distance > SIMILARITY_THRESHOLD:
        results.append(best.id)
        matches.append({
            'index': i,
            'id': best.id,
            'title': best.entity.get('title'),
            'text': best.entity.get('text'),
            'hash': best.entity.get('hash'),
            'distance': round(best.distance * 100, 1)
        })
        penalty += 1 - best.distance
    else:
        penalty += 1
        results.append(None)

# groupby only merges adjacent items, so sort by the grouping key first.
matches = sorted(matches, key=lambda x: x['hash'])
reports = [(key, list(group)) for key, group in groupby(matches, key=lambda x: x['hash'])]
# Groups with the highest mean 'distance' value come first.
reports = sorted(reports, key=lambda x: mean([el['distance'] for el in x[1]]), reverse=True)

# Round after the subtraction: rounding first and then computing `100 - value`
# leaves float noise in the result (e.g. 100 - 99.91 -> 0.09000000000000341).
score = round(100 - penalty / len(vectors) * 100, 2)
str(query_result)

'["[\'(distance: 1.0000001192092896, id: 435919503349404884)\']"]'