# DATA SCIENCE HACKATHON: INTELLIGENT SEARCH ENGINE
This guide demonstrates a baseline solution for this competition.

## Import 

In [None]:
import os
import json
import re
import multiprocessing
from collections import Counter
from itertools import chain
from multiprocessing import Pool
from functools import partial
from evaluation import mean_average_precision

import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.util import ngrams
from spacy.lang.en import English
from gensim.models import Word2Vec
from gensim.models.fasttext import FastText

from utils.dataset import create_dataframe, split_sentence, split_words
from utils.modelling import preparing_data_w2v
from utils.prediction import search_similar_text
from utils.processing import get_text, get_files_path
from utils.vectors import get_avg_vector, postprocess_vectors

## Initialization

In [None]:
# Number of CPUs reported by multiprocessing; later cells derive their worker counts from it.
cores = multiprocessing.cpu_count()
print('There are {} CPU cores.'. format(cores))

In [None]:
# Leave two cores free for the notebook and the OS, but never go below one worker:
# on machines with one or two cores `cores - 2` is zero or negative, and
# multiprocessing.Pool refuses to start with fewer than one process.
number_processes = max(1, cores - 2)

In [None]:
# spacy rule-based matching
# Blank English pipeline whose only component is the rule-based sentence splitter.
# NOTE(review): create_pipe() followed by add_pipe(component) looks like the spaCy 2.x
# calling convention; spaCy 3.x expects nlp_eng.add_pipe("sentencizer") — confirm the
# pinned spaCy version before upgrading.
nlp_eng = English()
sentencizer = nlp_eng.create_pipe("sentencizer")
nlp_eng.add_pipe(sentencizer)

In [None]:
# Category folder names; passed to get_files_path to select input folders and used
# later to decide which processed JSON files are loaded into the dataset.
# NOTE(review): 'Liquid _analysis' contains a space before the underscore — presumably it
# mirrors the real folder name on disk; verify before "fixing" it.
folder_list = [
    'Ashcroft', 
    'Density&Viscosity', 
    'Flow', 
    'Gas_analysis', 
    'Level', 
    'Liquid _analysis',
    'Pressure', 
    'Temperature', 
    'Valves_actuators'
]

In [None]:
# Root folders: one for the downloaded inputs, one for the artefacts this notebook creates.
folder_data = './'
folder_created_data = './'

# Sub-folder names for the raw parsed JSON files and for their processed counterparts.
folder_json = 'text_json'
folder_json_processed = 'data_text_processed'

# NOTE(review): path_documents is not referenced by any other visible cell — confirm it is still needed.
path_documents = './documents'

# Pickle file holding the text dataset (sentences and words per record).
dataset_texts = 'data_texts.pkl'

# Pickle file holding the document -> manufacturer / device model mapping.
dataset_names = 'device2document_map.pkl'

# Pickle files holding the vectors produced with each embedding model.
dataset_vectors_w2v = 'data_vectors_w2v.pkl'
dataset_vectors_ft = 'data_vectors_ft.pkl'

# File names under which the trained embedding models are saved.
model_name_w2v = 'word2vec.model'
model_name_ft = 'fasttext.model'

In [None]:
# Full paths built from the folder and file names configured above.
# Inputs (raw JSON, names pickle) live under folder_data; everything else is written
# under folder_created_data.
path_jsons = os.path.join(folder_data, folder_json)
path_json_processed = os.path.join(folder_created_data, folder_json_processed)

path_dataset_texts = os.path.join(folder_created_data, dataset_texts)

# NOTE(review): the names pickle is written by this notebook too, yet it is placed under
# folder_data rather than folder_created_data — confirm this is intentional.
path_dataset_names = os.path.join(folder_data, dataset_names)

path_dataset_vectors_w2v = os.path.join(folder_created_data, dataset_vectors_w2v)
path_dataset_vectors_ft = os.path.join(folder_created_data, dataset_vectors_ft)

path_model_w2v = os.path.join(folder_created_data, model_name_w2v)
path_model_ft = os.path.join(folder_created_data, model_name_ft)

In [None]:
#function for multiprocessing
def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)


def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel and return the concatenated result.

    ``df`` is cut into ``workers`` chunks with ``np.array_split``; each chunk is
    sent to a pool process that calls ``chunk.apply(func, **kwargs)``.

    Args:
        df: pandas object to process (the notebook passes a Series).
        func: callable handed to ``apply``; it is sent to the worker
            processes together with each chunk.
        **kwargs: must contain ``workers`` (number of processes and chunks);
            every other keyword is forwarded to ``apply``.

    Returns:
        ``pd.concat`` of the per-chunk results, in chunk order.

    Raises:
        KeyError: if ``workers`` is not supplied.
    """
    workers = kwargs.pop('workers')
    chunks = np.array_split(df, workers)
    # The context manager tears the pool down even when a worker raises; the
    # previous explicit close() was skipped on errors and the pool was never joined.
    with multiprocessing.Pool(processes=workers) as pool:
        result = pool.map(_apply_df, [(chunk, func, kwargs) for chunk in chunks])
    return pd.concat(result)

# Data Processing

After passing through the processing flow, the PDF documents are converted into the following data sets:
<li>documents (pdf);</li>
<li>document pages (images);</li>
<li>document texts with coordinates (json files);</li>
<li>texts embeddings (pickle file);</li>
<li>company and device names (pickle file).</li>

### Download Data

In [None]:
# !aws s3 cp s3://dsg-hackathon-dataset/text_json.zip .
# !unzip text_json.zip
# !aws s3 cp s3://dsg-hackathon-dataset/documents.zip .
# !unzip documents.zip -d documents
# !aws s3 cp s3://dsg-hackathon-dataset/validation.csv .

#### File Usage

There are 3 files provided for the baseline solution:
<li>text_json - folder contains parsed text. The text was parsed by Cloud Vision API. Apart from texts the model also returns text coordinates on page;</li>
<li>documents - folder contains PDF documents;</li>
<li>validation.csv - validation queries for local validation</li>

### Processing all files

In [None]:
# checking how many files in respective folders
# get_files_path(...) is consumed as a mapping: folder name -> list of files.
for fold, f_list in get_files_path(path_jsons, folder_list).items():
    print('Folder: {}.\nNumber files for processing: {} \n'.format(fold, len(f_list)))

In [None]:
# create processed json files by respective folders
# Each pool task receives the file list of one folder; get_text is pre-bound with the
# output folder and lang_threshold=0.5 (presumably a language-detection cut-off —
# TODO confirm in utils.processing).
# The value returned by pool.map is discarded, so get_text is presumably relied on for
# the files it writes to path_json_processed — verify.
files_list_dict = get_files_path(path_jsons, folder_list)
partial_args = partial(get_text, folder_save=path_json_processed, lang_threshold=0.5)
with Pool(processes=number_processes) as pool:
    %time pool.map(partial_args, list(files_list_dict.values()))

### Creating Dataset

In [None]:
dataframe_list = []
# Drop everything after the first '.' so processed file names can be compared with
# the category names in folder_list.
folder_exist_list = [item.split('.')[0] for item in os.listdir(path_json_processed)]

for folder in tqdm(folder_exist_list):
    # skip anything in the processed folder that is not a known category
    if folder in folder_list:
        file_processed_path = '{}/{}.json'.format(path_json_processed, folder)
        dataframe_list.append(create_dataframe(file_processed_path, 
                                               doc_path_folder='s3://hackathon-baseline/duai_docs/'))
# NOTE(review): pd.concat raises ValueError on an empty list, so this cell fails when no
# processed category file was found.
data = pd.concat(dataframe_list).reset_index(drop=True)
print('There are {} rows in created DataFrame'.format(data.shape[0]))

In [None]:
# split text into sentences
# Runs split_sentence over data['text'] in number_processes worker processes;
# nlp and sentence_length=10 are pre-bound arguments of split_sentence.
%time data['text_sentences'] = apply_by_multiprocessing(data['text'], partial(split_sentence, nlp=nlp_eng, 
                                                                    sentence_length=10), workers=number_processes)

In [None]:
# split sentences into words
# Produces one word list per sentence, so text_words is a list of lists per row.
%time data['text_words'] = data['text_sentences'].apply(lambda t: [split_words(sent, nlp_eng, 2) for sent in t])

In [None]:
# remove records without sentences
# A row is dropped when its text_words list is empty.
data = data[data['text_words'].apply(len)!=0].reset_index(drop=True)
print('There are {} rows after removing records without sentences...'.format(data.shape[0]))
data.head()

In [None]:
# save dataset to pickle file
data.to_pickle(path_dataset_texts)

## Vector Encoder

#### Dataset initialization

In [None]:
# Reload the text dataset saved by the "Creating Dataset" section, so this section can
# start from the pickle instead of the in-memory frame.
data = pd.read_pickle(path_dataset_texts)
print('There are {} rows in loaded DataFrame'.format(data.shape[0]))

In [None]:
data.head()

#### Text preprocessing
Text preprocessing steps for Word2Vec models include:
<li>removing stop words;</li>
<li>removing numbers and words with numbers;</li>
<li>removing short words.</li>

In [None]:
# Build the training corpus from the DataFrame; judging by the filtering below, the
# result is a list of sentences where each sentence is a list of word strings.
%time sentences_list = preparing_data_w2v(data)

In [None]:
#delete very short words
# Only words longer than two characters are kept; sentences themselves are not removed,
# so a sentence may end up as an empty list.
sentences_list = [[word for word in sent if len(word)  > 2 ]
                                for sent in sentences_list]

In [None]:
# example of sentences for modelling 
print(sentences_list[:10])

### Word2vec

In [None]:
# Untrained Word2Vec model with 300-dimensional vectors; the vocabulary is built and
# the model trained in the following cells.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the pinned gensim version before upgrading.
# workers is clamped to at least 1: `cores - 2` is zero or negative on machines with
# one or two cores.
model_w2v = Word2Vec(min_count=4,
                 window=3,
                 size=300,
                 sample=6e-5, 
                 alpha=0.025, 
                 min_alpha=0.0001, 
                 negative=20,
                 workers=max(1, cores - 2))

In [None]:
# build vocabulary for Word2Vec
%time model_w2v.build_vocab(sentences_list, progress_per=10000)

In [None]:
# train Word2Vec model
# total_examples is read back from the model (corpus_count), so it matches the corpus
# passed to build_vocab above; training runs for 60 epochs.
%time model_w2v.train(sentences_list, total_examples=model_w2v.corpus_count, epochs=60, report_delay=1)

In [None]:
# save Word2Vec model
model_w2v.save(path_model_w2v)

### FastText

In [None]:
# Untrained FastText model with 300-dimensional vectors; the vocabulary is built and
# the model trained in the following cells.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the pinned gensim version before upgrading.
# workers is clamped to at least 1: `cores - 2` is zero or negative on machines with
# one or two cores.
model_ft = FastText(min_count=3,
                 window=4,
                 size=300,
                 sample=6e-5,
                 alpha=0.025,
                 min_alpha=0.0001,
                 negative=20,
                 workers=max(1, cores - 2)
)

In [None]:
# build vocabulary for FastText
%time model_ft.build_vocab(sentences_list, progress_per=10000)

In [None]:
# train FastText model
# total_examples is read back from the model (corpus_count), so it matches the corpus
# passed to build_vocab above; training runs for 100 epochs (Word2Vec above uses 60).
%time model_ft.train(sentences_list, total_examples=model_ft.corpus_count, epochs=100, report_delay=1)

In [None]:
# save FastText model
model_ft.save(path_model_ft)

## Vectors

#### Dataset initialization

In [None]:
# Start the vector-building section from the saved text dataset; this rebinds `data`.
data = pd.read_pickle(path_dataset_texts)
print('There are {} rows in loaded DataFrame'.format(data.shape[0]))

In [None]:
data.head()

#### Models initialization

In [None]:
# Load both embedding models from the files written by the training cells.
model_w2v = Word2Vec.load(path_model_w2v)
print('Word2Vec Model Initialization: {}'.format(model_w2v))

In [None]:
model_ft = FastText.load(path_model_ft)
print('FastText Model Initialization: {}'.format(model_ft))

Explanation of variables:
<li>data - pd.DataFrame that stores the vectors built by the Word2Vec model;</li>
<li>data_2 - pd.DataFrame that stores the vectors built by the FastText model.</li>

In [None]:
# deep copy pd.DataFrame 
# data_2 receives the FastText columns, so it must not share data with `data`.
data_2 = data.copy(deep=True)

#### Creating vectors for Word2Vec model

In [None]:
# One vector per tokenised sentence (get_avg_vector presumably averages the word
# vectors — see utils.vectors), then each sentence text is paired with its vector.
%time data['text_vectors'] = data['text_words'].apply(lambda x: [get_avg_vector(sent, model_w2v) for sent in x])
%time data['sentences'] = data.apply(lambda x: list(zip(x['text_sentences'], x['text_vectors'])), axis=1)

In [None]:
# checking if number of sentences equal to number of vectors
# An empty result means every row has matching counts.
data[data['text_sentences'].apply(len)!=data['text_vectors'].apply(len)]

In [None]:
%time data_vectors = postprocess_vectors(data)

In [None]:
#save pd.DataFrame with text vectors to pickle file
data_vectors.to_pickle(path_dataset_vectors_w2v)

#### Creating vectors for FastText model

In [None]:
%time data_2['text_vectors'] = data['text_words'].apply(lambda x: [get_avg_vector(sent, model_ft) for sent in x])
%time data_2['sentences'] = data.apply(lambda x: list(zip(x['text_sentences'], x['text_vectors'])), axis=1)

In [None]:
# checking if number of sentences equal to number of vectors
# An empty result means every row has matching counts.
data_2[data_2['text_sentences'].apply(len)!=data_2['text_vectors'].apply(len)]

In [None]:
%time data_vectors_2 = postprocess_vectors(data_2)

In [None]:
#save pd.DataFrame with text vectors to pickle file
data_vectors_2.to_pickle(path_dataset_vectors_ft)

## Inference

### Create document to device mapping

Each document in the dataset contains information about one or more devices and their manufacturers. In most cases this information is located on the first pages, but device model names also appear frequently throughout the text.

In [None]:
# File-name tokens that are never accepted as (part of) a manufacturer name.
# The empty string is listed as well; 'instructions' appears twice, which is harmless
# because the list is only used for membership tests.
WORD_BLACK_LIST = ['', 'installation', 'manual', 'manuals', 'en', 'transmitters', 'ultrasonic',
                   'quick', 'start', 'foundation', 'model', 'guide', 'rev', 'series', 'gas', 'sensor',
                   'transmitter', 'meters', 'pressure', 'supplement', 'shafer', 'replacement', 'protocol',
                   'instructions', 'instructions', 'service', 'control', 'configuration', 'operation',
                   'power', 'procedure', 'instruction', 'maintenance', 'level', 'guides', 'meter', 'analyzer']

# Map every category folder under the JSON root to the file names it contains.
# Only real directories are kept. The previous os.listdir(path_jsons)[1:] dropped
# whichever entry happened to come first, but os.listdir returns entries in arbitrary
# order, so it could discard a real folder or keep a stray non-directory entry.
file_dict = {folder: os.listdir(os.path.join(path_jsons, folder))
             for folder in os.listdir(path_jsons)
             if os.path.isdir(os.path.join(path_jsons, folder))}

            
def filter_word_freq(freq_dict, threshold):
    """Return a plain dict with the entries whose frequency is strictly above ``threshold``.

    Entries keep the order they have in ``freq_dict``.
    """
    kept = {}
    for item, freq in freq_dict.items():
        if freq > threshold:
            kept[item] = freq
    return kept

def sort_dict(freq_dict):
    """Return a new dict ordered from the highest to the lowest value.

    Entries with equal values keep their relative order from ``freq_dict``.
    """
    def by_frequency(pair):
        return pair[1]

    ranked = sorted(freq_dict.items(), key=by_frequency, reverse=True)
    return dict(ranked)

# A device model candidate is 3-4 digits optionally followed by up to two word characters.
device_model_pattern = r'\d{3,4}\w{0,2}'
# All file names of all folders joined into one '-'-separated string.
words = '-'.join(list(chain(*file_dict.values())))

# ngrams(..., 1) yields 1-tuples, so the keys of device_models_freq are 1-tuples
# (later code reads the model string as key[0]).
device_models_freq = Counter(ngrams(re.findall(device_model_pattern, words), 1))
# Keep candidates seen more than once, ordered from most to least frequent.
device_models_freq = sort_dict(filter_word_freq(device_models_freq, 1))

In [None]:
# Tokens of the file names after removing device-model numbers and the '.pdf' suffix;
# black-listed words and tokens of four characters or fewer are dropped.
# Built once and shared by the unigram and bigram counts (it used to be computed twice).
name_tokens = [w for w in re.sub(device_model_pattern, '', words).replace('.pdf', '').split('-')
               if w not in WORD_BLACK_LIST and len(w) > 4]

# Keys are 1-tuples (unigrams) and 2-tuples (bigrams) because ngrams() yields tuples.
unigram_freq = filter_word_freq(Counter(ngrams(name_tokens, 1)), 8)
bigram_freq = filter_word_freq(Counter(ngrams(name_tokens, 2)), 4)

# A unigram is dropped when a bigram containing it is at least as frequent as one of
# the bigram's two words: the word pair is then taken to be the manufacturer name.
unigram2del = []
for manufacturer, freq in bigram_freq.items():
    w1 = (manufacturer[0],)
    w2 = (manufacturer[1],)
    if w1 in unigram_freq and w2 in unigram_freq:
        if freq >= unigram_freq[w1] or freq >= unigram_freq[w2]:
            unigram2del.extend([w1, w2])
clean_unigram_freq = {manufacturer: freq for manufacturer, freq in
                      unigram_freq.items() if manufacturer not in unigram2del}

# Merge the cleaned unigrams with the bigrams, most frequent first. The merge used to
# take the unfiltered unigram_freq, which left clean_unigram_freq unused and let a
# single word win over the two-word manufacturer name it belongs to.
manufacturer_freq = sort_dict({**clean_unigram_freq, **bigram_freq})

In [None]:
# Build one record per document: its folder (class), the manufacturer guessed from the
# file name and the device models guessed from the file name.
doc_map = []
for folder, docs in file_dict.items():
    for doc in docs:
        # names are matched against the file name with '-' turned into spaces
        clean_doc = doc.replace('-', ' ')
        
        # device_models_freq is iterated in its stored order (most frequent first).
        # Every candidate found in the name at the highest matching frequency is kept;
        # the loop stops at the first candidate with a lower frequency than a match.
        device_model = []
        prev_model_freq = 0
        for prob_device_model, freq in device_models_freq.items():
            # keys are 1-tuples, [0] is the model string; the test is a substring match
            if (prob_device_model[0] in clean_doc) and (freq >= prev_model_freq):
                device_model.append(prob_device_model[0])
                prev_model_freq = freq
            elif freq < prev_model_freq:
                break
        
        # The first candidate (in manufacturer_freq order) contained in the name wins;
        # '' is kept when nothing matches.
        manufacturer = ''
        for prob_manufacturer in manufacturer_freq.keys():
            # keys are word tuples; a bigram becomes "word1 word2"
            prob_manufacturer = ' '.join(prob_manufacturer)
            if prob_manufacturer in clean_doc:
                manufacturer  = prob_manufacturer
                break
                
        doc_map.append({'doc_name': doc, 'doc_class': folder, 'manufacturer': manufacturer, 'device_model': device_model})

In [None]:
# One row per document with columns doc_name, doc_class, manufacturer, device_model
# (device_model holds a list of strings).
data_names = pd.DataFrame(doc_map)
data_names.head()

In [None]:
#save pd.DataFrame with device information to pickle file
data_names.to_pickle(path_dataset_names)

### Data & Model Initialization

#### vectors dataset initialization for Word2Vec model

In [None]:
# Load the Word2Vec vectors dataset written in the "Vectors" section.
data_vectors = pd.read_pickle(path_dataset_vectors_w2v)
print('The dimensionality of the DataFrame for Word2Vec model: {}'.format(data_vectors.shape))

In [None]:
data_vectors.head()

#### vectors dataset initialization for FastText model

In [None]:
# Load the FastText vectors dataset written in the "Vectors" section.
data_vectors_2 = pd.read_pickle(path_dataset_vectors_ft)
print('The dimensionality of the DataFrame for ft: {}'.format(data_vectors_2.shape))

In [None]:
data_vectors_2.head()

#### Models initialization

In [None]:
# Load both trained embedding models; they encode the query text at search time.
model_w2v = Word2Vec.load(path_model_w2v)
print('Word2Vec Model Initialization: {}'.format(model_w2v))

In [None]:
model_ft = FastText.load(path_model_ft)
print('FastText Model Initialization: {}'.format(model_ft))

#### Names dataset initialization

In [None]:
# Load the document -> manufacturer / device model mapping saved earlier.
data_names = pd.read_pickle(path_dataset_names)
print('The dimensionality of the DataFrame: {}'.format(data_names.shape))

In [None]:
data_names.head()

### Search

Let’s try to find relevant places in different documents for a user's queries. To retrieve best similar texts for a new query we would have to:
<li>Split the query into: text, company and device name.</li>
<li>Encode the query text with the same model that was used to create the vectors.</li>
<li>Filter texts based on company and device names.</li>
<li>Retrieve most similar text chunks and IDs.</li>

In [None]:
def split_query(x):
    """Split a raw query string into manufacturer, device model and free text.

    The first whitespace-preceded run of three or more digits (optionally followed
    by up to two word characters) is taken as the device model; everything before
    it is the manufacturer and everything after it is the text. Queries without
    such a number are checked against a fixed list of prefixes that stand in for
    the manufacturer. A query that matches neither rule is returned as text only.

    Args:
        x: the query string.

    Returns:
        pd.Series: three strings at positions 0, 1 and 2 — manufacturer,
        device model and text. Parts that could not be determined are ''.
    """
    manufacturer = ''
    device_model = ''
    text = ''
    digit_pattern = r'\s\d{3,}\w{0,2}'
    model_match = re.search(digit_pattern, x)
    if model_match is not None:
        # Cut at the FIRST model number only. Splitting on every match dropped the
        # rest of the text as soon as the query contained a second number
        # (e.g. "... 3051 calibrate at 1000 psi" lost "1000 psi").
        manufacturer = x[:model_match.start()].strip()
        device_model = model_match.group().strip()
        text = x[model_match.end():].strip()
    else:
        # Longer prefixes must precede their own prefixes ('eho bettis' before 'eho').
        special_cases = ['burner', 'flow meter', 'hydrocarbon analyzer', 'liquid analyzer',
                         'pressure meter', 'temperature meter', 'viscosity meter', 'eho bettis',
                         'eho', 'level meter']
        for case in special_cases:
            if x.startswith(case):
                manufacturer = case
                # +1 skips the separator that follows the prefix
                text = x[len(case) + 1:].strip()
                break
        else:
            # No model number and no known prefix: keep the whole query as text
            # instead of returning three empty strings and losing the query.
            text = x.strip()

    return pd.Series([manufacturer, device_model, text])

In [None]:
# validation dataset initialization
validation_df = pd.read_csv('validation.csv')

In [None]:
validation_df.head(10)

In [None]:
# create test queries dataset
# Each distinct query is split once; split_query returns a 3-element Series, so the
# apply produces a DataFrame with columns 0, 1, 2 that are renamed below.
data_input = pd.Series(validation_df['query'].unique()).apply(lambda x: split_query(x))
data_input.rename(columns={0: 'manufacturer', 1: 'device_model', 2: 'text'}, inplace=True)
data_input.head()

How is the predictions dataset built?
   1. Find the top_n most similar texts with both the Word2Vec and the FastText model.
   2. Concatenate the predictions of the two models and sort them by similarity.
   3. Keep only the first top_n rows of the combined dataset.

In [None]:
def get_prediction(data_input, data_vectors, data_vectors_2, data_names, model_w2v, model_ft, nlp, top_n = 5):
    """Return the best ``top_n`` matches per query, combining both embedding models.

    For every row of ``data_input`` the search is run once against the Word2Vec
    vectors and once against the FastText vectors; both result sets are
    concatenated, sorted by ``similarity`` (highest first) and cut to ``top_n`` rows.

    Args:
        data_input: DataFrame with the string columns ``manufacturer``,
            ``device_model`` and ``text`` (one row per query).
        data_vectors: vectors dataset searched with ``model_w2v``.
        data_vectors_2: vectors dataset searched with ``model_ft``.
        data_names: document/device mapping passed through to the search.
        model_w2v: Word2Vec model passed through to the search.
        model_ft: FastText model passed through to the search.
        nlp: spaCy pipeline passed through to the search.
        top_n: maximum number of results kept per query.

    Returns:
        pd.DataFrame: the per-query results stacked together, with two added
        columns: ``query`` (the reassembled query string) and ``top_n``
        (rank within the query, starting at 1).

    Raises:
        ValueError: from ``pd.concat`` when ``data_input`` has no rows.
    """
    outputs = []
    for _, query_input in data_input.iterrows():
        # Rebuild the query from its non-empty parts only; joining all three with
        # spaces produced leading/double spaces whenever a part was empty, so the
        # string no longer equalled the original query.
        query_parts = (query_input['manufacturer'], query_input['device_model'], query_input['text'])
        query_text = ' '.join(part for part in query_parts if part)
        data_output_w2v = search_similar_text(query_input, data_vectors, data_names,
                                              'text_vectors', model_w2v, nlp, top_n)
        data_output_ft = search_similar_text(query_input, data_vectors_2, data_names,
                                              'text_vectors', model_ft, nlp, top_n)
        combined = pd.concat([data_output_w2v, data_output_ft])
        # copy() so the two columns added below are written to an independent frame
        data_output = combined.sort_values(by=['similarity'], ascending=False)[:top_n].copy()
        # Size the new columns by the rows actually found: a fixed length of top_n
        # raised a length-mismatch error when the search returned fewer rows.
        n_found = len(data_output)
        data_output['query'] = [query_text] * n_found
        data_output['top_n'] = list(range(1, n_found + 1))
        outputs.append(data_output)

    return pd.concat(outputs)

In [None]:
# get predictions for test queries
# Up to 5 rows per query, merged from the Word2Vec and FastText searches.
predictions = get_prediction(data_input, data_vectors, data_vectors_2, data_names,
                                   model_w2v, model_ft, nlp_eng, top_n=5)

In [None]:
predictions.head(30)

## Evaluation
Your model will be evaluated with the Mean Average Precision at 5 (MAP@5) metric.

In [None]:
def create_submission(predictions: pd.DataFrame):
    """Turn the predictions into the four-column submission frame.

    For every row the string at ``page_class_coordinate[0][0]`` is read; it is
    expected to look like ``<document>.pdf_<page>`` and is split at the LAST
    ``.pdf_`` into the document path (with ``.pdf`` restored) and the page.

    Args:
        predictions: DataFrame with the columns ``query``, ``top_n`` and
            ``page_class_coordinate``. It is not modified.

    Returns:
        pd.DataFrame: a new frame with the columns ``query``, ``top_n``,
        ``doc_path`` and ``text_page`` (page kept as a string), indexed like
        ``predictions``.

    Raises:
        ValueError: if a location string does not contain ``.pdf_``.
    """
    doc_paths = []
    text_pages = []
    for coords in predictions['page_class_coordinate']:
        location = coords[0][0]
        # rpartition cuts at the last '.pdf_', so a document name that itself
        # contains '.pdf_' still yields exactly one path and one page.
        doc_stem, separator, page = location.rpartition('.pdf_')
        if not separator:
            raise ValueError("expected '<document>.pdf_<page>', got {!r}".format(location))
        doc_paths.append(doc_stem + '.pdf')
        text_pages.append(page)

    # Work on a copy so the caller's predictions frame does not gain extra columns.
    submission = predictions[['query', 'top_n']].copy()
    submission['doc_path'] = doc_paths
    submission['text_page'] = text_pages
    return submission[['query', 'top_n', 'doc_path', 'text_page']]

In [None]:
# create submission pd.DataFrame from predictions 
submission_df = create_submission(predictions)

In [None]:
submission_df.head()

In [None]:
# Mean Average Precision metric for this model
# Compares the submission with the validation queries loaded from validation.csv.
mean_average_precision(validation_df, submission_df)