<a href="https://colab.research.google.com/github/ontologist/viba-project/blob/main/Preprocessor_Indexer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Developed by Yuri Tijerino

Copyright© 2023 AIngle Labs.  All rights reserved

# Indexing Pipeline to index several files from a repository

## Let's prepare the environment

In [1]:
#!pip --version

pip 22.0.4 from /usr/local/lib/python3.9/dist-packages/pip (python 3.9)


In [1]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%bash

# Install the latest main of Haystack
pip install --upgrade pip
pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]

apt install libgraphviz-dev
pip install pygraphviz

In [3]:
import logging


logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [4]:
from haystack.utils import fetch_archive_from_http

# This fetches some sample files to work with
doc_dir = "data/laptops"
s3_url = "https://vibahaystack.s3.us-west-2.amazonaws.com/All_Laptops_Cleaned_test.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

INFO:haystack.telemetry_2:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems in the [documentation page](https://docs.haystack.deepset.ai/docs/telemetry#how-can-i-opt-out). More information at [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry).
INFO:haystack.utils.import_utils:Fetching from https://vibahaystack.s3.us-west-2.amazonaws.com/All_Laptops_Cleaned_test.zip to 'data/laptops'


True

In [5]:
from haystack.nodes import TransformersDocumentClassifier

doc_classifier = TransformersDocumentClassifier(
    model_name_or_path="cross-encoder/nli-distilroberta-base",
    task="zero-shot-classification",
    labels=["gaming laptop", "business laptop", "content creation laptop", "heavy coding laptop", "multitasking laptop", "starter laptop", "student laptop", "trafel laptop", "influencer laptop"],
    batch_size=16,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


Downloading (…)lve/main/config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

# classify using gpu, batch_size makes sure we do not run out of memory
all_docs = convert_files_to_docs(dir_path=doc_dir)


classified_docs = doc_classifier.predict(all_docs) # This way we classify the original documents

In [7]:


from haystack.nodes import PreProcessor


# This is a default usage of the PreProcessor.
# Here, it performs cleaning of consecutive whitespaces
# and splits a single large document into smaller documents.
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
# Note how the single document passed into the document gets split into 5 smaller documents

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    #split_by="word",
    #split_length=100,
    #split_respect_sentence_boundary=True,
)
docs_default = preprocessor.process(classified_docs)
print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Preprocessing:   0%|          | 0/1094 [00:00<?, ?docs/s]



n_docs_input: 1
n_docs_output: 3929


In [None]:
docs_default

In [4]:
#from haystack.nodes import PreProcessor
#from haystack.utils import convert_files_to_docs

#No need to do this because we are trying to do extration on the original documents

# note that you can also use the document classifier before applying the PreProcessor, e.g. before splitting your documents
#all_docs = convert_files_to_docs(dir_path=doc_dir)
#preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False)

# I don't think we need to split the original documents 
#docs_sliding_window = preprocessor_sliding_window.process(all_docs)

In [7]:
# let's see how it looks: there should be a classification result in the meta entry containing labels and scores.
print(classified_docs[0].to_dict())

{'content': 'Dell Chromebook 11 3100 8GB 1DIMM QC CEL N4120 64GB EMMC NT WLS 1366x768 3c\nPrice:382\nItem#: 9SIAD6HJ6R3958\n\n#Overview\n\n#Specs\nCPU Type:Intel Celeron\nCPU Speed:N4120 (1.10GHz)\nNumber of Cores:Quad-core Processor\n\nScreen Size:11.6"\nDisplay Type:HD\nResolution:1366 x 768\n\nOperating System:Chrome OS\n\nSSD:64 GB\n\nMemory:8GB\n\nDate First Available:September 12, 2022\n\n\n\n#Reviews', 'content_type': 'text', 'score': None, 'meta': {'name': 'p_1XV-000A-01ZE1.txt', 'classification': {'label': 'gaming laptop', 'score': 0.16292552649974823, 'details': {'gaming laptop': 0.16292552649974823, 'business laptop': 0.1306798905134201, 'content creation laptop': 0.12457068264484406, 'multitasking laptop': 0.11906225234270096, 'heavy coding laptop': 0.10548959672451019, 'influencer laptop': 0.09493645280599594, 'trafel laptop': 0.09065605700016022, 'student laptop': 0.08912153542041779, 'starter laptop': 0.0825580358505249}}}, 'id_hash_keys': ['content'], 'embedding': None,

In [6]:
# Recommended: Start Elasticsearch using Docker via the Haystack utility function
#from haystack.utils import launch_es

In [None]:
#%%bash

# Install the latest main of Haystack for faiss
#pip install --upgrade pip
#pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[faiss]

In [8]:


from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever



document_store=FAISSDocumentStore(faiss_index_factory_str="Flat",vector_dim=768)

In [9]:
document_store.delete_index("faiss_document_store.db")

In [11]:
from haystack.nodes import PreProcessor


# This is a default usage of the PreProcessor.
# Here, it performs cleaning of consecutive whitespaces
# and splits a single large document into smaller documents.
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
# Note how the single document passed into the document gets split into 5 smaller documents

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    #split_by="word",
    #split_length=100,
    #split_respect_sentence_boundary=True,
)
docs_default = preprocessor.process(classified_docs)
print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")

Preprocessing:   0%|          | 0/1094 [00:00<?, ?docs/s]



n_docs_input: 1
n_docs_output: 3929


In [12]:
docs = classified_docs

document_store.delete_documents()
document_store.write_documents(classified_docs)

Writing Documents:   0%|          | 0/1094 [00:00<?, ?it/s]

In [None]:
#Let's see what is in docs
docs

In [13]:
# check if indexed docs contain classification results
from pprint import pprint

test_doc = document_store.get_all_documents()[0]
pprint(
#    f'document {test_doc.id} with content \n\n{test_doc.content}\n\nhas label {test_doc.meta["classification"]["label"]}'
test_doc.to_json()
)

('{"content": "MSI GT Series Titan GT77HX 13VI-042US 17.3\\" 4K / UHD 144 Hz '
 'IPS Intel Core i9 13th Gen 13980HX (2.20GHz) NVIDIA GeForce RTX 4090 Laptop '
 'GPU 128GB Memory 4 TB NVMe( 2TB x 2) SSD Windows 11 Pro 64-bit '
 'G\\nPrice:5,299\\nItem#: '
 'N82E16834156421\\n\\n#Overview\\n\\n\\n#Specs\\nBest Seller Ranking:#23 in '
 'Gaming Laptops\\n\\nBrand:MSI\\nSeries:GT Series\\nModel:Titan GT77HX '
 '13VI-042US\\n\\nColor:Core Black\\nOperating System:Windows 11 Pro '
 '64-bit\\nCPU:Intel Core i9-13980HX 2.20 GHz\\nScreen:17.3\\" 4K/UHD 144 Hz '
 'Mini LED IPS\\nMemory:128 GB DDR5\\nStorage:4 TB NVMe Gen4x4 SSD\\nGraphics '
 'Card:NVIDIA GeForce RTX 4090 Laptop GPU\\nVideo Memory:16 GB '
 'GDDR6\\nDimensions (W x D x H):15.63\\" x 12.99\\" x 0.91\\"\\nWeight:7.28 '
 'lbs.\\n\\nCPU Type:Intel Core i9 13th Gen\\nCPU Speed:13980HX '
 '(2.20GHz)\\nNumber of Cores:24-core (8P+16E) Processor\\nCore Name:Raptor '
 'Lake\\nTurbo Frequency:Up to 5.60 GHz\\nCPU L3 Cache:36 MB\\n\\nScreen '

In [14]:
# Initialize QA-Pipeline
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, BM25Retriever

In [15]:
from haystack.utils import print_answers

In [16]:
!pip install -Uqq ipdb
import ipdb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m793.3/793.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.8/385.8 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires ipython~=7.9.0, but you have ipython 8.11.0 which is incompatible.[0m[31m
[0m

In [17]:
%pdb off

Automatic pdb calling has been turned ON


In [None]:
from pathlib import Path
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier


file_type_classifier = FileTypeClassifier()
text_converter = TextConverter()
#pdf_converter = PDFToTextConverter()
#docx_converter = DocxToTextConverter()

indexing_pipeline_with_classification = Pipeline()
indexing_pipeline_with_classification.add_node(
    component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
)
indexing_pipeline_with_classification.add_node(
    component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"]
)
#indexing_pipeline_with_classification.add_node(
#    component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"]
#)
#indexing_pipeline_with_classification.add_node(
#    component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"]
#
indexing_pipeline_with_classification.add_node(
    component=preprocessor,
    name="Preprocessor",
    inputs=["TextConverter"],
)
indexing_pipeline_with_classification.add_node(
    component=doc_classifier, name="DocumentClassifier", inputs=["Preprocessor"]
)
indexing_pipeline_with_classification.add_node(
    component=document_store, name="DocumentStore", inputs=["DocumentClassifier"]
)
# Uncomment the following to generate the pipeline image
# indexing_pipeline_with_classification.draw("index_time_document_classifier.png")

document_store.delete_documents()
#pdf_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".pdf"]
#docx_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".docx"]
txt_files = [f for f in Path("data/laptops/All_Laptops_Cleaned").iterdir() if f.suffix == ".txt"]
#indexing_pipeline_with_classification.run(file_paths=pdf_files)
#indexing_pipeline_with_classification.run(file_paths=docx_files)
indexing_pipeline_with_classification.run(file_paths=txt_files)

document_store.get_all_documents()[0]

Converting files:   0%|          | 0/1094 [00:00<?, ?it/s]

Preprocessing:   0%|          | 0/1094 [00:00<?, ?docs/s]

Classifying documents:   0%|          | 0/3929 [00:00<?, ?it/s]

