## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
!pip install -r requirements.txt -q

In [None]:
import PyPDF2
import pandas as pd
from sentence_transformers import SentenceTransformer

from functions.pdf_preprocess import (
    download_files_to_folder, 
    process_pdf_file,
)
from functions.text_preprocess import process_text_data
import config

import warnings
warnings.filterwarnings('ignore')

## <span style="color:#ff5f27">💾 Download files from Google Drive </span>

In [None]:
# Fetch the files of the configured Google Drive folder into the local
# download directory; the returned collection is iterated in the next step.
new_files = download_files_to_folder(config.FOLDER_ID, config.DOWNLOAD_PATH)

## <span style="color:#ff5f27">🧬 Text Extraction </span>

In [None]:
# Accumulator handed to process_pdf_file for every downloaded file.
# The helper's return value is not used here, so it is expected to add its
# extracted rows to this list in place.
document_text = []

for file in new_files:
    process_pdf_file(file, document_text, config.DOWNLOAD_PATH)

In [None]:
# Column layout of the rows collected in `document_text`
columns = [
    "file_name",
    "file_link",
    "page_number",
    "text",
]

# Assemble the extracted rows into a DataFrame
df_text = pd.DataFrame(document_text, columns=columns)

# Show the result as the cell output
df_text

In [None]:
# Run the project's text preprocessing (functions.text_preprocess) over the
# extracted page text.
# NOTE(review): the feature view created at the end of this notebook selects a
# "paragraph" column that is not among the columns built above — presumably
# process_text_data adds it; confirm against that helper.
df_text_processed = process_text_data(df_text)

# Display the processed DataFrame
df_text_processed

## <span style="color:#ff5f27">⚙️ Embeddings Creation </span>

In [None]:
# Instantiate the configured SentenceTransformer model, then move it to the
# configured device
model = SentenceTransformer(config.MODEL_SENTENCE_TRANSFORMER)
model = model.to(config.DEVICE)

# Show which device the model ended up on
model.device

In [None]:
# Encode every text chunk with the SentenceTransformer model.
# A plain list is passed to `encode` so the model never indexes into a pandas
# Series by label, which breaks when the index is not 0..n-1.
texts = df_text_processed['text'].tolist()

# Reuse the DataFrame's own index for the new column. A Series built with the
# default RangeIndex is aligned by label on assignment, so a filtered or
# otherwise non-default index would scatter embeddings onto the wrong rows
# or leave NaNs.
df_text_processed['embeddings'] = pd.Series(
    model.encode(texts).tolist(),
    index=df_text_processed.index,
)

# Add a positional 'context_id' running from 0 to (number of rows - 1).
# NOTE(review): ids restart at 0 on every run and 'context_id' is the feature
# group's primary key, so a later run over newly downloaded files presumably
# overwrites earlier rows — confirm this is intended.
df_text_processed['context_id'] = list(range(len(df_text_processed)))

# Display the resulting DataFrame with the added 'embeddings' and 'context_id' columns
df_text_processed

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks; the returned project handle is the entry point for the
# project's services.
project = hopsworks.login()

# Feature store handle used below to create the feature group and feature view.
fs = project.get_feature_store() 

## <span style="color:#ff5f27;"> 🪄 Feature Group Creation </span>

In [None]:
from hsfs import embedding

# The index dimension has to match the size of the vectors the model produces
embedding_dim = model.get_sentence_embedding_dimension()

# Build the embedding index and register the "embeddings" feature on it
emb = embedding.EmbeddingIndex()
emb.add_embedding("embeddings", embedding_dim)

In [None]:
# Fetch version 1 of the 'documents_fg' feature group, creating it if needed.
# It is keyed by 'context_id', enabled for online use, and attached to the
# embedding index defined above.
documents_fg = fs.get_or_create_feature_group(
    name="documents_fg",
    version=1,
    description='Information from various files, presenting details like file names, source links, and structured text excerpts from different pages and paragraphs.',
    primary_key=['context_id'],
    online_enabled=True,
    embedding_index=emb,
)

# Write the processed rows (text, embeddings and ids) into the feature group
documents_fg.insert(df_text_processed)

## <span style="color:#ff5f27;">🪄 Feature View Creation </span>


In [None]:
# Fetch version 1 of the 'documents' feature view, creating it if needed.
# It exposes only the descriptive and text columns of the feature group.
feature_view = fs.get_or_create_feature_view(
    name="documents",
    version=1,
    description='Chunked context for RAG system',
    query=documents_fg.select(
        [
            "file_name",
            "file_link",
            "page_number",
            "paragraph",
            "text",
        ]
    ),
)

---