In [None]:
# import requests
# from PIL import Image
# import numpy as np
# import torch
# from transformers import AutoProcessor, Owlv2ForObjectDetection
# from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

# processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
# model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

# url = "https://as2.ftcdn.net/v2/jpg/05/39/70/49/1000_F_539704980_FPmFIDBc74pUeV3oGWUE0g4v9RsoKwl4.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# texts = [["a photo of a cat", "a photo of a dog"]]
# inputs = processor(text=texts, images=image, return_tensors="pt")

# # forward pass
# with torch.no_grad():
#     outputs = model(**inputs)

# # Note: boxes need to be visualized on the padded, unnormalized image
# # hence we'll set the target image sizes (height, width) based on that

# def get_preprocessed_image(pixel_values):
#     pixel_values = pixel_values.squeeze().numpy()
#     unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
#     unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
#     unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
#     unnormalized_image = Image.fromarray(unnormalized_image)
#     return unnormalized_image

# unnormalized_image = get_preprocessed_image(inputs.pixel_values)

# target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
# # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
# results = processor.post_process_object_detection(
#     outputs=outputs, threshold=0.2, target_sizes=target_sizes
# )

# i = 0  # Retrieve predictions for the first image for the corresponding text queries
# text = texts[i]
# boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
# count=0
# for box, score, label in zip(boxes, scores, labels):
#   count+=1
#   box = [round(i, 2) for i in box.tolist()]
#   print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
#   x1, y1, x2, y2 = box
#   cropped_image = unnormalized_image.crop((x1, y1, x2, y2))
#   cropped_image.save(f"detected_{text[label]}-{count}.jpg")


In [None]:
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("MyApp").getOrCreate()

# Example DataFrame creation
data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
df = spark.createDataFrame(data, ["Name", "Age"])

df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
+-----+---+



In [None]:
# !pip install numpy scikit-learn transformers

In [2]:
# !pip install transformers faiss-cpu torch

In [None]:
# Sample input documents
documents = [
    "Python is a programming language that lets you work quickly and integrate systems more effectively.",
    "JavaScript is a high-level, often just-in-time compiled language that conforms to the ECMAScript specification.",
    "Java is a class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible.",
    "Ruby is an interpreted, high-level, general-purpose programming language."
]


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

# Load model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to encode documents into embeddings
def encode_documents(documents):
    inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state[:, 0, :].numpy()  # Get the [CLS] token
    return embeddings

# Encode documents and create FAISS index
embeddings = encode_documents(documents)
index = faiss.IndexFlatL2(embeddings.shape[1])  # Create index for L2 distance
index.add(embeddings)  # Add embeddings to the index


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# Initialize the generator
generator = pipeline("text-generation", model="gpt2")  # Use a generative model

# Function to retrieve relevant documents and generate an answer
def generate_answer(query):
    # Encode the query
    query_embedding = encode_documents([query])

    # Perform retrieval
    distances, indices = index.search(query_embedding, k=2)  # Retrieve top 2 documents
    retrieved_docs = [documents[i] for i in indices[0]]

    # Combine retrieved documents for generation
    context = "\n".join(retrieved_docs)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Generate answer
    answer = generator(prompt, max_length=100)[0]['generated_text']
    return answer

# Example usage
query = "What is java?"
answer = generate_answer(query)
print(answer)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context:
Java is a class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible.
JavaScript is a high-level, often just-in-time compiled language that conforms to the ECMAScript specification.

Question: What is java?
Answer: It's a cross-platform (Platform-agnostic) language developed at the C++ Technical University by Jelene Gerskoi. It's an integral part of


## Testing with text from json file

In [None]:
import json
import torch
import faiss
from transformers import AutoTokenizer, AutoModel, pipeline

# Step 1: Load and process the JSON file
def load_extracted_text(json_file_path):
    """Load JSON data and extract the 'extracted_text' field."""
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract only the 'extracted_text' field
    extracted_texts = [entry['extracted_text'] for entry in data if 'extracted_text' in entry]

    return extracted_texts

# Step 2: Encode documents into embeddings using a pre-trained model
def encode_documents(documents, model, tokenizer):
    """Encode the extracted documents into embeddings."""
    inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state[:, 0, :].numpy()  # Get the [CLS] token
    return embeddings

# Step 3: Create a FAISS index from the document embeddings
def create_faiss_index(embeddings):
    """Create and return a FAISS index for the embeddings."""
    index = faiss.IndexFlatL2(embeddings.shape[1])  # Create index for L2 distance
    index.add(embeddings)  # Add embeddings to the index
    return index

# Step 4: Retrieve relevant documents and generate an answer using a generative model
def generate_answer(query, index, documents, model, tokenizer, generator):
    """Retrieve relevant documents and generate an answer based on the query."""
    # Encode the query
    query_embedding = encode_documents([query], model, tokenizer)

    # Perform retrieval
    distances, indices = index.search(query_embedding, k=3)  # Retrieve top 3 documents
    retrieved_docs = [documents[i] for i in indices[0]]

    # Combine retrieved documents for generation
    context = "\n".join(retrieved_docs)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Generate the answer
    answer = generator(prompt, max_new_tokens=100)[0]['generated_text']
    return answer

# Step 5: Main function to load data, create index, and handle queries
def main(json_file_path, query):
    # Load the extracted texts from the JSON file
    documents = load_extracted_text(json_file_path)

    # Load the pre-trained model and tokenizer for encoding
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Encode the documents into embeddings
    embeddings = encode_documents(documents, model, tokenizer)

    # Create a FAISS index for the embeddings
    index = create_faiss_index(embeddings)

    # Initialize the text generation model (e.g., GPT-2)
    generator = pipeline("text-generation", model="gpt2")

    # Generate an answer for the query
    answer = generate_answer(query, index, documents, model, tokenizer, generator)

    # Return the generated answer
    return answer

# Example usage
if __name__ == "__main__":
    json_file_path = '/content/ocr_results_geo.json'
    query = "What is human geography?"
    result = main(json_file_path, query)
    print(result)


## Fine-tuning model

In [None]:
import json
import torch
import faiss
import random
from transformers import AutoTokenizer, AutoModel, pipeline

# Step 1: Load and process the JSON file
def load_extracted_text(json_file_path):
    """Load JSON data and extract the 'extracted_text' field."""
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract only the 'extracted_text' field
    extracted_texts = [entry['extracted_text'] for entry in data if 'extracted_text' in entry]

    return extracted_texts

# Step 2: Encode documents into embeddings using a pre-trained model
def encode_documents(documents, model, tokenizer):
    """Encode the extracted documents into embeddings."""
    inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state[:, 0, :].numpy()  # Get the [CLS] token
    return embeddings

# Step 3: Create a FAISS index from the document embeddings
def create_faiss_index(embeddings):
    """Create and return a FAISS index for the embeddings."""
    index = faiss.IndexFlatL2(embeddings.shape[1])  # Create index for L2 distance
    index.add(embeddings)  # Add embeddings to the index
    return index

# Step 4: Retrieve relevant documents and generate an answer using a generative model
def generate_answer(query, index, documents, model, tokenizer, generator):
    """Retrieve relevant documents and generate an answer based on the query."""
    # Encode the query
    query_embedding = encode_documents([query], model, tokenizer)

    # Perform retrieval
    distances, indices = index.search(query_embedding, k=3)  # Retrieve top 3 documents
    retrieved_docs = [documents[i] for i in indices[0]]

    # Combine retrieved documents for generation
    context = "\n".join(retrieved_docs)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Generate the answer
    answer = generator(prompt, max_new_tokens=100)[0]['generated_text']
    return answer

# Step 5: Generate synthetic queries and responses for fine-tuning the model
def generate_query_response_pairs(documents):
    """Generate synthetic queries and responses based on document content."""
    query_response_pairs = []
    for doc in documents:
        # Generate a simple question based on the document content
        question = generate_question(doc)
        answer = generate_answer_from_text(doc)
        query_response_pairs.append({"question": question, "answer": answer})

    return query_response_pairs

def generate_question(document):
    """Generate a sample question from the content of the document."""
    # Simple heuristic to generate questions (e.g., based on keywords)
    sample_questions = [
        "What is this document about?",
        "What are the main points of this paper?",
        "What is the conclusion of this document?",
        "What is the purpose of this research?",
        "What data does this document analyze?"
    ]
    return random.choice(sample_questions)

def generate_answer_from_text(document):
    """Generate a response by selecting part of the document as the answer."""
    # You can split the document and return a random part as the 'answer'
    sentences = document.split(". ")
    if len(sentences) > 1:
        return random.choice(sentences)  # Select a random sentence as the answer
    return document  # If not enough sentences, return the full document as the answer

# Fine-tuning preparation
def prepare_fine_tuning_dataset(json_file_path):
    """Prepare the dataset for fine-tuning the generative model."""
    # Load the extracted texts from the JSON file
    documents = load_extracted_text(json_file_path)

    # Generate synthetic query-response pairs
    query_response_pairs = generate_query_response_pairs(documents)

    # Return the generated pairs for fine-tuning
    return query_response_pairs

# Main function to load data, create index, and handle queries
def main(json_file_path, query):
    # Load the extracted texts from the JSON file
    documents = load_extracted_text(json_file_path)

    # Load the pre-trained model and tokenizer for encoding
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Encode the documents into embeddings
    embeddings = encode_documents(documents, model, tokenizer)

    # Create a FAISS index for the embeddings
    index = create_faiss_index(embeddings)

    # Initialize the text generation model (e.g., GPT-2)
    generator = pipeline("text-generation", model="gpt2")

    # Generate an answer for the query
    answer = generate_answer(query, index, documents, model, tokenizer, generator)

    # Return the generated answer
    return answer

# Example usage
if __name__ == "__main__":
    json_file_path = '/content/ocr_results.json'

    # Generate fine-tuning data
    fine_tuning_data = prepare_fine_tuning_dataset(json_file_path)
    print("Sample fine-tuning data (queries and responses):")
    for item in fine_tuning_data[:5]:  # Show first 5 samples
        print(f"Q: {item['question']}\nA: {item['answer']}\n")

    # Query and generate a response
    query = "What is GIS?"
    result = main(json_file_path, query)
    print("Generated answer for query:", result)


Sample fine-tuning data (queries and responses):
Q: What data does this document analyze?
A: | African Governance and Development Institute (AGDI}, Yaoundé, Cameroon


Q: What data does this document analyze?
A: ) https:é*hdl handle.net/10419/228031 |


Q: What are the main points of this paper?
A: WP/20/054


Q: What is the purpose of this research?
A: 

Q: What data does this document analyze?
A: ‘Suggested Citation: Asongu, Simplice; Diop, Samba; Nnanna, Jaseph (2020) : The geography of the
effectiveness and consequences of Covid19 measures: Global evidence, ASDI Werking Paper, No
|WP/20/054, African Gavernance and Development Institute (AGD1), Yaoundé




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated answer for query: Context:
GIS is one of the information systems types implemented on the basis of made
computer tchnologies 16 perfor various tas Folate to geospatial analysis. I is
designed to erent geagtaphical thematic maps and analyst of abjets ofthe real world,

For Ukraine, the issue of development and implementation of GIS is relevant. [t is.
emphasized inthe flowing segulations: the Concept of the digit) economy and
society of Usain fr 2018-2020 [5], the Coneapt of eGovernment development in
Ukssine ducd 20.00.2017 No 649-p {16}, the Concept of ereaon ofthe national
automated system “Open covroomn [I], Dee of the President of Uksaine 0
Sustainable Development Goals of Uktsine fr the period wp 10 2030" deted

‘Geoinformatics, as.a science, is relatively young in Ukraine. Therefore, educational and
methodological materials are ot dovelopad sufficiently. Various aspccts fhe GIS use
{or solving cavtonmental aft prablems are discussed in [2,28 29,30, 38,41, 43]
Developmen