In [1]:
#Import
import os
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer;

In [2]:
# Read the project identifiers from 'valid_project.txt': one identifier per
# line, surrounding whitespace removed, blank lines skipped.
valid_projects = []
with open('valid_project.txt', 'r') as file:
    valid_projects.extend(name for name in map(str.strip, file) if name)

In [3]:
# Display the loaded project identifiers as a quick sanity check
valid_projects

['CERV-2025-CHILD',
 'CERV-2025-CITIZENS-CIV',
 'CERV-2025-DAPHNE',
 'Connecting_Spheres',
 'COPILOT',
 'CREA-CROSS-2025-INNOVLAB',
 'CREA-MEDIA-2025-CINNET',
 'CREA-MEDIA-2025-FILMDIST',
 'CREA-MEDIA-2025-FILMOVE',
 'CREA-MEDIA-2025-TRAINING',
 'CREA-MEDIA-2025-TVONLINE',
 'DigiQ',
 'DIGITAL-2025-EDIH-EU-EEA-08-CONSOLIDATION-STEP',
 'E-BOOST',
 'EDF-2025-CSA-NFP',
 'EDF-2025-LS-DA-SME-NT',
 'EDF-2025-LS-RA-SMERO-NT',
 'EP-LINC-SUBV-2025-CONF-INT-01',
 'EP-LINC-SUBV-2025-CONF-INT-02',
 'EP-LINC-SUBV-2025-CONF-INT-03',
 'ERA4Health',
 'ERASMUS-EDU-2022-ECHE-CERT-FP',
 'ERASMUS-EDU-2025-CSC-OG-FPA',
 'ERASMUS-EDU-2025-PEX-COVE',
 'ERASMUS-YOUTH-2025-CSC-OG-FPA',
 'ERC-2025-POC',
 'ESC-HUMAID-2021-QUAL-LABEL-FP',
 'ESC-HUMAID-2025-VOLUN',
 'ESF-2025-AG-NETW-MF-SE',
 'ESF-2025-EURES-CBC',
 'ESF-2025-OG-NETW-NGO-FPA',
 'ESF-2025-OG-NETW-NGO-SGA',
 'EUBA-EFSA-2025-PREV-02',
 'FABRIX',
 'FRONTIERS',
 'HORIZON-EIC-2025-ACCELERATOR-01',
 'HORIZON-EIC-2025-EICSTEP-01',
 'HORIZON-JU-CBE-2025-CSA-

In [5]:
import os
import google.generativeai as genai
from google.generativeai import types
import time  # Added for retry logic

# Configure the Gemini client from the GOOGLE_API_KEY environment variable.
# Never embed the key in the notebook itself: notebooks get committed and
# shared, and any key that has ever been pasted into one should be treated as
# leaked and rotated.
_google_api_key = os.environ.get("GOOGLE_API_KEY")
if not _google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "export it before running this notebook."
    )
genai.configure(api_key=_google_api_key)


def generate(input: str, questions:str, max_retries=5):
    """Ask Gemini to answer a set of questions about a project-call text.

    Builds a single prompt from the text and the questions, sends it to the
    "gemini-2.0-flash" model and parses the span from the first '{' to the
    last '}' of the reply as JSON. Any failure (API error, reply without a
    JSON object, malformed JSON) is retried with exponential backoff
    (2, 4, 8, ... seconds between attempts).

    Args:
        input: Context text the model should answer from. (The name shadows
            the builtin; it is kept so existing callers keep working.)
        questions: The questions, serialized as a JSON string; the model is
            asked to reply in the same JSON format.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed JSON value from the model reply, or None when every
        attempt failed.
    """
    model = genai.GenerativeModel("gemini-2.0-flash")
    prompt = f"""
    Based on the following text about a European Project Call answer to each question. Output must be in same JSON format. Each answer must be only a number!

    Text:
    {input}

    Questions:
    {questions}
    """

    for attempt in range(1, max_retries + 1):
        try:
            response = model.generate_content(prompt)
            reply_text = response.text
            # Locate the first opening and the last closing brace and try to
            # parse everything in between; the model may wrap the JSON in prose.
            json_start = reply_text.find('{')
            json_end = reply_text.rfind('}') + 1
            if json_start == -1 or json_end <= json_start:
                raise ValueError("no JSON object found in model response")
            return json.loads(reply_text[json_start:json_end])
        except Exception as e:
            if attempt == max_retries:
                # Nothing left to retry, so do not waste time on a final backoff.
                print(f"Attempt {attempt}/{max_retries} failed: {e}. No retries left.")
                break
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Retry {attempt}/{max_retries} after error: {e}. Waiting {wait_time} seconds.")
            time.sleep(wait_time)
    print("Max retries reached. Returning None.")
    return None

In [9]:
# Evaluate retrieval-augmented answering over every valid project call.
# For each call: load the reference questions/answers, retrieve the
# best-matching chunk of the call text for each question, let the LLM answer
# from that context only, and score its answers against the references.
#
# NOTE: `model`, `chunk_text` and `find_top_k_chunks` are defined in a cell
# that appears further down in this notebook; run that cell first.
non_valid_counter = 0  # calls skipped: bad questions file, no text, or no usable LLM reply
answers_score = []     # per-call fraction of correctly answered questions
answers = []

# Create the output folder once, up front, instead of checking every iteration
os.makedirs("validation_rag", exist_ok=True)

for call in valid_projects:

    # Load the reference questions and answers for this call.
    # (Not stored in a variable named `input`: at cell level that would
    # clobber the builtin for the rest of the kernel session.)
    with open("./test_data/"+call+"_questions_answers.txt", "r") as file:
        raw_questions = file.read()

    try:
        question_and_answers = json.loads(raw_questions)
    except json.JSONDecodeError as e:
        non_valid_counter += 1
        print(f"Skipping {call}: questions file is not valid JSON ({e})")
        continue

    test_questions = question_and_answers["test_questions"]
    if not test_questions:
        # Nothing to score; also avoids dividing by zero below
        non_valid_counter += 1
        print(f"Skipping {call}: no test questions")
        continue

    with open("./data"+"/"+call+"_combined_text.txt", "r") as file:
        input_combined = file.read()

    # Step 1: Chunk the combined text
    chunks = chunk_text(input_combined)
    print(f"Document chunked into {len(chunks)} pieces")
    if not chunks:
        non_valid_counter += 1
        print(f"Skipping {call}: combined text is empty")
        continue

    # Step 2: Vectorize all chunks with E5 multilingual model
    # NOTE(review): the multilingual-e5 model card recommends prefixing
    # passages with "passage: " and questions with "query: " before encoding.
    # Not applied here because it would change the reported scores - TODO
    # confirm and re-run the evaluation if adopted.
    chunk_embeddings = model.encode(chunks)
    print("All chunks vectorized")

    # Step 3: Extract questions and the reference answers
    questions = [item["question"] for item in test_questions]
    original_answers = [item["answer"] for item in test_questions]

    # Step 4: Process each question to get relevant context
    all_relevant_chunks = []
    for question in questions:
        # Vectorize the question
        question_embedding = model.encode(question)

        # Retrieve only the single best-matching chunk per question (k=1)
        top_chunks = find_top_k_chunks(question_embedding, chunk_embeddings, chunks, k=1)

        # Add to our collection (only the text, not the scores)
        all_relevant_chunks.extend([chunk for chunk, _ in top_chunks])

    # Remove duplicates while preserving order (dict keys keep insertion order)
    unique_chunks = list(dict.fromkeys(all_relevant_chunks))

    # Combine all relevant chunks as context
    rag_context = "\n\n".join(unique_chunks)
    print(f"RAG context created with {len(unique_chunks)} relevant chunks")

    # Build a copy of the questions with the answers blanked out. New dicts are
    # created for every question so the loaded reference data is not mutated
    # (a shallow .copy() would share the nested question dicts).
    question_and_answers_blank_answers = {
        **question_and_answers,
        "test_questions": [
            {**item, "answer": "TO_BE_FILLED_BY_NUMBER"} for item in test_questions
        ],
    }

    # Use the RAG context instead of the full combined text
    llm_reply = generate(rag_context, json.dumps(question_and_answers_blank_answers, indent=4), max_retries=5)
    if llm_reply is None:
        # generate() already retried and backed off; nothing to score here
        non_valid_counter += 1
        print(f"Skipping {call}: no usable answer from the LLM")
        continue

    try:
        validation_answers = [item["answer"] for item in llm_reply["test_questions"]]
    except (KeyError, TypeError) as e:
        non_valid_counter += 1
        print(f"Skipping {call}: LLM reply has an unexpected structure ({e!r})")
        continue

    # zip() stops at the shorter list, so answers the LLM left out simply
    # count as wrong instead of raising IndexError.
    correct_answers = sum(
        str(expected) == str(predicted)
        for expected, predicted in zip(original_answers, validation_answers)
    )

    print(f"Correct answers: {correct_answers}/{len(original_answers)}")
    answers_score.append(correct_answers/len(original_answers))
    print(f"Current average score: {sum(answers_score)/len(answers_score):.4f}")

    # Save the LLM answers to the validation_rag folder
    with open(f"validation_rag/{call}_answers.json", "w") as file:
        json.dump(validation_answers, file, indent=4)

    # Pause between calls to stay under the API rate limit
    time.sleep(5)


print(f"Non valid projects: {non_valid_counter}")
        

Document chunked into 589 pieces
All chunks vectorized
RAG context created with 7 relevant chunks
Correct answers: 6/10
Current average score: 0.6000
Document chunked into 579 pieces
All chunks vectorized
RAG context created with 9 relevant chunks
Correct answers: 8/10
Current average score: 0.7000
Document chunked into 727 pieces
All chunks vectorized
RAG context created with 6 relevant chunks
Correct answers: 5/10
Current average score: 0.6333
Document chunked into 5 pieces
All chunks vectorized
RAG context created with 1 relevant chunks
Correct answers: 9/10
Current average score: 0.7000
Document chunked into 7 pieces
All chunks vectorized
RAG context created with 4 relevant chunks
Correct answers: 9/10
Current average score: 0.7400
Document chunked into 591 pieces
All chunks vectorized
RAG context created with 6 relevant chunks
Correct answers: 8/10
Current average score: 0.7500
Document chunked into 591 pieces
All chunks vectorized
RAG context created with 8 relevant chunks
Correc

In [6]:
# Load the E5 multilingual model for embedding.
# The evaluation loop calls `model.encode(...)` on both the document chunks and
# the questions, so this cell has to be executed before that loop.
model = SentenceTransformer('intfloat/multilingual-e5-large')

# Function to chunk text with overlap
def chunk_text(text, chunk_size=512, overlap=100):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start `chunk_size - overlap` words apart, so each chunk
    repeats the last `overlap` words of the previous one. Chunking stops as
    soon as a chunk reaches the end of the text, which avoids emitting a
    trailing chunk that is fully contained in the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        A list of chunk strings (words re-joined with single spaces); an empty
        list when the text contains no words.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size (that would skip words, or never
            advance through the text).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))

        # Stop once this chunk has reached the end of the text
        if start + chunk_size >= len(words):
            break

    return chunks

# Function to find top k similar chunks
def find_top_k_chunks(query_embedding, chunk_embeddings, chunks, k=10):
    """Return the k chunks most similar to the query, best match first.

    Args:
        query_embedding: Embedding vector of the query.
        chunk_embeddings: Embeddings of the chunks, one row per entry of
            `chunks` and in the same order.
        chunks: The chunk texts.
        k: Maximum number of chunks to return.

    Returns:
        A list of (chunk, cosine_similarity) tuples ordered from most to least
        similar. It holds at most k entries and is empty when k <= 0 or there
        are no chunks.
    """
    # Without this guard k=0 would slice with [-0:], which selects every chunk
    # instead of none; an empty corpus has nothing to rank either.
    if k <= 0 or len(chunks) == 0:
        return []

    # Calculate similarity between query and all chunks
    similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # argsort is ascending: reverse it for best-first order, then keep the top k
    top_indices = np.argsort(similarities)[::-1][:k]

    # Return the top k chunks and their similarity scores
    return [(chunks[i], similarities[i]) for i in top_indices]

In [10]:
# Mean accuracy over all scored project calls. Falls back to NaN when no call
# was scored, instead of raising ZeroDivisionError.
res = sum(answers_score)/len(answers_score) if answers_score else float("nan")
res

0.6508474576271186