In [None]:
!pip install transformers python-docx torch

In [None]:
from docx import Document
import sqlite3
import numpy as np
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
from nltk.tokenize import sent_tokenize

In [None]:
# Path of the .docx file to index; this value is a placeholder and must be replaced with a real file path.
document_path = "your_document_path(CV)"
# SQLite database file in which the section embeddings are stored.
database_path = "vector_database.db"

# Fine-Tuning / Training the Model

In [None]:
# NOTE: everything in this cell is commented out, so nothing here runs.
# It sketches fine-tuning a BERT question-answering model on SQuAD with the Hugging Face Trainer.
# NOTE(review): before enabling it, confirm that the `datasets` package is installed (it is not in the
# install cell) and that the raw SQuAD splits are tokenized/preprocessed before being passed to Trainer.

# from transformers import AutoModelForQuestionAnswering, Trainer, TrainingArguments
# from datasets import load_dataset

# model_name = "bert-base-uncased"
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# dataset = load_dataset("squad")

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset["train"],
#     eval_dataset=dataset["validation"],
# )

# trainer.train()

In [None]:
def extract_sections_from_docx(doc_path):
    doc = Document(doc_path)
    sections = {}
    current_section = None
    current_text = []

    for paragraph in doc.paragraphs:
        if paragraph.text.isupper():
            if current_section:
                sections[current_section] = " ".join(current_text)
            current_section = paragraph.text.strip()
            current_text = []
        else:
            current_text.append(paragraph.text.strip())

    if current_section:
        sections[current_section] = " ".join(current_text)

    return sections

In [None]:
# Loaded (tokenizer, model) pairs keyed by model name, so the weights are
# read once per kernel session instead of once per call.
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _EMBEDDING_MODEL_CACHE[model_name]


def text_to_vector(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed ``text`` as a single vector using a transformer encoder.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is averaged
    over the token dimension.

    Args:
        text: The string to embed.
        model_name: Hugging Face model identifier to load. Defaults to the
            model this notebook has always used.

    Returns:
        numpy.ndarray holding the mean-pooled embedding.
    """
    tokenizer, model = _load_embedding_model(model_name)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    # mean over dim=1 (tokens), then drop the size-1 batch dimension.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [None]:
def save_vectors_to_database(sections, database_path):
    """Embed every section and store the results in a fresh ``vectors`` table.

    Any existing ``vectors`` table in the database is dropped and recreated,
    so the table always reflects exactly the sections passed in.

    Args:
        sections: dict mapping section name to section text.
        database_path: Path of the SQLite database file.

    Each row stores the section name, the embedding serialized as a
    comma-separated string of numbers, and the original section text (which
    the question-answering step reads back as its context).
    """
    # Compute all embeddings before touching the database, so a failure while
    # embedding does not leave the table dropped and empty.
    rows = [
        (section, ",".join(map(str, text_to_vector(text))), text)
        for section, text in sections.items()
    ]

    conn = sqlite3.connect(database_path)
    try:
        cursor = conn.cursor()
        cursor.execute("DROP TABLE IF EXISTS vectors")
        cursor.execute("""
        CREATE TABLE IF NOT EXISTS vectors (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            section TEXT,
            vector TEXT,
            text TEXT
        )
        """)
        # The text column must be written too; leaving it NULL makes every
        # later lookup report that the section has no text.
        cursor.executemany(
            "INSERT INTO vectors (section, vector, text) VALUES (?, ?, ?)", rows
        )
        conn.commit()
    finally:
        # Close the connection even if a statement above raises.
        conn.close()

In [None]:
# Question-answering pipelines keyed by model name, so the model is built
# once per kernel session instead of on every query.
_QA_PIPELINE_CACHE = {}


def _get_qa_pipeline(model_name):
    """Return the cached question-answering pipeline for ``model_name``, creating it on first use."""
    if model_name not in _QA_PIPELINE_CACHE:
        _QA_PIPELINE_CACHE[model_name] = pipeline("question-answering", model=model_name)
    return _QA_PIPELINE_CACHE[model_name]


def _cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / denominator)


def _split_sentences(text):
    """Split ``text`` into sentences, fetching the NLTK Punkt data if it is missing."""
    try:
        return sent_tokenize(text)
    except LookupError:
        # NLTK raises LookupError when the tokenizer data has not been downloaded.
        import nltk

        nltk.download("punkt", quiet=True)
        nltk.download("punkt_tab", quiet=True)
        return sent_tokenize(text)


def find_exact_answer(query, database_path, max_context_length=512, max_sentences=5,
                      qa_model_name="deepset/roberta-large-squad2"):
    """Answer ``query`` from the stored section most similar to it.

    The query is embedded with ``text_to_vector`` and compared by cosine
    similarity against every stored section vector. The text of the best
    matching section is shortened and passed, together with the query, to an
    extractive question-answering model.

    Args:
        query: The question to answer.
        database_path: Path of the SQLite database holding the ``vectors`` table.
        max_context_length: Maximum number of characters of section text kept
            as context (``None`` keeps everything).
        max_sentences: Maximum number of sentences kept after the character
            cut (``None`` keeps everything).
        qa_model_name: Hugging Face model identifier for the QA pipeline.

    Returns:
        Tuple ``(section_name, answer_text, score)``. If the chosen section
        has no text, or the model returns no answer, ``answer_text`` is an
        explanatory message and ``score`` is ``0.0``.

    Raises:
        ValueError: If ``query`` is empty or whitespace-only, or if the
            ``vectors`` table contains no rows.
    """
    # Validate first so a bad query does not cost a database read and an embedding.
    if not query or not query.strip():
        raise ValueError("The query/question cannot be empty.")

    conn = sqlite3.connect(database_path)
    try:
        rows = conn.execute("SELECT section, vector, text FROM vectors").fetchall()
    finally:
        conn.close()

    if not rows:
        raise ValueError("The vector database contains no sections to search.")

    sections = [row[0] for row in rows]
    vectors = [np.array([float(value) for value in row[1].split(",")]) for row in rows]
    texts = [row[2] or "" for row in rows]

    query_vector = text_to_vector(query)
    similarities = [_cosine_similarity(query_vector, vector) for vector in vectors]
    best_index = int(np.argmax(similarities))
    best_section = sections[best_index]

    context = texts[best_index]
    if not context.strip():
        return best_section, "No text available in this section.", 0.0

    # Character cut first, then keep only the leading sentences of what remains.
    context = context[:max_context_length]
    context = " ".join(_split_sentences(context)[:max_sentences])

    qa_model = _get_qa_pipeline(qa_model_name)
    answer = qa_model(question=query, context=context)

    if not answer or 'answer' not in answer:
        return best_section, "No valid answer found.", 0.0

    return best_section, answer['answer'], answer['score']

In [None]:
# Build the index: split the document into heading-delimited sections, then embed each
# section and store it in SQLite (the existing `vectors` table is dropped and recreated).
sections = extract_sections_from_docx(document_path)
save_vectors_to_database(sections, database_path)

In [None]:
# Example query: find the stored section closest to the question and extract an answer from it.
query = "What is his email address?"
# find_exact_answer returns (section name, answer text, model score).
section, exact_answer, confidence = find_exact_answer(query, database_path)
print(f"Closest section: {section}")
print(f"Answer: {exact_answer} (Confidence: {confidence:.2f})")