# Data Processing


importing libraries

In [18]:
from docx import Document
import re
import unicodedata
import tiktoken
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
def extract_data(doc_path):
    """Extract non-empty body paragraphs from a .docx file with their headings.

    Paragraphs are walked in document order. A paragraph whose style name
    starts with 'Heading 1' or 'Heading 2' updates the current heading
    context; every other non-blank paragraph becomes one result record.

    Args:
        doc_path: Path (or file-like object) passed straight to
            ``docx.Document``.

    Returns:
        A list of dicts of the form
        ``{"text": <stripped paragraph text>,
           "metadata": {"heading 1": <str or None>, "heading 2": <str or None>}}``.
        A heading value is None when no heading of that level is in effect.
    """
    doc = Document(doc_path)
    # Track the two heading levels separately; both start as None so that
    # body text appearing before the first heading no longer raises
    # IndexError (the previous list-based bookkeeping indexed an empty list).
    heading_1 = None
    heading_2 = None
    results = []
    for para in doc.paragraphs:
        text = para.text.strip()
        style_name = para.style.name
        if style_name.startswith('Heading 1'):
            # A new top-level heading invalidates the previous sub-heading.
            heading_1 = text
            heading_2 = None
        elif style_name.startswith('Heading 2'):
            heading_2 = text
        elif text:
            results.append({
                "text": text,
                # A fresh dict per record so callers can mutate one record's
                # metadata without affecting the others.
                "metadata": {
                    "heading 1": heading_1,
                    "heading 2": heading_2,
                },
            })
    return results


In [20]:
# Path to the source Word document, relative to the working directory.
file_path="dataset.docx"

In [21]:
# Parse the document into a list of {"text", "metadata"} paragraph records.
sections = extract_data(file_path)

## preprocessing

### 1. normalization

In [22]:
def normalize_text(text):
    """Return *text* Unicode-normalized with clean, single-spaced whitespace.

    Steps, in order:
      1. NFKC normalization (e.g. ligatures and non-breaking spaces are
         folded to their plain equivalents).
      2. Every run of whitespace (including newlines and tabs) becomes one
         space.
      3. Remaining non-printable characters (e.g. zero-width spaces) are
         dropped.
      4. Spaces that became adjacent because of step 3 are collapsed again.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string with no leading or trailing whitespace.
    """
    # Normalize first so that any whitespace produced by compatibility
    # decomposition is collapsed by the next step.
    text = unicodedata.normalize("NFKC", text)
    # Must run before the printable filter: newlines/tabs are non-printable
    # and would otherwise be deleted instead of turned into a separator.
    text = re.sub(r'\s+', ' ', text)
    text = ''.join(c for c in text if c.isprintable())
    # Dropping a non-printable character that sat between two spaces leaves
    # a double space behind; collapse those.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

### 2. clean punctuation

In [23]:
def clean_punctuation(text):
    """Replace typographic quotes and dashes in *text* with ASCII equivalents.

    Curly and angle double quotes become '"', curly single quotes become
    "'", and en/em dashes become '-'. All other characters are untouched.
    """
    # (character class, ASCII replacement), applied in this order.
    substitutions = (
        (r'[“”«»]', '"'),
        (r"[’‘]", "'"),
        (r"[–—]", "-"),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


### 3. strip noise

In [24]:
def remove_noise(text):
    """Strip URLs, e-mail-like tokens and long digit runs from *text*.

    Matches are deleted (replaced by the empty string); the whitespace
    around them is left as-is, so callers may see doubled spaces.
    """
    # Tokens starting with http / www (the https alternative is kept
    # verbatim from the original pattern).
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # Any whitespace-delimited token containing '@'.
    without_emails = re.sub(r'\S+@\S+', '', without_urls)
    # Runs of five or more consecutive digits.
    return re.sub(r'\d{5,}', '', without_emails)


### 4. paragraphs

In [25]:
def split_paragraphs(text):
    """Split *text* on newlines and return the non-blank, stripped pieces."""
    stripped_lines = map(str.strip, text.split('\n'))
    # filter(None, ...) discards the empty strings left by blank lines.
    return list(filter(None, stripped_lines))

In [26]:
def preprocess_data(raw_text):
    """Run the preprocessing pipeline on *raw_text* and return paragraphs.

    The text is normalized, its punctuation is cleaned, and the result is
    split on newlines into a list of non-blank paragraphs.

    NOTE: normalization collapses all whitespace (newlines included) before
    the split, so the returned list normally has at most one element.
    """
    return split_paragraphs(clean_punctuation(normalize_text(raw_text)))


In [27]:
# Replace each section's raw text, in place, with its preprocessed form
# (the list of paragraphs returned by preprocess_data).
# NOTE: not idempotent -- re-running this cell would feed that list back
# into preprocess_data, which expects a string.
for sec in sections:
    sec['text']=preprocess_data(sec['text'])

## Tokenization and chunking

In [28]:
def tokenize_chunk(text, metadata, tokenizer_name="gpt2", chunk_size=300, overlap=0.15):
    """Split text into overlapping, token-bounded chunks.

    Args:
        text: Either a single string or a sequence of paragraph strings.
            Paragraphs are joined with a newline before tokenizing, so no
            paragraph is dropped.
        metadata: Dict attached (as a shallow copy) to every chunk.
        tokenizer_name: Name of the tiktoken encoding to use.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Fraction of ``chunk_size`` shared between consecutive
            chunks; must satisfy ``0 <= overlap < 1``.

    Returns:
        A list of ``{"text": <decoded chunk>, "metadata": <dict copy>}``
        entries in document order. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < 1:
        raise ValueError("overlap must satisfy 0 <= overlap < 1")

    # Previously only text[0] was encoded: the first paragraph of a list
    # (IndexError on an empty list) or just the first character of a str.
    full_text = text if isinstance(text, str) else "\n".join(text)
    if not full_text:
        return []

    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(full_text)

    # Clamp to 1 so a small chunk_size cannot produce a zero step, which
    # range() rejects.
    step = max(1, int(chunk_size * (1 - overlap)))
    dataset = []

    for start in range(0, len(tokens), step):
        end = start + chunk_size
        dataset.append({
            "text": enc.decode(tokens[start:end]),
            # Copy so that chunks do not share one mutable metadata dict.
            "metadata": metadata.copy(),
        })

        # This window already reached the last token; any further window
        # would only repeat the overlapping tail.
        if end >= len(tokens):
            break

    return dataset


In [29]:
# Accumulator for the per-section chunk lists produced by tokenize_chunk.
dataset=[]

In [30]:
# Chunk every section. Each appended element is itself the list of chunks
# for one section, so `dataset` ends up as a list of lists.
for section in sections:
    dataset.append(tokenize_chunk(section['text'],section['metadata']))

In [52]:
# Flat list of chunk records, built from `dataset`, that gets uploaded.
dataset_new=[]

In [53]:
# Flatten the per-section chunk lists into one list of chunk records.
# extend() keeps every chunk of a section (indexing with [0] kept only the
# first one) and simply adds nothing for a section that produced no chunks.
for i in dataset:
    dataset_new.extend(i)

In [55]:
# Number of chunk records collected for upload.
len(dataset_new)

13

In [35]:
def upload_chunks_to_qdrant(
    dataset,  # list of {'text': ..., 'metadata': {...}}
    collection_name="French_population_structure",
    qdrant_url="http://localhost",
    qdrant_port=8080
):
    """Embed chunk records and upload them to a freshly created Qdrant collection.

    Any existing collection named ``collection_name`` is deleted first, so
    the collection afterwards contains exactly the uploaded points.

    Args:
        dataset: Iterable of ``{"text": str, "metadata": dict}`` records.
        collection_name: Name of the Qdrant collection to (re)create.
        qdrant_url: Base URL of the Qdrant server.
        qdrant_port: Port of the Qdrant server.

    Each point's id is the record's position in ``dataset``; its payload is
    the record's metadata plus the chunk text under the key ``"text"``.
    Prints a one-line summary; returns None.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    client = QdrantClient(url=qdrant_url, port=qdrant_port)

    # recreate_collection is deprecated in qdrant-client; do the equivalent
    # drop-then-create explicitly.
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE
        )
    )

    # Generate embeddings and prepare the points.
    records = list(dataset)
    points = []
    if records:
        # Encode all texts in one call so the model can batch them.
        vectors = model.encode([item["text"] for item in records])
        for i, (item, vector) in enumerate(zip(records, vectors)):
            # Store the chunk text next to its metadata; otherwise a search
            # hit would carry only the headings and not the content.
            payload = {**item["metadata"], "text": item["text"]}
            points.append(PointStruct(id=i, vector=vector.tolist(), payload=payload))

        # Upload to Qdrant (skipped entirely when there is nothing to send).
        client.upsert(
            collection_name=collection_name,
            points=points
        )

    print(f"{len(points)} documents uploaded to collection '{collection_name}'.")

In [56]:
# Embed the collected chunks and upload them using the default collection
# name and server settings.
upload_chunks_to_qdrant(dataset_new)

  client.recreate_collection(


13 documents uploaded to collection 'French_population_structure'.
