# Data Processing


importing libraries

In [1]:
from docx import Document
import re
import unicodedata
import tiktoken
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_data(doc_path):
    doc = Document(doc_path)
    current_headings = []  
    results = []
    for para in doc.paragraphs:
        style_name = para.style.name
        if style_name.startswith('Heading 1'):
            current_headings = [para.text.strip()]
        elif style_name.startswith('Heading 2'):
            if len(current_headings) >= 1:
                if len(current_headings) == 1:
                    current_headings.append(para.text.strip())
                else:
                    current_headings[1] = para.text.strip()
            else:
                current_headings = [None, para.text.strip()]
        else:
            if para.text.strip():
                if len(current_headings)>1:
                    metadata = {
                        "heading 1":  current_headings[0],
                        "heading 2": current_headings[1]
                    }
                else:
                    metadata = {
                        "heading 1":  current_headings[0],
                        "heading 2": None
                    }
                results.append({
                    "text": para.text.strip(),
                    "metadata": metadata
                })
    return results


In [3]:
file_path="dataset.docx"

In [4]:
sections = extract_data(file_path)

## preprocessing

### 1. normalization

In [5]:
def normalize_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = unicodedata.normalize("NFKC", text)
    text = ''.join(c for c in text if c.isprintable())
    return text.strip()

### 2. clean punctuation

In [6]:
def clean_punctuation(text):
    text = re.sub(r'[“”«»]', '"', text)
    text = re.sub(r"[’‘]", "'", text)
    text = re.sub(r"[–—]", "-", text)
    return text


### 3. strip noise

In [7]:
def remove_noise(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d{5,}', '', text)
    return text


### 4. paraghraphs

In [8]:
def split_paragraphs(text):
    return [para.strip() for para in text.split('\n') if para.strip()]

In [9]:
def preprocess_data(raw_text):
    text = normalize_text(raw_text)
    text = clean_punctuation(text)
    paragraphs = split_paragraphs(text)
    return paragraphs


In [10]:
for sec in sections:
    sec['text']=preprocess_data(sec['text'])

## tokenizer and Chunk

In [11]:
def tokenize_chunk(text, metadata, tokenizer_name="gpt2", chunk_size=300, overlap=0.15):
    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(text[0])

    step = int(chunk_size * (1 - overlap))
    dataset = []

    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunk_text = enc.decode(chunk_tokens)
        
        chunk_entry = {
            "text": chunk_text,
            "metadata": metadata.copy()
        }
        dataset.append(chunk_entry)

        if end >= len(tokens):
            break

    return dataset


In [None]:
dataset=[]

In [13]:
for section in sections:
    dataset.append(tokenize_chunk(section['text'],section['metadata']))

In [14]:
dataset_new=[]

In [15]:
for i in dataset:
    dataset_new.append(i[0])

In [16]:
len(dataset_new)

13

In [20]:
model =SentenceTransformer( 'sentence-transformers/all-MiniLM-L6-v2')

In [None]:
def upload_chunks_to_qdrant(
    dataset,  # list of {'text': ..., 'metadata': {...}}
    collection_name="French_population_structure",
    qdrant_url="http://localhost",
    qdrant_port=8080,
    model = model
):
    client = QdrantClient(url=qdrant_url, port=qdrant_port)
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE
        )
    )

    points = []
    for i, item in enumerate(dataset):
        vector = model.encode(item["text"]).tolist()
        payload = item["metadata"]
        points.append(PointStruct(id=i, vector=vector, payload=payload))

    # آپلود در Qdrant
    client.upsert(
        collection_name=collection_name,
        points=points
    )

    print(f"{len(points)} documents uploaded to collection '{collection_name}'.")

In [35]:
upload_chunks_to_qdrant(dataset_new)

  client.recreate_collection(


13 documents uploaded to collection 'French_population_structure_1'.


## Testing the Vector Store

In [38]:
client = QdrantClient(
    url="http://localhost", 
    port=8080)

In [None]:
query = "French is nice and big"
query_vector = model.encode([query])[0]

results = client.search(
    collection_name="French_population_structure",
    query_vector=query_vector,
    limit=3,
    with_vectors=True
)

{'heading 1': 'People of France', 'heading 2': 'Languages'} 0.42507312
{'heading 1': 'Demographic trends', 'heading 2': 'Emigration'} 0.40637708
{'heading 1': 'People of France', 'heading 2': 'Religion of France'} 0.36920825


  results = client.search(


In [46]:
print(f"\n🔍 Query: {query}\n")
for i, result in enumerate(results, 1):
    original_text = result.payload.get("text")
    score = result.score
    vector=result.vector
    print(f"{i}. finded text {original_text}\n similarity: {score:.4f}\n vector:{vector}\n")


🔍 Query: French is nice and big

1. finded text None
 similarity: 0.4251
 vector:[0.09290737, -0.043803852, 0.028328842, -0.05070284, 0.0058285655, -0.019271452, -0.01698876, -0.038476974, 0.11232263, -0.062367573, 0.09372031, 0.012757036, 0.0018438933, 0.032819334, -0.01704579, -0.01915422, -0.020260064, 0.009843415, 0.0100483615, 0.03027189, -0.022601932, 0.041678343, 0.064721435, 0.025454398, 0.07673996, -0.0385019, 0.06843168, -0.019455528, 0.016665433, 0.013284485, -0.0017521897, 0.07987852, -0.010402895, 0.00020080197, -0.031813283, 0.040322382, 0.06458855, -0.03020859, -0.057250645, 0.05950742, -0.1259576, -0.0085038375, -0.007807345, 0.008797372, 0.061959278, 0.07970116, -0.0065674554, 0.051809426, -0.07386145, -0.0058509195, -0.010719115, -0.061111957, 0.034603022, -0.0019859732, -0.050172612, -0.017486723, -0.041646317, 0.057158183, 0.0005322435, 0.0072963433, -0.066960454, -0.031279385, 0.016603203, -0.0060158432, -0.044940718, 0.02943615, 0.01033384, 0.015912063, -0.112883