# Data Processing


importing libraries

In [94]:
from docx import Document
import re
import unicodedata
import tiktoken

In [95]:
# Word document to process. The path is relative, so it is resolved against
# the current working directory.
file_path="dataset.docx"

In [96]:
def extract_data(doc_path):
    """Collect the body paragraphs of a Word document with their headings.

    The document is walked paragraph by paragraph. A paragraph whose style
    name starts with 'Heading 1' resets the heading trail; one whose style
    name starts with 'Heading 2' fills or replaces the second slot of the
    trail (the first slot is ``None`` when no level-1 heading has been seen).
    Every other paragraph with non-blank text is emitted as a record.

    Args:
        doc_path: Value passed straight to ``Document``.

    Returns:
        A list of dicts shaped like
        ``{"text": <stripped text>, "metadata": {"headings": [...]}}``,
        where ``headings`` is a copy of the trail in effect at that paragraph.
    """
    heading_trail = []
    records = []

    for paragraph in Document(doc_path).paragraphs:
        style = paragraph.style.name
        content = paragraph.text.strip()

        if style.startswith('Heading 1'):
            # A new top-level heading discards any previous sub-heading.
            heading_trail = [content]
            continue

        if style.startswith('Heading 2'):
            if not heading_trail:
                heading_trail = [None, content]
            elif len(heading_trail) == 1:
                heading_trail.append(content)
            else:
                heading_trail[1] = content
            continue

        if content:
            records.append({
                "text": content,
                # Copy so later heading changes do not alter this record.
                "metadata": {"headings": heading_trail.copy()},
            })

    return records


In [97]:
# Parse the document into a list of {"text": ..., "metadata": ...} records.
sections = extract_data(file_path)

## preprocessing

### 1. normalization

In [98]:
def normalize_text(text):
    """Return *text* in a canonical, single-line form.

    Steps, in order:
      1. Apply Unicode NFKC normalization (folds compatibility forms such as
         full-width letters into their plain equivalents).
      2. Drop characters that are neither printable nor whitespace
         (control and format characters such as U+200B).
      3. Collapse every run of whitespace, newlines and tabs included, into
         a single space and trim both ends.

    Whitespace is collapsed last on purpose: NFKC can introduce new spaces
    (e.g. U+00B4 becomes a space plus a combining accent), and removing an
    invisible character that sat between two spaces leaves them adjacent.
    Collapsing first would let those doubled spaces through.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty if nothing printable remains.
    """
    text = unicodedata.normalize("NFKC", text)
    # Keep whitespace here even though most of it is "non-printable";
    # it is turned into plain spaces by the next step instead of vanishing.
    text = ''.join(c for c in text if c.isprintable() or c.isspace())
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

### 2. clean punctuation

In [99]:
def clean_punctuation(text):
    """Replace typographic quotes and dashes with plain ASCII characters.

    Curly double quotes and guillemets become '"', curly single quotes
    become "'", and en/em dashes become '-'. Everything else is unchanged.
    """
    substitutions = (
        (r'[“”«»]', '"'),
        (r"[’‘]", "'"),
        (r"[–—]", "-"),
    )
    for pattern, plain in substitutions:
        text = re.sub(pattern, plain, text)
    return text


### 3. strip noise

In [100]:
def remove_noise(text):
    """Delete URLs, e-mail-like tokens and long digit runs from *text*.

    Removed, in this order:
      * any token starting at ``http`` or ``www`` up to the next whitespace,
      * any whitespace-delimited token containing ``@``,
      * any run of five or more consecutive digits.

    The surrounding whitespace is left in place, so removals can leave
    doubled spaces behind.
    """
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', text)
    without_emails = re.sub(r'\S+@\S+', '', without_urls)
    return re.sub(r'\d{5,}', '', without_emails)


### 4. paragraphs

In [101]:
def split_paragraphs(text):
    """Split *text* on newlines and return the non-blank lines, trimmed.

    Lines that are empty or whitespace-only are skipped, so an empty input
    yields an empty list.
    """
    paragraphs = []
    for line in text.split('\n'):
        trimmed = line.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs

In [102]:
def preprocess_data(raw_text):
    """Run the cleaning pipeline on *raw_text* and return a list of paragraphs.

    The text is normalized, its punctuation is converted to ASCII, and the
    result is split on newlines. ``remove_noise`` is not part of this
    pipeline.

    NOTE(review): ``normalize_text`` replaces every whitespace run, newlines
    included, with a single space, so the newline split presumably yields at
    most one paragraph. Confirm whether splitting was meant to happen before
    normalization.
    """
    return split_paragraphs(clean_punctuation(normalize_text(raw_text)))


In [103]:
# Replace each section's raw string with the list of cleaned paragraphs
# returned by preprocess_data (mutates `sections` in place).
# NOTE(review): because the string is overwritten with a list, running this
# cell a second time would pass lists back into preprocess_data. Re-run the
# extraction cell first if this needs to be repeated.
for sec in sections:
    sec['text']=preprocess_data(sec['text'])

## tokenizer

In [115]:
def tokenize_chunk(text, metadata, tokenizer_name="gpt2", chunk_size=300, overlap=0.15):
    """Split text into overlapping token windows and decode each to a string.

    Args:
        text: Either one string or a sequence of paragraph strings (the shape
            ``preprocess_data`` returns). Paragraphs are joined with a
            newline before encoding so that none of them is dropped.
        metadata: Dict attached to every chunk. Each chunk gets its own
            shallow copy, so nested values are still shared between chunks.
        tokenizer_name: Encoding name passed to ``tiktoken.get_encoding``.
        chunk_size: Maximum number of tokens per chunk; must be at least 1.
        overlap: Fraction of ``chunk_size`` shared by consecutive chunks;
            must satisfy ``0 <= overlap < 1``.

    Returns:
        A list of ``{"text": <decoded chunk>, "metadata": <copy>}`` dicts in
        document order. Empty when there is no text to encode.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must satisfy 0 <= overlap < 1, got {overlap!r}")

    # Accept a bare string as well as a list of paragraphs. Indexing a string
    # with [0] would encode only its first character.
    joined = text if isinstance(text, str) else "\n".join(text)
    if not joined:
        return []

    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(joined)

    # Truncation can round the stride down to 0 for tiny chunk sizes, which
    # range() rejects; always advance by at least one token.
    step = max(1, int(chunk_size * (1 - overlap)))
    chunks = []

    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunks.append({
            "text": enc.decode(tokens[start:end]),
            "metadata": metadata.copy(),
        })
        # This window already reached the end of the tokens; any further
        # window would only repeat a suffix of it.
        if end >= len(tokens):
            break

    return chunks


## chunking

In [117]:
# Accumulator for the chunked output; one entry is appended per section.
dataset=[]

In [118]:
# Chunk every section. Each call returns a list of chunk dicts, and that
# list is appended as one entry, so `dataset` is a list of lists.
# `tokenize_chunk` is the chunking function defined in this notebook; the
# name used here before, `process_and_chunk_data`, has no definition in the
# visible notebook and would raise NameError on a fresh kernel.
for section in sections:
    dataset.append(tokenize_chunk(section['text'], section['metadata']))

In [119]:
# Bare expression: displays the chunked dataset when run as a notebook cell.
dataset

[[{'text': 'The French are, paradoxically, strongly conscious of belonging to a single nation, but they hardly constitute a unified ethnic group by any scientific gauge. Before the official discovery of the Americas at the end of the 15th century, France, located on the western extremity of the Old World, was regarded for centuries by Europeans as being near the edge of the known world. Generations of different migrants traveling by way of the Mediterranean from the Middle East and Africa and through Europe from Central Asia and the Nordic lands settled permanently in France, forming a variegated grouping, almost like a series of geologic strata, since they were unable to migrate any farther. Perhaps the oldest reflection of these migrations is furnished by the Basque people, who live in an isolated area west of the Pyrenees in both Spain and France, who speak a language unrelated to other European languages, and whose origin remains unclear. The Celtic tribes, known to the Romans as G