# Data set

Do przechowywania danych użyto SQLite ze względu na jego kompatybilność z Rustem i Reactem (w których projekt został napisany), prostą implementację, a szczególnie na szybki i niskokosztowy dostęp do danych. Gdyby miał to być serwer obsługujący większą liczbę zapytań, wybrałbym Mongo, ale dla prostej aplikacji offline, nieobsługującej dużej liczby zapytań, odpowiednia jest baza SQL.


Dane pobrano ze strony: https://dumps.wikimedia.org/enwiki/latest/ 


Posłużono się wersją Simple Wiki [Eng]

Dane z pliku XML przeparsowano na pliki txt narzędziem wikiextractor (https://github.com/attardi/wikiextractor) w wersji lekko zmodyfikowanej na potrzeby tego zadania.


Oddzielnie wprowadzono dane z plików txt do bazy sqlite 

In [None]:
import sqlite3
import re
import glob

def parse_file(file_path):
    """Extract every ``<doc ...>...</doc>`` article from an extracted text file.

    Args:
        file_path: Path to a UTF-8 text file containing ``<doc>`` elements
            with ``id``, ``url`` and ``title`` attributes (in that order).

    Returns:
        A list of ``(doc_id, title, url, text)`` tuples of strings, in file
        order. Elements whose opening tag does not match the expected
        attribute layout are skipped.
    """
    # The body is matched lazily up to the first closing tag, with DOTALL so
    # it can span lines. Matching it as ``[^<]+`` silently dropped every
    # article whose text contains a literal "<".
    pattern = re.compile(
        r'<doc id="(\d+)" url="(https?://[^"]+)" title="([^"]+)">(.+?)</doc>',
        re.DOTALL,
    )
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # The regex captures (id, url, title, text); rows are stored as
    # (id, title, url, text) to match the database column order.
    return [
        (doc_id, title, url, text)
        for doc_id, url, title, text in pattern.findall(content)
    ]

def create_db_and_insert_data(docs, db_path='articles.db'):
    """Create the ``articles`` table if needed and insert ``docs`` into it.

    Args:
        docs: Iterable of ``(id, title, url, text)`` rows.
        db_path: SQLite database file to write to. Defaults to
            ``'articles.db'``, the path that used to be hard-coded.

    Raises:
        sqlite3.IntegrityError: If a row's ``id`` is already present in the
            table. The whole batch is rolled back in that case.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Used as a context manager, the connection commits on success and
        # rolls back when an exception escapes, so a failed batch is never
        # left half-written.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS articles (
                    id INTEGER PRIMARY KEY,
                    title TEXT,
                    url TEXT,
                    text TEXT
                )
            ''')
            conn.executemany('''
                INSERT INTO articles (id, title, url, text)
                VALUES (?, ?, ?, ?)
            ''', docs)
    finally:
        # ``with conn`` does not close the connection; close it explicitly so
        # the database file is released even when the insert fails.
        conn.close()

def main():
    """Import the extracted text files from the AA, AC and AB folders.

    Each folder is handled on its own: all of its files are parsed with
    ``parse_file`` and the collected rows are written in one call to
    ``create_db_and_insert_data``. A folder that yields no documents only
    prints a notice.
    """
    folders = ['AA/*', 'AC/*', 'AB/*']
    for folder_pattern in folders:
        # The glob call previously received an undefined name ``f`` and
        # raised NameError before any file was read.
        file_paths = glob.glob(folder_pattern)
        all_docs = []
        for file_path in file_paths:
            print(f'Przetwarzam plik: {file_path}')
            all_docs.extend(parse_file(file_path))

        if all_docs:
            create_db_and_insert_data(all_docs)
            print("Dane zostały zapisane w bazie danych.")
        else:
            print("Brak danych do zapisania.")

Alternatywna wersja rozwiązania (użycie API Wikipedii i ściąganie losowych artykułów)

In [None]:
# implementacja wspomagana przez AI
import sqlite3
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tqdm import tqdm

DATABASE_NAME = "wikipedia_fast.db"
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"
TARGET_ARTICLE_COUNT = 300000
THREADS = 50  
BATCH_SIZE = 500 

def init_db():
    """Create the ``articles`` table and its title index if they are missing.

    Opens the database named by ``DATABASE_NAME``, runs both idempotent DDL
    statements, commits and closes the connection.
    """
    create_table_sql = """
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT UNIQUE,
        content TEXT
    )
    """
    create_index_sql = "CREATE INDEX IF NOT EXISTS idx_title ON articles(title)"

    connection = sqlite3.connect(DATABASE_NAME)
    db_cursor = connection.cursor()
    for statement in (create_table_sql, create_index_sql):
        db_cursor.execute(statement)
    connection.commit()
    connection.close()

def get_article_list(limit):
    """Return up to ``limit`` non-redirect page titles from the ``allpages`` list.

    The API caps how many pages one request may return, so asking for the
    whole ``limit`` in a single call yields only the first page of results.
    This function keeps requesting with the ``continue`` token returned by
    the API until enough titles are collected or the listing is exhausted.

    Args:
        limit: Maximum number of titles to return.

    Returns:
        A list of at most ``limit`` titles, in the order the API lists them.

    Raises:
        requests.RequestException: On a network failure, timeout or HTTP
            error status.
    """
    max_per_request = 500  # per-request ceiling for regular (non-bot) clients
    titles = []
    params = {
        "action": "query",
        "format": "json",
        "list": "allpages",
        "apfilterredir": "nonredirects",
    }
    while len(titles) < limit:
        # Never ask for more than is still missing, so the result cannot
        # overshoot ``limit``.
        params["aplimit"] = min(max_per_request, limit - len(titles))
        response = requests.get(WIKIPEDIA_API_URL, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        titles.extend(page["title"] for page in data["query"]["allpages"])

        continuation = data.get("continue")
        if not continuation:
            break  # the listing has no further pages
        # The continuation values must be echoed back on the next request.
        params.update(continuation)
    return titles

def fetch_article(title):
    """Fetch the plain-text extract of one article.

    Args:
        title: Page title to request.

    Returns:
        ``(title, extract)`` on success, where ``extract`` is ``""`` when the
        returned page carries no ``extract`` field; ``None`` when the request
        or the response parsing fails.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": title,
        "explaintext": True,
        "exsectionformat": "plain"
    }
    try:
        response = requests.get(WIKIPEDIA_API_URL, params=params, timeout=10)
        # Treat HTTP error responses as failures instead of trying to parse
        # their body as an article.
        response.raise_for_status()
        data = response.json()
        page = next(iter(data["query"]["pages"].values()))
        return (title, page.get("extract", ""))
    except Exception:
        # Best-effort: one failed article must not abort the whole download.
        # ``Exception`` (rather than a bare ``except``) lets SystemExit and
        # KeyboardInterrupt propagate.
        return None

def save_batch(batch):
    """Store a batch of ``(title, content)`` pairs in the ``articles`` table.

    Pairs with empty content are left out, and rows rejected by a table
    constraint are skipped through ``INSERT OR IGNORE``.
    """
    connection = sqlite3.connect(DATABASE_NAME)
    rows = [(name, body) for name, body in batch if body]
    connection.cursor().executemany(
        "INSERT OR IGNORE INTO articles (title, content) VALUES (?, ?)",
        rows,
    )
    connection.commit()
    connection.close()

def main():
    """Download articles until the database holds ``TARGET_ARTICLE_COUNT`` rows.

    Counts the rows already stored, requests a list of candidate titles,
    fetches them concurrently on a thread pool and writes the results in
    batches of ``BATCH_SIZE``. Returns immediately when the target is
    already met.
    """
    init_db()

    connection = sqlite3.connect(DATABASE_NAME)
    db_cursor = connection.cursor()
    db_cursor.execute("SELECT COUNT(*) FROM articles")
    (already_stored,) = db_cursor.fetchone()
    connection.close()

    missing = TARGET_ARTICLE_COUNT - already_stored
    if missing <= 0:
        return

    print("Pobieranie...")
    # Request more titles than needed, then cap the list at a fixed surplus.
    titles = get_article_list(missing * 2)[:missing + 10000]

    pending = []
    started_at = time.time()

    with ThreadPoolExecutor(max_workers=THREADS) as pool:
        tasks = [pool.submit(fetch_article, title) for title in titles]
        progress = tqdm(as_completed(tasks), total=len(tasks), desc="Pob")

        for finished in progress:
            fetched = finished.result()
            if not fetched:
                continue  # the fetch failed; nothing to store
            pending.append(fetched)
            if len(pending) < BATCH_SIZE:
                continue
            save_batch(pending)
            pending = []

    # Flush whatever is left over after the last full batch.
    if pending:
        save_batch(pending)

    elapsed = time.time() - started_at
    print(f"Zakonczono w {elapsed:.2f} seknd")

Prosty web scraper (nieużyty, bo działa za wolno)

In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import time

# Number of iterations of the scraping loop; each iteration requests one
# random article.
NUM_ARTICLES = 300000

# Module-level connection and cursor shared by the insert loop below and
# closed at the end of the script.
conn = sqlite3.connect('articles.db')
cursor = conn.cursor()

# Create the target table on first run. ``url`` is UNIQUE, so inserting an
# article that was already stored raises sqlite3.IntegrityError.
# NOTE(review): another cell of this notebook also writes an ``articles``
# table to 'articles.db' with a different column definition -- confirm the
# two are not meant to share one file.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT UNIQUE,
        title TEXT,
        text TEXT
    )
''')

def scrape_random_article():
    """Download one random article and extract its URL, title and text.

    Returns:
        ``(final_url, title, text)`` where ``final_url`` is the address the
        random-page redirect ended on and ``text`` joins the non-empty
        top-level paragraphs with newlines. Returns ``(None, None, None)``
        when the request or the parsing fails; the error is printed.
    """
    try:
        # Without a timeout a stalled connection would block the loop
        # indefinitely.
        response = requests.get(
            'https://pl.wikipedia.org/wiki/Special:Random',
            allow_redirects=True,
            timeout=10,
        )
        # Do not parse HTTP error pages as if they were articles.
        response.raise_for_status()
        final_url = response.url
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('h1').get_text()
        paragraphs = soup.select('div.mw-parser-output > p')
        text = '\n'.join(p.get_text() for p in paragraphs if p.get_text(strip=True))
        return final_url, title, text
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this article.
        print(f"Error: {e}")
        return None, None, None

# Fetch NUM_ARTICLES random pages, storing each one that has both a title and
# text. Every successful insert is committed immediately.
for _ in range(NUM_ARTICLES):
    url, title, text = scrape_random_article()
    if title and text:
        try:
            cursor.execute(
                'INSERT INTO articles (url, title, text) VALUES (?, ?, ?)',
                (url, title, text)
            )
            conn.commit()
        except sqlite3.IntegrityError:
            # The random page was already stored (``url`` is UNIQUE); skip it.
            # The handler previously had no body, which is a syntax error.
            pass
    time.sleep(1)  # throttle requests so the server does not block us

conn.close()


Łącznie użyto ~370k dokumentów (370847), a słownik zawierał ~500k słów (494618).

# Frontend
Implementacja w React


# Backend

Realizowany w Ruscie

Do komunikacji użyto Actix (API dla Rusta) działającego na localhoście.