In [8]:
import pandas as pd

# Load the cleaned wiki text table.
df = pd.read_csv('../data/wiki_text_cleaned_v1.csv')

# Two sanity checks on the `id` column, printed one per line:
#   1. every id value occurs exactly once;
#   2. the ids appear in non-decreasing order.
# Note: sort order by itself does not establish a primary key -- uniqueness
# (check 1) is the property that matters for that.
print(df['id'].is_unique, df['id'].is_monotonic_increasing, sep='\n')

True
True


In [9]:
# Report whether any row has a null page_name ...
print(df['page_name'].isna().to_numpy().any())

# ... and show the offending rows (an empty frame when there are none).
print(df.loc[df['page_name'].isna()])

False
Empty DataFrame
Columns: [id, page_name, section_name, subsection_name, subsubsection_name, text, section_hierarchy, text_cleaned, word_count, is_bad]
Index: []


In [1]:
import asyncio
import logging
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker
from sqlalchemy.future import select
from tqdm.notebook import tqdm

from fleecekmbackend.services.dataset.fleece_qa import generate_answer, generate_answer_rating
from fleecekmbackend.db.models import Question, Answer, Rating
from fleecekmbackend.core.config import DATABASE_URL

# Database connection and session setup.
engine = create_async_engine(DATABASE_URL, echo=False)

# Session factory: calling it returns a new AsyncSession bound to `engine`.
# expire_on_commit=False keeps already-loaded attributes readable after a
# commit; with the default (True) every loaded object is expired on commit and
# the next attribute access attempts an implicit lazy load, which is not
# allowed under asyncio.
# NOTE(review): this rebinds the name `AsyncSession` from the imported class to
# the factory. The name is kept because main() passes `AsyncSession` to
# process_questions(); consider renaming both together (e.g. to
# `async_session_factory`) to stop shadowing the SQLAlchemy class.
AsyncSession = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)



async def process_questions(db):
    """Generate a zero-shot answer, and then a rating for it, for every question.

    Args:
        db: Zero-argument callable that returns a session usable as an async
            context manager (e.g. a sessionmaker bound to an async engine).

    Returns:
        None. A failure on an individual question is logged (with traceback)
        and that question is skipped; processing continues with the next one.
    """
    async with db() as session:
        # Fetch only the primary keys, as plain ints. The loop needs nothing
        # else from Question, and plain values cannot be expired by a commit
        # made further down, so reading them never triggers implicit IO.
        result = await session.execute(select(Question.id))
        question_ids = result.scalars().all()

        # Wrap the loop with tqdm for a progress bar
        for question_id in tqdm(question_ids, desc="Processing Questions"):
            try:
                # Generate zero-shot answer
                answer_id = await generate_answer(session, question_id, setting='zs')
                if answer_id:
                    # Generate rating for the answer
                    await generate_answer_rating(session, question_id, answer_id)
            except Exception as e:
                # logging.exception keeps the traceback alongside the message.
                logging.exception("Failed processing question ID %s: %s", question_id, e)
                # Reset the shared session so one failed question cannot leave
                # it in an invalid transaction state for the remaining ones.
                # NOTE(review): assumes the helpers commit their own successful
                # work, so this only discards the failed question's pending
                # changes -- confirm against generate_answer /
                # generate_answer_rating.
                await session.rollback()

async def main():
    """Entry point: run process_questions over all questions."""
    # At this point `AsyncSession` is the module-level session factory
    # (the name is rebound to a sessionmaker during setup), not the class.
    session_factory = AsyncSession
    await process_questions(session_factory)

# Top-level `await`: this cell is meant to run in a Jupyter notebook (or another
# environment that already has an event loop running and allows awaiting at the
# top level). In a plain Python script this line is a syntax error; there,
# asyncio.run(main()) would be the usual entry point.
# NOTE(review): the run covers every question in the table and is long-running;
# interrupting the kernel surfaces as CancelledError.
await main()


Processing Questions:   0%|          | 0/429562 [00:00<?, ?it/s]

CancelledError: 