## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
import os
from openai import OpenAI
import getpass
import json
import pandas as pd
import json_repair
from tqdm import tqdm

## <span style="color:#ff5f27">⚙️ Settings </span>

In [None]:
# Prompt for the key only when the environment does not already provide a
# non-empty one; an existing value is left untouched.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass('🔑 Enter your OpenAI API key: ')

In [None]:
# OpenAI client authenticated with the key stored in the environment above
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks and keep the returned project handle
project = hopsworks.login()

# Feature store handle used by the cells below
fs = project.get_feature_store()

In [None]:
# Look up version 1 of the 'documents' feature view in the feature store
feature_view = fs.get_feature_view(name='documents', version=1)

In [None]:
# Prepare the feature view for batch reads, then fetch its batch data
feature_view.init_batch_scoring()
data = feature_view.get_batch_data()

# Keep only the rows whose 'text' value is longer than 2500 characters
data_filtered = data.loc[data['text'].str.len().gt(2500)]

# Show the filtered rows as the cell output
data_filtered

## <span style="color:#ff5f27">🪄 Dataset Generation</span>

In [None]:
def generate_questions(context):
    """Ask the chat model for question/answer pairs about ``context``.

    The prompt requests three questions and their answers as a JSON object.
    The reply is parsed with ``json_repair`` and then validated, because the
    model is not guaranteed to follow the requested format.

    Args:
        context: Text the questions should be generated from.

    Returns:
        A dict with exactly three keys:
        ``'questions'`` and ``'answers'`` (lists of equal length) and
        ``'context'`` (the ``context`` argument, unchanged). Any other keys
        the model may have added are dropped so every result has the same
        shape.

    Raises:
        ValueError: If the reply cannot be parsed into a JSON object whose
            ``'questions'`` and ``'answers'`` values are both lists.
    """
    instruction = """
    The given text is the result of the text extraction from the PDF files. 
    Generate 3 meaningful questions on the text and the respective answers.
    Reply strictly in the JSON format:
    {
      "questions": ["question1", "question2", "question3"],
      "answers": ["answer1", "answer2", "answer3"]
    }

    Ensure that the lists of questions and answers are complete and properly formatted. 
    DO NOT include any additional information or characters outside the specified JSON format. 
    The response must consist only of the requested JSON structure. 
    If the generated content does not meet the specified format, please make the necessary adjustments to ensure compliance."""

    prompt = f"\nContext: {context}\nQuestion: {instruction}"

    # Single-turn request: the whole prompt is sent as one user message
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt},
        ],
    )

    # The message content may be None; treat that as an empty reply so the
    # failure surfaces through the validation below instead of the parser.
    raw_reply = completion.choices[0].message.content or ""
    parsed = json_repair.loads(raw_reply)

    # json_repair returns whatever JSON value it could recover, which is not
    # necessarily an object.
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(parsed).__name__}: {raw_reply!r}"
        )

    questions = parsed.get('questions')
    answers = parsed.get('answers')
    if not isinstance(questions, list) or not isinstance(answers, list):
        raise ValueError(
            f"Model reply must contain list-valued 'questions' and 'answers': {raw_reply!r}"
        )

    # Keep only complete pairs: lists of different lengths cannot be exploded
    # together later, and an unmatched question or answer is of no use.
    pair_count = min(len(questions), len(answers))

    return {
        'questions': questions[:pair_count],
        'answers': answers[:pair_count],
        'context': context,
    }



In [None]:
# Run generate_questions over every filtered document text, in order,
# with tqdm reporting progress
generated_questions = list(map(generate_questions, tqdm(data_filtered['text'])))

In [None]:
# One row per generated result dict; the dict keys become the columns
df = pd.DataFrame(data=generated_questions)

# Preview the first five rows
df.head(5)

In [None]:
# Build one row per question/answer pair:
#   1. explode the parallel 'questions' and 'answers' lists together,
#   2. discard the repeated index that explode leaves behind,
#   3. move the fresh 0..n-1 index into a column,
#   4. name that column 'record_id'.
df_expanded = (
    df.explode(['questions', 'answers'])
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'record_id'})
)

# Show the expanded DataFrame as the cell output
df_expanded

## <span style="color:#ff5f27;"> 🪄 CQA Feature Group Creation </span>

In [None]:
# Fetch version 1 of the 'cqa_fg' feature group, creating it if it does not
# exist yet; rows are keyed by 'record_id'
cqa_fg = fs.get_or_create_feature_group(
    name="cqa_fg",
    version=1,
    primary_key=['record_id'],
    description='Context-Question-Response Data',
)

# Write the expanded question/answer rows into the feature group
cqa_fg.insert(df_expanded)

## <span style="color:#ff5f27;"> 🪄 CQA Feature View Creation </span>

In [None]:
# Get or create the 'cqa' feature view over the columns stored in 'cqa_fg'.
# The DataFrame inserted into 'cqa_fg' holds the model's replies in the
# 'answers' column (it has no 'responses' column), so 'answers' is the
# feature selected here.
feature_view = fs.get_or_create_feature_view(
    name="cqa",
    version=1,
    query=cqa_fg.select(["context", "questions", "answers"]),
    description='Context-Question-Response pairs for model fine-tuning',
)

---