# In [None]:
# (Notebook cell marker kept as a comment so the file parses as plain Python.)
import pandas as pd  # imported here because this cell runs before the later import block

# Location of the JSON Lines file holding the board exam questions.
board_path = r'c:\Users\Rayan\Desktop\extraction pipeline\token_exctracted_jsonl\board.jsonl'
# lines=True: every line of the file is parsed as one JSON record.
df = pd.read_json(board_path, lines=True, orient='records', encoding='utf-8')
df.head()  # preview; only displayed when evaluated as the last expression of a notebook cell


def filter_non_image_questions(df):
    """
    Filter DataFrame to include only questions without images in stem or options.

    A row is kept only when its ``image`` entry is a dict whose
    ``has_image_stem`` and ``options_has_pic`` values both equal ``False``.
    Rows with a missing flag, or whose ``image`` entry is not a dict
    (e.g. ``None``/``NaN``), are excluded. A short summary is printed to stdout.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing exam questions; must have an
        ``image`` column.

    Returns:
    pd.DataFrame: Filtered copy of ``df`` containing only non-image questions

    Raises:
    Exception: Any error raised while filtering (e.g. ``KeyError`` when the
        ``image`` column is absent) is reported on stdout and re-raised.
    """
    def _has_no_image(image_info):
        # Entries that are not dicts carry no flags at all; treat them like
        # missing keys and exclude the row instead of crashing on `.get`.
        if not isinstance(image_info, dict):
            return False
        # `== False` (rather than `not ...`) is deliberate: only a value equal
        # to False counts, so None or '' are not mistaken for an explicit flag.
        # True is the default so a missing key excludes the row.
        return (
            image_info.get('has_image_stem', True) == False
            and image_info.get('options_has_pic', True) == False
        )

    try:
        # Cast to bool so that an empty input still yields a boolean mask.
        no_image_mask = df['image'].apply(_has_no_image).astype(bool)

        # Apply filter
        filtered_df = df[no_image_mask].copy()

        # Print summary statistics
        total_questions = len(df)
        filtered_questions = len(filtered_df)
        # Guard the ratio: an empty input would otherwise divide by zero.
        if total_questions:
            percentage = (filtered_questions / total_questions) * 100
        else:
            percentage = 0.0

        print(f"Original number of questions: {total_questions}")
        print(f"Questions without images: {filtered_questions}")
        print(f"Percentage of non-image questions: {percentage:.2f}%")

        return filtered_df

    except Exception as e:
        print(f"Error occurred while filtering: {str(e)}")
        raise


# Keep only the questions of `df` that pass the no-image filter, then report
# how many rows remain.
non_image_df = filter_non_image_questions(df)
print(non_image_df.shape[0])



import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def stratified_sample_exam_questions(df, n_samples=500, random_state=42):
    """
    Perform stratified sampling on exam questions based on exam_type and exam_topic combinations.

    Each distinct ``"{exam_type}_{exam_topic}"`` pair read from the ``metadata``
    column forms one stratum. Every stratum receives a share of ``n_samples``
    proportional to its size (rounded up), and the largest allocations are then
    trimmed one at a time until the total no longer exceeds ``n_samples``.
    The input DataFrame is not modified.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing the exam questions; each
        ``metadata`` entry must provide ``exam_type`` and ``exam_topic``.
    n_samples (int): Number of samples desired in the final dataset
    random_state (int): Seed used for the per-stratum draws and the final shuffle

    Returns:
    pd.DataFrame: Stratified sample of the input DataFrame, shuffled and with a
        fresh 0..n-1 index. Contains at most ``n_samples`` rows (fewer when the
        input itself is smaller).

    Raises:
    ValueError: If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")

    # Nothing to stratify; pd.concat would raise on an empty list of pieces.
    if len(df) == 0:
        return df.copy().reset_index(drop=True)

    # Keep the stratum labels in a separate Series (aligned on df's index) so
    # the caller's DataFrame gains no temporary column.
    strata = df['metadata'].apply(lambda meta: f"{meta['exam_type']}_{meta['exam_topic']}")

    # Share of the whole dataset that each stratum represents
    stratum_counts = strata.value_counts()
    proportions = stratum_counts / len(df)

    # Round up so that small strata start with at least one sample
    # (the trimming below can still bring an allocation down to zero when
    # there are more strata than n_samples).
    samples_per_stratum = np.ceil(proportions * n_samples).astype(int)

    # Rounding up can overshoot the target; take the excess from the
    # currently largest allocation, one sample at a time.
    while samples_per_stratum.sum() > n_samples:
        largest_stratum = samples_per_stratum.idxmax()
        samples_per_stratum.loc[largest_stratum] -= 1

    # Sample from each stratum
    sampled_dfs = []
    for stratum, n in samples_per_stratum.items():
        stratum_df = df[strata == stratum]
        # If we need more samples than available in the stratum, take all available
        n = min(n, len(stratum_df))
        sampled_dfs.append(stratum_df.sample(n=n, random_state=random_state))

    # Combine the per-stratum samples, then shuffle so rows are not grouped by stratum
    final_sample = pd.concat(sampled_dfs)
    return final_sample.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Example usage:
# df = your_dataframe  # Your original DataFrame
# sampled_df = stratified_sample_exam_questions(df, n_samples=500)


# To verify the distribution:
def print_distribution(df):
    """
    Report how many rows fall under each (exam_type, exam_topic) pair.

    The pair is read from every row's ``metadata`` entry; one line per
    distinct pair is printed to stdout after a header line.
    """
    def _type_and_topic(meta):
        # Grouping key for one row's metadata entry.
        return (meta['exam_type'], meta['exam_topic'])

    pair_per_row = df['metadata'].apply(_type_and_topic)
    rows_per_pair = df.groupby(pair_per_row).size()

    print("Distribution of samples:")
    for pair, n_rows in rows_per_pair.items():
        kind, topic = pair
        print(f"Exam Type: {kind}, Topic: {topic}, Count: {n_rows}")


# Draw a stratified sample (target size 500) from the image-free questions
# and print how it is spread across exam type/topic pairs.
sampled_df = stratified_sample_exam_questions(non_image_df, n_samples=500)
print_distribution(sampled_df)


# Write the sample as JSON Lines (one record per line); force_ascii=False
# keeps non-ASCII characters as-is instead of escaping them.
sampled_df.to_json(
    'sampled_board.jsonl',
    orient='records',
    lines=True,
    force_ascii=False,
)