In [2]:
# 1) Imports and Environment Setup

from dotenv import load_dotenv  # For loading environment variables from a .env file
import os
import pandas as pd
import matplotlib.pyplot as plt

# Load environment variables (per the import comment above, from a .env file).
# The return value of load_dotenv() is not checked, so the message below is
# printed regardless of what the call reports.
load_dotenv()

print("[INFO] Environment variables loaded and libraries imported.")

[INFO] Environment variables loaded and libraries imported.


In [3]:
# Read the two input tables (paths are relative to the working directory):
#   questions        <- questions_with_distractors.csv
#   questions_images <- questions_with_images.csv
questions, questions_images = [
    pd.read_csv(csv_name)
    for csv_name in ("questions_with_distractors.csv", "questions_with_images.csv")
]

In [4]:
# Collect every question_id present in the image table, then flag the rows of
# `questions` whose question_id appears in that collection.
questions_images_list = questions_images['question_id'].to_list()
questions['has_image'] = questions['question_id'].isin(questions_images_list)

# Sanity check: overall size, number of flagged rows, and a short preview.
# sep="\n" puts each item on its own line, exactly as separate print calls would.
print(
    f"Total questions: {questions.shape[0]}",
    f"Questions with images: {questions['has_image'].sum()}",
    "\nSample of questions with has_image column:",
    questions.loc[:, ['question_id', 'has_image']].head(),
    sep="\n",
)


Total questions: 9769
Questions with images: 1518

Sample of questions with has_image column:
   question_id  has_image
0        28193      False
1        21093      False
2        21906      False
3        15904      False
4        16039       True


In [6]:
# total_count = row-wise sum of the five per-option count columns
option_count_cols = ['count_a', 'count_b', 'count_c', 'count_d', 'count_e']
# skipna=False keeps the same missing-value propagation as adding the columns with `+`
questions['total_count'] = questions[option_count_cols].sum(axis=1, skipna=False)

# Preview the id, the individual counts, and the new total side by side
print("\nSample of questions with total_count column:")
print(questions[['question_id', *option_count_cols, 'total_count']].head())



Sample of questions with total_count column:
   question_id  count_a  count_b  count_c  count_d  count_e  total_count
0        28193        0        2        3        5        0           10
1        21093        0        0        0        0        3            3
2        21906        2        9       12        5       31           59
3        15904        6        0        4        3        0           13
4        16039       12        9       12        5        0           38


In [7]:
# Filter out questions with fewer than MIN_TOTAL_RESPONSES total responses.
# The threshold is named once here so the filter and the log message cannot drift apart.
MIN_TOTAL_RESPONSES = 10  # inclusive lower bound on total_count

# .copy() makes the result an independent DataFrame rather than a slice tied to
# the original, so later column assignments on it do not raise
# SettingWithCopyWarning.
filtered_questions = questions[questions['total_count'] >= MIN_TOTAL_RESPONSES].copy()

# Display information about the filtering
print(f"Original number of questions: {len(questions)}")
print(f"Number of questions with at least {MIN_TOTAL_RESPONSES} responses: {len(filtered_questions)}")
print(f"Number of questions removed: {len(questions) - len(filtered_questions)}")

# Replace the original dataframe with the filtered one.
# NOTE: because `questions` is rebound here, re-running this cell without
# re-running the earlier cells reports the already-filtered size as the
# "original" count and 0 questions removed.
questions = filtered_questions

# Display the first few rows to verify
print("\nSample of filtered questions:")
print(questions[['question_id', 'total_count']].head())


Original number of questions: 9769
Number of questions with at least 10 responses: 4696
Number of questions removed: 5073

Sample of filtered questions:
   question_id  total_count
0        28193           10
2        21906           59
3        15904           13
4        16039           38
5        16141           28


In [8]:
# Write the current `questions` frame to CSV; index=False leaves the row index out of the file.
questions.to_csv(path_or_buf="questions_filtered.csv", index=False)