In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Locations of the raw question/answer CSV and of the cleaned copy this
# notebook writes. Both are relative paths, so they are looked up from the
# working directory the kernel runs in.
input_path, output_path = (
    Path("questionsv4.csv"),
    Path("questionsv4_cleaned.csv"),
)


In [3]:
def _normalize_header(label):
    """Return a column label with surrounding whitespace removed, lowercased."""
    return label.strip().lower()


# The two columns every later cell works with.
_wanted_columns = ("questions", "answers")

# Read every column as text so no value is coerced to a number.
df = pd.read_csv(input_path, dtype=str)
df.columns = list(map(_normalize_header, df.columns))

if set(_wanted_columns).issubset(df.columns):
    # Both columns are present by name: keep only those two, in this order.
    df = df[list(_wanted_columns)]
elif df.shape[1] >= 2:
    # Names do not match: treat the first two columns, by position, as the
    # question and answer columns.
    df = df.iloc[:, :2]
    df.columns = list(_wanted_columns)
else:
    # Fewer than two columns: continue with an empty, correctly named frame.
    df = pd.DataFrame(columns=list(_wanted_columns))


In [4]:
def _strip_if_text(value):
    """Trim surrounding whitespace from strings; return other values as-is."""
    return value.strip() if isinstance(value, str) else value


# Represent missing cells as None before cleaning the text.
df = df.replace({np.nan: None})

for _column in ("questions", "answers"):
    df[_column] = df[_column].apply(_strip_if_text)

# A row is kept only when both fields are present and non-empty after the
# whitespace trim above.
_has_question = df["questions"].notna() & (df["questions"].str.len() > 0)
_has_answer = df["answers"].notna() & (df["answers"].str.len() > 0)
mask_nonempty = _has_question & _has_answer

df = df[mask_nonempty].copy()


In [5]:
# Whole-value placeholders that mark a question or an answer as carrying no
# content. Each pattern is anchored at both ends, and values are lowercased
# before matching.
irrelevant_patterns = [
    r"^irrelevant$",
    r"^not relevant$",
    r"^n/?a$",
    r"^none$",
    r"^no data$",
]

# One alternation, so a value is flagged when it matches any of the patterns.
pattern_union = "|".join(irrelevant_patterns)


def _is_placeholder(column):
    """Return a boolean Series: True where the trimmed, lowercased value
    matches one of the placeholder patterns; missing values give False."""
    normalized = column.str.strip().str.lower()
    return normalized.str.match(pattern_union, na=False)


mask_irrelevant_q = _is_placeholder(df["questions"])
mask_irrelevant_a = _is_placeholder(df["answers"])

# Drop a row when either its question or its answer is a placeholder.
_is_irrelevant = mask_irrelevant_q | mask_irrelevant_a
df = df[~(_is_irrelevant)].copy()


In [6]:
# Collapse rows that repeat the same (question, answer) pair, then renumber
# the index from zero so it has no gaps left by the dropped rows.
_pair_columns = ["questions", "answers"]
df = df.drop_duplicates(subset=_pair_columns)
df = df.reset_index(drop=True)

# Write the cleaned table without the index column.
df.to_csv(output_path, index=False)
