In [1]:
pip install gspread oauth2client pandas

Collecting gspread
  Using cached gspread-6.2.0-py3-none-any.whl.metadata (11 kB)
Collecting oauth2client
  Using cached oauth2client-4.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting google-auth-oauthlib>=0.4.1 (from gspread)
  Using cached google_auth_oauthlib-1.2.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting httplib2>=0.9.1 (from oauth2client)
  Using cached httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib>=0.4.1->gspread)
  Using cached requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib>=0.4.1->gspread)
  Using cached oauthlib-3.2.2-py3-none-any.whl.metadata (7.5 kB)
Using cached gspread-6.2.0-py3-none-any.whl (59 kB)
Using cached oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
Using cached google_auth_oauthlib-1.2.1-py2.py3-none-any.whl (24 kB)
Using cached httplib2-0.22.0-py3-none-any.whl (96 kB)
Using cached reque

In [2]:
import os
import pandas as pd

all_variable_names = []
country_variables = {}
folder_path = './top20'

# Loop over all relevant files
for filename in os.listdir(folder_path):
    if filename.startswith("top_20_") and filename.endswith("_features.csv"):
        file_path = os.path.join(folder_path, filename)

        # Extract the country name from the filename
        country = filename.replace("top_20_", "").replace("_features.csv", "")

        try:
            # Read the second row (row index 1), split by commas
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
                if len(lines) >= 2:
                    variable_line = lines[1].strip()
                    variables = [v.strip() for v in variable_line.split(",") if v.strip()]
                    country_variables[country] = variables
                else:
                    print(f"⚠️ Not enough lines in {filename}")
        except Exception as e:
            print(f"❌ Failed to process {filename}: {e}")

In [3]:
# Show a preview
for country, variables in list(country_variables.items())[:5]:  # show first 5
    print(f"{country}: {variables}")

Global: ['ST253Q01JA', 'ST004D01T', 'ST059Q02JA', 'ST255Q01JA', 'GRADE', 'ST349Q01JA_1', 'LANGN_156', 'REPEAT', 'ST297Q09JA', 'ST268Q04JA', 'SC211Q03JA', 'ST256Q03JA', 'ST259Q01JA', 'ST251Q06JA', 'ST349Q01JA_2', 'ST268Q01JA', 'ST230Q01JA', 'WORKPAY', 'ST251Q04JA', 'MACTIV']


In [4]:
# OLD AUTH BLOCK - replace this
# sheet = client.open(SHEET_NAME).sheet1
# data = sheet.get_all_values()

# ✅ NEW BLOCK
import pandas as pd
import requests
import io
import json

SHEET_ID = "1nPLjWPaHIK-fFRYxoHTwdl6LL3DBi_vLqNIGs5KWHOQ"
csv_url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv"

response = requests.get(csv_url)
df = pd.read_csv(io.StringIO(response.text), header=None)
data = df.values.tolist()
#df.to_csv("codebook.csv", index=False)

In [5]:
manual_variable_info = { 
    "GRADE": {
        "Variable_label": "What grade were you in when you completed this study?",
        "Variable_context": "",
        "Variable_answers": json.dumps([
             {"value": -3, "text": "7th Grade"}, 
             {"value": -2, "text": "8th Grade"}, 
             {"value": -1, "text": "9th Grade"}, 
             {"value": 0, "text": "10th Grade"}, 
             {"value": 1, "text": "11th Grade"},
             {"value": 2, "text": "12th Grade"}
        ], ensure_ascii=False)
    },
    "ST259Q01JA": {
        "Variable_label": "Now think about where you would place your family on this scale. Where would you say your family stands at this time?",
        "Variable_context": """The scale below represents how society in your country is set up.
                            At the top of the scale (value 10) are the people who are the best off.
                            They earn the most money, receive the best education, and have the
                            most respected jobs. At the bottom of the scale (value 1) are the people who are the worst
                            off. They earn the least money, receive no education, and have no jobs
                            or the least respected jobs.""",
        "Variable_answers": json.dumps([
            {"range": {"min": 1, "max": 10, "step": 1}}
        ], ensure_ascii=False)
    },
    "PA003Q05IA": {
        "Variable_label": "Discuss political or social issues with my child",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
    "PA008Q05TA": {
        "Variable_label": "Participated in local school government, e.g. parent council or school management committee ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}, 
            {"value": 3, "text": "Not supported by school"}
        ], ensure_ascii=False)
    },
    "FL170Q04JA": {
        "Variable_label": "In the past 12 months did you obtain money from Working in a family business?",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
    "FL162Q06HA": {
        "Variable_label": "How confident would you feel about doing the following things? Planning my spending with consideration of my current financial situation",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Not at all confident"}, 
            {"value": 2, "text": "Not very confident"}, 
            {"value": 3, "text": "Confident"}, 
            {"value": 4, "text": "Very confident"}
        ], ensure_ascii=False)
    },
    "FL167Q02HA": {
        "Variable_label": "How often do you discuss the following matters with your parents (or guardians or relatives)? Your savings decisions",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
    "FL170Q02JA": {
        "Variable_label": "Thinking of the last 12 months, how often did you get money from any of these sources? An allowance or pocket money, without having to do any chores ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
    "FL171Q01JA": {
        "Variable_label": "In the last 12 months, how often have you done the following things? Checked that you were given the right change when you bought something with cash ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
     "FL171Q10JA": {
        "Variable_label": "In the last 12 months, how often have you done the following things? Sent money to other people with a smartphone (i.e. mobile phone with Internet access)  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
    "FL174Q01JA": {
        "Variable_label": "Have you encountered money related tasks during math classes or activities? ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}, 
            {"value": 3, "text": "I dont know"},
            {"value": 4, "text": "I dont have this class"}
        ], ensure_ascii=False)
    },
    "FL174Q02JA": {
        "Variable_label": "Have you encountered money related tasks during social science classes or activities? ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}, 
            {"value": 3, "text": "I dont know"},
            {"value": 4, "text": "I dont have this class"}
        ], ensure_ascii=False)
    },
    "FL174Q03JA": {
        "Variable_label": "Have you encountered money related tasks during civic education/citizenship classes or activities?",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}, 
            {"value": 3, "text": "I dont know"},
            {"value": 4, "text": "I dont have this class"}
        ], ensure_ascii=False)
    },
    "FL174Q04JA": {
        "Variable_label": "Have you encountered money related tasks during economics/business classes or activities?",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}, 
            {"value": 3, "text": "I dont know"},
            {"value": 4, "text": "I dont have this class"}
        ], ensure_ascii=False)
    },
    "MATHEXC_3": {
        "Variable_label": "Are mathematics extension courses offered at school?",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 0, "text": "None"}, 
            {"value": 1, "text": "Mathematics extension courses offered without differentiation depending on the prior achievement level of the students"}, 
            {"value": 2, "text": "Mathematics extension courses offered for enrichment or remediation"}, 
            {"value": 3, "text": "Mathematics extension courses offered for enrichment and remediation"}
        ], ensure_ascii=False)
    },
    "PA003Q11JA": {
        "Variable_label": "How often do you or someone else in your home do the following things with your child? Talk to my child about the importance of <completing ISCED 3>  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
     "PA003Q12JA": {
        "Variable_label": "How often do you or someone else in your home do the following things with your child? Talk to my child about any problems he/she may have at school  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
     "PA003Q16JA": {
        "Variable_label": "How often do you or someone else in your home do the following things with your child? Talk to my child about his/her future education  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
     "PA003Q18WA": {
        "Variable_label": "How often do you or someone else in your home do the following things with your child? Help my child with his/her mathematics homework  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
     "PA003Q19WA": {
        "Variable_label": "How often do you or someone else in your home do the following things with your child? Obtain mathematics materials (e.g. applications, software, study guides etc.) for my child  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
     "PA003Q20WA": {
        "Variable_label": "How often do you or someone else in your home do the following things with your child? Discuss with my child how mathematics can be applied in everyday life  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Never or almost never"}, 
            {"value": 2, "text": "About once or twice a year"}, 
            {"value": 3, "text": "About once or twice a month"}, 
            {"value": 4, "text": "About once or twice a week"}, 
            {"value": 5, "text": "Every day or almost every day"}
        ], ensure_ascii=False)
    },
     "PA008Q02TA": {
        "Variable_label": "During <the last academic year>, have you participated in any of the following school-related activities? Discussed my child’s behaviour on the initiative of one of his/her teachers  ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}, 
            {"value": 3, "text": "Not Supported By School"}
        ], ensure_ascii=False)
    },
     "PA008Q04TA": {
        "Variable_label": "During <the last academic year>, have you participated in any of the following school-related activities? Discussed my child’s progress on the initiative of one of their teachers",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}, 
            {"value": 3, "text": "Not Supported By School"}
        ], ensure_ascii=False)
    },
     "PA189Q03JA": {
        "Variable_label": "My child enjoys solving complex problems. ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Strongly agree"}, 
            {"value": 2, "text": "Agree"}, 
            {"value": 3, "text": "Disagree"}, 
            {"value": 4, "text": "Strongly disagree"}
        ], ensure_ascii=False)
    },
     "PA189Q05JA": {
        "Variable_label": "My child enjoys artistic activities. ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Strongly agree"}, 
            {"value": 2, "text": "Agree"}, 
            {"value": 3, "text": "Disagree"}, 
            {"value": 4, "text": "Strongly disagree"}
        ], ensure_ascii=False)
    },
     "PA197Q01WA": {
        "Variable_label": "Does anybody in your family (including you) work in a <mathematics-related career>? ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}
        ], ensure_ascii=False)
    },
     "PA197Q02WA": {
        "Variable_label": "Does your child show an interest in working in a <mathematics-related career>?",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}
        ], ensure_ascii=False)
    },
     "PA197Q03WA": {
        "Variable_label": "Do you expect your child will go into a <mathematics-related career>? ",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}
        ], ensure_ascii=False)
    },
     "PA197Q04WA": {
        "Variable_label": "Has your child shown interest in studying mathematics after completing <secondary school>?",
        "Variable_context": "",
        "Variable_answers": json.dumps([
            {"value": 1, "text": "Yes"}, 
            {"value": 2, "text": "No"}
        ], ensure_ascii=False)
    },

    # Add more as needed...
}

In [6]:
# 📌 Cell 3 – Process and Generate Output
import pandas as pd
import json
import re

not_found_vars = set()

# Replacement map: {unfound_variable: replacement_variable}
variable_replacements = {
    "SCHLTYPE_1": "SCHLTYPE",
    "SCHLTYPE_2": "SCHLTYPE",
    "SCHLTYPE_3": "SCHLTYPE",
    "ST349Q01JA_1": "ST349Q01JA",
    "ST349Q01JA_2": "ST349Q01JA",
    "SC177Q01JA_1": "SC177Q01JA",
    "SC177Q01JA_2": "SC177Q01JA",
    "SC177Q01JA_3": "SC177Q01JA",
    "SC177Q03JA_1": "SC177Q03JA",
    "ST349Q01JA_4": "ST349Q01JA"
    # Add more as needed
}

# Show a preview
for country, variables in list(country_variables.items())[:150]:  # show first 5
    variable_names = variables
    stop_words = {"Valid Skip", "Not Applicable", "Invalid", "No Response", "Missing"}
    output_rows = []

    for variable in variable_names:
        header_row_index = None
        original_variable = variable  # Save original name

        # Default values
        hide = ""
        hidden_value = ""
        admin_only = ""
        variable_label = ""
        variable_context = ""
        answers_json = ""
        
        if variable not in manual_variable_info:
            # First try the variable as-is
            for i, row in enumerate(data):
                if len(row) > 0 and str(row[0]).strip() == variable:
                    header_row_index = i
                    break
        else:
            print(f"Skipping FOUND variable '{variable}' due to manual override.")
        
        # If not found, try replacement
        if header_row_index is None and variable in variable_replacements:
            variable = variable_replacements[variable]
            for i, row in enumerate(data):
                if len(row) > 0 and str(row[0]).strip() == variable:
                    header_row_index = i
                    break
        
        # Still not found? Log it
        if header_row_index is None:
            manual_info = manual_variable_info.get(variable, {})
            if manual_info:
                print(f"📝 Using manual override for '{variable}'")
            else:
                not_found_vars.add(original_variable)
            
        if header_row_index is not None:
            row = data[header_row_index]
            variable_label = str(row[1]).strip() if len(row) > 1 else ""
        
            # Check what the next row's text is
            next_row = data[header_row_index + 1] if header_row_index + 1 < len(data) else []
            first_text = str(next_row[7]).strip() if len(next_row) > 7 else ""
        
            # 👉 CASE 1: Use range if next row is stop word and range is in header
            if first_text in stop_words:
                range_text = str(row[5]).strip() if len(row) > 5 else ""
                # Attempt to split the range by the middle dash (not the negative sign)
                range_text_clean = range_text.replace("–", "-").replace("−", "-")  # handle unicode dashes
                try:
                    parts = re.split(r'(?<!^)-(?=\d)', range_text_clean)
                    if len(parts) == 2:
                        min_val, max_val = map(float, parts)
                        answers_json = json.dumps({
                            "range": {
                                "min": int(min_val),
                                "max": int(max_val),
                                "step": 1
                            }
                        }, ensure_ascii=False)
                    else:
                        raise ValueError(f"Could not split cleanly into 2 parts: '{range_text_clean}'")
                except Exception as e:
                    print(f"❌ Range parsing failed for variable '{variable}' with range text '{range_text}':", e)
                    continue
            else:
                # CASE 2: Collect answer options from G/H
                answers = []
                row_index = header_row_index + 1
        
                while row_index < len(data):
                    row = data[row_index]
                    if len(row) < 8:
                        break
        
                    value = str(row[6]).strip()
                    text = str(row[7]).strip()
        
                    if text in stop_words:
                        break
        
                    if text.lower() in {"nan", ""}:
                        text = 'None'
        
                    try:
                        answers.append({"value": int(float(value)), "text": text})
                    except ValueError:
                        pass
        
                    row_index += 1
        
                answers_json = json.dumps(answers, ensure_ascii=False)
        
        else:
             # Special handling for LANGN_ variables
            if original_variable.startswith("LANGN_"):
                print("⚠️ Patching LANG -> "+str(variable))
                hide = "Yes"
                hidden_value = "1"
                admin_only = "Yes"
                variable_label = "Language used to complete questionaire"
            else:
                # Variable is being handled fully from manual_variable_info or it is LANGN_
                variable_label = manual_info.get("Variable_label", "")
                variable_context = manual_info.get("Variable_context", "")
                answers_json = manual_info.get("Variable_answers", "")
                
        # Append to output
        output_rows.append({
            "Variable_name": variable,
            "Variable_label": variable_label,
            "Variable_context": variable_context,
            "Variable_answers": answers_json,
            "Recommendation_Threshold": "",
            "Recommended_Intervention": "",
            "Hide": hide,
            "Hidden_Value": hidden_value,
            "Admin_Only": admin_only
        })

    # Create DataFrame and write to CSV
    df_output = pd.DataFrame(output_rows)
    fl = "./top20/questions/questions_"+str(country)+'.csv'
    df_output.to_csv(fl, index=False)
    print("✅ Output saved to "+str(fl))

if not_found_vars:
    print("⚠️ Variables not found (unique list):")
    for v in sorted(not_found_vars):
        print(f" - {v}")

Skipping FOUND variable 'GRADE' due to manual override.
📝 Using manual override for 'GRADE'
⚠️ Patching LANG -> LANGN_156
Skipping FOUND variable 'ST259Q01JA' due to manual override.
📝 Using manual override for 'ST259Q01JA'
✅ Output saved to ./top20/questions/questions_Global.csv
⚠️ Variables not found (unique list):
 - LANGN_156


✅ Output saved to 'variable_metadata_output_v3.csv'
