# Interview Analysis Notebook

This notebook is set up to load the provided CSV files and explore them systematically.

In [1]:
# Minimal Colab bootstrap — clone once; pull thereafter
import os, subprocess, sys

# GitHub "owner/name" slug, branch to track, and checkout location on the Colab VM.
REPO, BRANCH, TARGET = "monkeymoves/nesta", "master", "/content/nesta"

# True only when the google.colab module has already been imported into this process.
IN_COLAB = "google.colab" in sys.modules

def sh(*args):
    """Run a command given as separate argv tokens (no shell involved).

    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    command = [*args]
    subprocess.run(command, check=True)

# On Colab only: make sure a checkout exists at TARGET, then work from inside it.
if IN_COLAB:
    if os.path.isdir(TARGET):
        # Checkout directory already present -> refresh it from the tracked branch.
        sh("git", "-C", TARGET, "pull", "origin", BRANCH)
    else:
        # First run on this VM -> shallow clone of the tracked branch.
        sh("git", "clone", "--depth", "1", "--branch", BRANCH, f"https://github.com/{REPO}.git", TARGET)
    os.chdir(TARGET)

# Always report the working directory so path problems are visible immediately.
print("CWD:", os.getcwd())

CWD: /Users/lukemaggs/Desktop/Nesta_Interview_LM/notebooks


In [2]:
from pathlib import Path

# Resolve the project root from the working directory: when the kernel was
# started inside notebooks/, step up one level; otherwise use the CWD as-is.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

# Data locations; the processed folder is created if it does not exist yet.
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_RAW:", DATA_RAW)

PROJECT_ROOT: /Users/lukemaggs/Desktop/Nesta_Interview_LM
DATA_RAW: /Users/lukemaggs/Desktop/Nesta_Interview_LM/data/raw


In [3]:
import pandas as pd

# CSV files to load, in this order.
FILENAMES = [
    "lfs_monthly_variables.csv",
    "lfs_quarterly_variables.csv",
    "qual_survey_responses.csv",
    "variable_names.csv",
]

# Only the two LFS files carry metadata rows (CDID/PreUnit/Unit) after the header:
# row 0 (the header) is kept and rows 1-3 are skipped. Other files are read whole.
SKIPROWS = {
    name: [1, 2, 3]
    for name in ("lfs_monthly_variables.csv", "lfs_quarterly_variables.csv")
}

def read_csv_safely(path: Path) -> pd.DataFrame:
    """Read a CSV, trying progressively more permissive text encodings.

    The header row is always kept. If the file's name is a key of the
    module-level ``SKIPROWS`` mapping, those row numbers are skipped.

    Encodings are tried in this order:

    1. ``utf-8-sig`` - plain UTF-8, with or without a byte-order mark.
    2. ``cp1252``    - Windows exports; a few byte values are undefined in
       this code page, so it can still fail.
    3. ``latin-1``   - maps every possible byte to a character, so it never
       raises a decode error and serves as the last resort.

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed table.

    Raises
    ------
    Any non-decoding error from ``pd.read_csv`` (missing file, parser error)
    is propagated unchanged.
    """
    kwargs = {}
    if path.name in SKIPROWS:
        kwargs["skiprows"] = SKIPROWS[path.name]

    for enc in ("utf-8-sig", "cp1252"):
        try:
            return pd.read_csv(path, encoding=enc, **kwargs)
        except UnicodeDecodeError:
            continue

    # latin-1 cannot fail to decode, so no further fallback is needed after it.
    return pd.read_csv(path, encoding="latin-1", **kwargs)

# Read every listed file from DATA_RAW into a dict keyed by file name,
# then report each table's shape.
dfs = {fn: read_csv_safely(DATA_RAW / fn) for fn in FILENAMES}
for fn, df in dfs.items():
    print(f"{fn:>28s}  →  {df.shape}")

   lfs_monthly_variables.csv  →  (385, 1831)
 lfs_quarterly_variables.csv  →  (121, 1831)
   qual_survey_responses.csv  →  (110, 4)
          variable_names.csv  →  (1830, 1)


In [None]:
import re

def parse_period_col(df: pd.DataFrame, monthly_to_quarter_end: bool = True) -> pd.DataFrame:
    """Return a copy of ``df`` indexed by timestamps parsed from its first column.

    The first column is read as text period labels. Recognised forms are:

    * quarter labels such as "1994 Q1" (always mapped to the quarter end);
    * month labels such as "1992 MAR" or "1992 Apr";
    * as a fallback, anything ``pd.to_datetime`` can parse.

    Rows whose label cannot be parsed (blanks, stray text, ...) are dropped,
    the label column is removed, and the result is sorted by the new index.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose first column holds the period labels. Not modified.
    monthly_to_quarter_end : bool, default True
        If True (the original behaviour), month and fallback dates are
        coerced to the end of the quarter they fall in. Note that all
        months of one quarter then share a single index value, so the
        index of a monthly table is not unique.
        If False, such dates are mapped to the end of their own month,
        which keeps monthly rows distinct.

    Returns
    -------
    pd.DataFrame
        Copy of the parsable rows, indexed by the parsed timestamps.
    """
    df = df.copy()
    time_col = df.columns[0]
    s = df[time_col].astype(str).str.strip()

    quarter_re = re.compile(r"^\s*(\d{4})\s*Q([1-4])\s*$", re.I)
    month_re = re.compile(r"^\s*(\d{4})\s+([A-Za-z]{3,})\s*$")

    def _quarter_end(year: int, quarter: int):
        # Single place for the quarter -> timestamp conversion used by every branch.
        return pd.Period(f"{year}Q{quarter}", freq="Q").to_timestamp("Q")

    def _from_date(dt):
        # Map a concrete date to the period-end timestamp selected by the caller.
        if monthly_to_quarter_end:
            return _quarter_end(dt.year, (dt.month - 1) // 3 + 1)
        # MonthEnd(0) rolls forward to the month end, or stays put if already there.
        return dt.normalize() + pd.offsets.MonthEnd(0)

    def to_timestamp(x: str):
        # Quarter like "1994 Q1"
        m = quarter_re.match(x)
        if m:
            return _quarter_end(int(m.group(1)), int(m.group(2)))
        # Month like "1992 MAR" or "1992 Apr"
        m2 = month_re.match(x)
        if m2:
            dt = pd.to_datetime(f"{m2.group(1)} {m2.group(2)}", errors="coerce", format="%Y %b")
            if pd.notna(dt):
                return _from_date(dt)
            # e.g. a full month name: not matched by %b, so let the fallback try.
        # Fallback: generic pandas parser; unparsable labels become NaT (best effort).
        try:
            dt = pd.to_datetime(x, errors="raise")
        except Exception:
            return pd.NaT
        if pd.isna(dt):
            # Labels pandas reads as "missing" parse to NaT without raising.
            return pd.NaT
        return _from_date(dt)

    idx = s.apply(to_timestamp)
    valid = idx.notna()
    out = df[valid].copy()
    out.index = idx[valid]
    out = out.drop(columns=[time_col]).sort_index()
    return out

# Re-index both LFS tables by their parsed period labels (monthly first, then quarterly).
df_lfs_m, df_lfs_q = (
    parse_period_col(dfs[name])
    for name in ("lfs_monthly_variables.csv", "lfs_quarterly_variables.csv")
)

# Sanity check: shape plus the first and last timestamp of each parsed table.
print("lfs_monthly_variables (parsed): ", df_lfs_m.shape, "| index:", df_lfs_m.index.min(), "→", df_lfs_m.index.max())
print("lfs_quarterly_variables (parsed):", df_lfs_q.shape, "| index:", df_lfs_q.index.min(), "→", df_lfs_q.index.max())

lfs_monthly_variables (parsed):  (385, 1830) | index: 1992-03-31 00:00:00 → 2024-03-31 00:00:00
lfs_quarterly_variables (parsed): (121, 1830) | index: 1994-03-31 00:00:00 → 2024-03-31 00:00:00


# Optional API Call for Thematic Analysis of Survey Responses

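This notebook needs an OpenAI API key, which you can supply in either of two ways:
if running in Colab, enter it at the prompt shown by the next cell; if running locally, set the `OPENAI_API_KEY` environment variable before starting Jupyter (for example `OPENAI_API_KEY=XYZ` in a `.env` file that your environment loads).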

**If you do not have an OpenAI key, please refer to the report for a description of the manual thematic analysis and example API call results.**

In [5]:
import os, getpass
# Prefer a key already present in the environment; if it is missing or empty,
# ask for it interactively (getpass does not echo what is typed).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    OPENAI_API_KEY = getpass.getpass("Enter OPENAI_API_KEY: ")

In [6]:
# Step 2: Set up the OpenAI client. This cell only constructs the client object;
# the one piece of network activity it may trigger is the pip install below,
# and only when the `openai` package is not importable.
try:
    from openai import OpenAI
except ModuleNotFoundError:
    # Package missing in this kernel: install it quietly with the interpreter that
    # is running the notebook (sys.executable), then retry the import.
    # NOTE(review): the install is unpinned, so the version fetched can vary between runs.
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "openai"])
    from openai import OpenAI

# OPENAI_API_KEY must already be defined by an earlier cell.
client = OpenAI(api_key=OPENAI_API_KEY)
# Printed once the constructor has returned; it does not show that the key is valid.
print("Client ready")

Client ready


In [7]:
# Step 3: Take the qualitative survey table from the already-loaded `dfs` dict
# (no file is re-read here) and show its size, column names and first rows.
df = dfs["qual_survey_responses.csv"]
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head(3)

Shape: (110, 4)
Columns: ['Participant ID', 'Employment Status', 'UK Region', 'Response']


Unnamed: 0,Participant ID,Employment Status,UK Region,Response
0,P001,Unemployed,Scotland,The government should focus on providing more ...
1,P002,Employed,North West,I believe the government should create more in...
2,P003,Self-employed,London,The government needs to address the root cause...


In [None]:
# --- 1. DEFINE ANALYSIS PARAMETERS ---
import json
import textwrap

# Configuration for the single, whole-dataset thematic analysis request.
# MODEL: model identifier passed to the chat-completions call.
MODEL = "gpt-5-mini"
# TEXT_COL: name of the survey DataFrame column holding the free-text answers.
TEXT_COL = "Response"

# System message for the request. It is sent verbatim, so any edit to this text
# changes what the model is asked to do.
SYSTEM_PROMPT = """
You are an expert Social Science Researcher performing a rigorous, inductive thematic analysis based on the principles of Braun & Clarke. Your task is to analyse the following collection of survey responses to identify the general, overarching themes present across the entire dataset.

A theme is a patterned response or meaning found across multiple responses, not just a summary of one. Your analysis must be grounded exclusively in the provided text.

Your entire output must be a single, valid JSON object that adheres to the required structure.
"""

# JSON Schema describing the reply the model is asked to produce:
#   analysis_summary - one string summarising the whole response set
#   themes           - list of objects, each with theme_label, definition,
#                      interpretation and representative_quotes (list of strings)
# The "description" entries are part of the schema payload, not code comments.
JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "analysis_summary": {
            "type": "string",
            "description": "A one-paragraph synthesis of the key findings and dominant themes from the entire set of responses."
        },
        "themes": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "theme_label": {"type": "string", "description": "A concise, 3-5 word name for the theme."},
                    "definition": {"type": "string", "description": "A one-sentence explanation of the theme's central concept."},
                    "interpretation": {"type": "string", "description": "Analysis of why this theme is significant and what it implies."},
                    "representative_quotes": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "2-3 verbatim quotes from different responses that strongly illustrate this theme."
                    }
                },
                "required": ["theme_label", "definition", "interpretation", "representative_quotes"]
            }
        }
    },
    "required": ["analysis_summary", "themes"]
}


# --- 2. PREPARE AND EXECUTE THE API CALL ---

# Keep only the non-missing answers, as text. This is exactly what gets sent, so the
# count reported below is taken from here rather than from the full DataFrame.
responses = df[TEXT_COL].dropna().astype(str)

# Combine all survey responses into a single string, one '---' line between answers.
all_responses_text = "\n---\n".join(responses)

print(f"📞 Sending {len(responses)} responses to '{MODEL}' for analysis...")

try:
    # Make the single API call using the client created in the "Step 2" cell
    resp = client.chat.completions.create(
        model=MODEL,
        response_format={"type": "json_schema", "json_schema": {"name": "holistic_thematic_analysis", "schema": JSON_SCHEMA}},
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": all_responses_text}
        ],
    )

    # The reply body is expected to be one JSON document matching JSON_SCHEMA.
    analysis_result = json.loads(resp.choices[0].message.content)

    # --- 3. NICELY PRINT THE RESULTS ---
    print("\n✅ Analysis Complete!\n")
    print("="*30)
    print("   OVERALL ANALYSIS SUMMARY")
    print("="*30)
    summary = analysis_result.get('analysis_summary', 'Not found.')
    # Wrap the summary paragraph at 80 columns for readability.
    print(textwrap.fill(summary, width=80))

    print("\n" + "="*30)
    print("      IDENTIFIED THEMES")
    print("="*30)
    for i, theme in enumerate(analysis_result.get('themes', []), 1):
        print(f"\nTHEME {i}: {theme.get('theme_label', 'No Label').upper()}")
        print("-" * 25)
        print(f"  Definition: {theme.get('definition', 'N/A')}")
        print(f"  Interpretation: {theme.get('interpretation', 'N/A')}")
        print("  Supporting Quotes:")
        for quote in theme.get('representative_quotes', []):
            print(f'    > "{quote}"')

except Exception as e:
    # Best effort: this step is optional, so any failure (request, JSON parsing,
    # printing) is reported and the notebook carries on.
    print(f"\n🔴 An error occurred: {e}")

📞 Sending 110 responses to 'gpt-5-mini' for analysis...

✅ Analysis Complete!

   OVERALL ANALYSIS SUMMARY
Respondents coalesce around two broad priorities: preparing and protecting
workers for a changing labour market, and removing structural barriers that
prevent people from getting and keeping work. The most frequent prescriptions
are expanded skills and retraining programs, targeted support for vulnerable
groups (older workers, disabled people, carers, ex-offenders), and stronger
protections for precarious and gig-economy workers. Many also call for active
job-creation through public investment (infrastructure, green projects, regional
development), incentives/regulation to shape employer behaviour, and improved
safety nets—while noting barriers such as childcare, housing, healthcare
backlogs, transport and digital exclusion. There is a recurring tension between
proposals for greater government intervention (UBI, benefit extensions, public
works) and voices emphasising personal res

In [9]:
# NOTE: inactive draft - every line of this cell is commented out, so nothing here runs.
# import json, textwrap, re
# import pandas as pd
# from openai import OpenAI

# client = OpenAI(api_key=OPENAI_API_KEY)
# MODEL = "gpt-5-mini"
# TEXT_COL = "Response"
# BATCH_SIZE = 20

# SYSTEM_PROMPT = """
# You are an expert Social Science Researcher performing a rigorous, inductive thematic analysis 
# based on the principles of Braun & Clarke. Your task is to analyse the provided survey text 
# response to identify and articulate the salient underlying themes.

# A theme is a patterned response or meaning within the data, not just a topic summary. 
# Your analysis must be grounded exclusively in the provided text.

# Your entire output must be a single, valid JSON object with:
# - analysis_summary (string)
# - themes (array of objects, each with: theme_label, definition, interpretation, representative_quotes)
# Return JSON only.
# """

In [10]:
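# NOTE: inactive draft - every line of this cell is commented out, so analyse_batch is never defined.
# def analyse_batch(items):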
#     """
#     Sends a batch of survey responses and expects back a valid JSON array.
#     """
#     chunks = [
#         f"ID={it['row_id']}\n{textwrap.shorten(it['text'], width=2000, placeholder=' …')}"
#         for it in items
#     ]
#     user_text = (
#         f"{SYSTEM_PROMPT}\n\n"
#         "Analyse each item independently and return a JSON ARRAY in the same order.\n\n"
#         "Items:\n\n" + "\n\n---\n\n".join(chunks)
#     )

#     resp = client.responses.create(
#         model=MODEL,
#         input=user_text,
#         max_output_tokens=6000
#     )
#     content = resp.output_text.strip()

#     # Try to extract valid JSON array
#     start, end = content.find("["), content.rfind("]")
#     if start != -1 and end != -1:
#         return json.loads(content[start:end+1])
#     else:
#         raise ValueError(f"Model did not return JSON array:\n{content[:300]}...")

In [11]:
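# NOTE: inactive draft - every line of this cell is commented out; it repeats the whole-dataset analysis cell above.
# import pandas as pd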
# import json
# from openai import OpenAI
# import os
# import textwrap

# # --- 1. SETUP ---
# # Use your specified model and the name of the column containing the text.
# MODEL = "gpt-5-mini" # As you specified. (If it fails, try "gpt-4o-mini")
# TEXT_COL = "Response" # The column in your DataFrame with the survey text

# # This assumes you have already loaded your data, e.g.:
# # df = pd.read_csv('your_file.csv')


# # --- 2. PREPARE THE API CALL ---

# # Initialize the OpenAI client (it will automatically look for the OPENAI_API_KEY environment variable)
# try:
#     client = OpenAI()
# except Exception as e:
#     exit(f" Error: OpenAI client could not be initialized. Is your API key set?\n{e}")

# # A prompt tailored for analysing a COLLECTION of responses.
# SYSTEM_PROMPT = """
# You are an expert Social Science Researcher performing a rigorous, inductive thematic analysis based on the principles of Braun & Clarke. Your task is to analyse the following collection of survey responses to identify the general, overarching themes present across the entire dataset.

# A theme is a patterned response or meaning found across multiple responses, not just a summary of one. Your analysis must be grounded exclusively in the provided text.

# Your entire output must be a single, valid JSON object that adheres to the required structure.
# """

# # The JSON structure the AI must return.
# JSON_SCHEMA = {
#     "type": "object",
#     "properties": {
#         "analysis_summary": {
#             "type": "string",
#             "description": "A one-paragraph synthesis of the key findings and dominant themes from the entire set of responses."
#         },
#         "themes": {
#             "type": "array",
#             "items": {
#                 "type": "object",
#                 "properties": {
#                     "theme_label": {"type": "string", "description": "A concise, 3-5 word name for the theme."},
#                     "definition": {"type": "string", "description": "A one-sentence explanation of the theme's central concept."},
#                     "interpretation": {"type": "string", "description": "Analysis of why this theme is significant and what it implies."},
#                     "representative_quotes": {
#                         "type": "array",
#                         "items": {"type": "string"},
#                         "description": "2-3 verbatim quotes from different responses that strongly illustrate this theme."
#                     }
#                 },
#                 "required": ["theme_label", "definition", "interpretation", "representative_quotes"]
#             }
#         }
#     },
#     "required": ["analysis_summary", "themes"]
# }

# # Combine all survey responses into a single string.
# # Using '---' as a separator helps the model distinguish between individual responses.
# all_responses_text = "\n---\n".join(df[TEXT_COL].dropna().astype(str))


# # --- 3. EXECUTE AND DISPLAY ---

# print(f"📞 Sending {len(df)} responses to '{MODEL}' for analysis...")

# try:
#     # Make the single API call
#     resp = client.chat.completions.create(
#         model=MODEL,
#         response_format={"type": "json_schema", "json_schema": {"name": "holistic_thematic_analysis", "schema": JSON_SCHEMA}},
#         messages=[
#             {"role": "system", "content": SYSTEM_PROMPT},
#             {"role": "user", "content": all_responses_text}
#         ],
#     )

#     # Extract and parse the JSON content
#     analysis_result = json.loads(resp.choices[0].message.content)

#     # --- Nicely print the results ---
#     print("\n Analysis Complete!\n")
#     print("="*30)
#     print("   OVERALL ANALYSIS SUMMARY")
#     print("="*30)
#     # Use textwrap to format the summary paragraph nicely
#     summary = analysis_result.get('analysis_summary', 'Not found.')
#     print(textwrap.fill(summary, width=80))

#     print("\n" + "="*30)
#     print("      IDENTIFIED THEMES")
#     print("="*30)
#     for i, theme in enumerate(analysis_result.get('themes', []), 1):
#         print(f"\nTHEME {i}: {theme.get('theme_label', 'No Label').upper()}")
#         print("-" * 25)
#         print(f"  Definition: {theme.get('definition', 'N/A')}")
#         print(f"  Interpretation: {theme.get('interpretation', 'N/A')}")
#         print("  Supporting Quotes:")
#         for quote in theme.get('representative_quotes', []):
#             print(f'    > "{quote}"')

# except Exception as e:
#     print(f"\n An error occurred: {e}")