# Interview Analysis Notebook

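This notebook loads the provided CSV files from `data/raw` and explores them systematically: for each file it shows the first rows, descriptive statistics and null counts per column, followed by a histogram of the file's first numeric column.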

In [3]:
# === Config ===
from pathlib import Path
import sys
import os

# Resolve the project root from the current working directory: when the
# notebook is run from a folder named "notebooks" the root is that folder's
# parent; otherwise the working directory itself is used as the root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd
SRC = PROJECT_ROOT / "src"

# Add the src folder to the Python path so its modules can be imported.
# The membership check keeps this cell idempotent: re-running it must not
# pile up duplicate entries on sys.path.
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

# Data directories, all located under <project root>/data.
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_EXTERNAL = PROJECT_ROOT / "data" / "external"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

# CSV files the notebook expects to find in DATA_RAW.
FILENAMES = [
    'nh_monthly_variables.csv',
    'ifs_quarterly_variables.csv',
    'qual_survey_responses.csv',
    'variable_names.csv'
]

print("Project root:", PROJECT_ROOT)
print("Raw data dir:", DATA_RAW)
print("Files to load:", FILENAMES)

Project root: /Users/lukemaggs/Desktop/Nesta_Interview_LM
Raw data dir: /Users/lukemaggs/Desktop/Nesta_Interview_LM/data/raw
Files to load: ['nh_monthly_variables.csv', 'ifs_quarterly_variables.csv', 'qual_survey_responses.csv', 'variable_names.csv']


In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_io import read_csv, raw_path, processed_path


# Display settings: show up to 100 columns and use a 120-character width
# when pandas renders frames.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.width', 120),
):
    pd.set_option(option_name, option_value)


In [None]:
# === Load data ===
# Build `dfs`, a dict keyed by file name, holding one loaded table per CSV
# found under DATA_RAW. A missing file is reported with a warning and skipped,
# so `dfs` may contain fewer entries than FILENAMES.
dfs = {}
expected_files = [(fn, DATA_RAW / fn) for fn in FILENAMES]
for fn, path in expected_files:
    if path.exists():
        dfs[fn] = read_csv(path)
        print(f"Loaded {fn}: shape = {dfs[fn].shape}")
        continue
    print(f"WARNING: {path} not found. Place your CSVs in data/raw.")
# Last expression of the cell: show which files were actually loaded.
dfs.keys()


In [None]:
# === Quick overview ===
# For every loaded table: first rows, transposed descriptive statistics over
# all columns, then the number of missing values per column.
for fn, frame in dfs.items():
    print('\n---', fn, '---')
    first_rows = frame.head()
    summary = frame.describe(include='all').T
    display(first_rows, summary)
    print('Nulls by column:')
    null_counts = frame.isna().sum()
    display(null_counts)


In [None]:
# === Example plot ===
# One histogram per loaded table, drawn for its first numeric column with
# missing values dropped. Tables without any numeric column are skipped.
for fn, frame in dfs.items():
    numeric_columns = list(frame.select_dtypes(include='number').columns)
    if numeric_columns:
        col = numeric_columns[0]
        # Explicit figure/axes pair; the histogram is drawn onto `ax`.
        fig, ax = plt.subplots()
        frame[col].dropna().hist(ax=ax)
        ax.set_title(f"{fn}: Distribution of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        plt.show()
