**OBJECTIVE:** Document my thought process as I develop the function for the `/v1/country-data/` endpoint

In [1]:
import pandas as pd

# Read the Data

In [None]:
df_assessments = pd.read_excel("./data/TPI ASCOR data - 13012025/ASCOR_assessments_results.xlsx")
df_assessments['Assessment date'] = pd.to_datetime(df_assessments['Assessment date'])
df_assessments['Publication date'] = pd.to_datetime(df_assessments['Publication date'])

## Test how to filter the df:

In [None]:


selected_row = (
    (df_assessments["Country"] == country) &
    (df_assessments['Assessment date'].dt.year == assessment_year)
)

In [None]:
country = 'United Kingdom'
assessment_year = 2023

# This returns a pd.Series of Trues and Falses
selected_country = df_assessments["Country"] == country
selected_year    = df_assessments['Assessment date'].dt.year == assessment_year


df_assessments[selected_country & selected_year]

# The function I am designing 

In [6]:
def get_country_data(country: str, assessment_year: int):

    selected_row = (
        (df_assessments["Country"] == country) &
        (df_assessments['Assessment date'].dt.year == assessment_year)
    )

    # Filter the data
    data = df_assessments[selected_row]

    # Selected and filter columns
    area_columns = [col for col in df_assessments.columns if col.startswith("area")]
    data = data[area_columns]
    
    # JSON does not allow for NaN or NULL. 
    # The equivalent is just to leave an empty string instead
    data = data.fillna('')

    #Rename columns
    data['country'] = country
    data['assessment_year'] = assessment_year

    remap_area_column_names = {
        col: col.replace('area ', '')
        for col in area_columns
    }

    data = data.rename(columns=remap_area_column_names)

    # Grab just the first element (there should only be one anyway)
    # and return it as a dictionary
    return data.iloc[0].to_dict()

Test how the function behaves:

In [None]:
get_country_data('Italy', 2024)

# Testing out Pydantic Models

In [17]:
from pydantic import BaseModel

class CountryData(BaseModel):
    country: str
    assesment_year: int

In the end, we want our API to produce an **instance** of the CountryData object like this:

In [None]:
output = CountryData(country="United Kingdom", assesment_year=2024)

output

In reality, all I have (usually) is a dictionary or a list that looks like this:

In [None]:
output_dict = {"country": "United Kingdom", "assesment_year":2024}

output_dict

The ** operator allows to pass a dictionary to a class or a function so that each key becomes a parameter/argument: 

In [None]:
CountryData(**output_dict)

# Start working on the deeply hierarchical structure

Eventually, I want to serve data like this:

```json
{
    "pillars": [
    {
        "name": "EP",
        "areas": [
            {
                "name": "EP.1",
                "assessment": "Partial",
                "indicators": [
                    {
                        "name": "EP.1.a",
                        "assessment": "Yes",
                        "metrics": ""
                    },
                    ...
                    {
                        "name": "EP.2.1",
                        "assessment": "Yes",
                        "metrics": {
                            "name": "EP.2.a.1",
                            "value": "-25%"
                        }
                    }
                ]
            }
        ]
    },
    {
        "name": "CP",
        "areas": [
            {
                "name": "CP.1",
                "assessment": "Partial",
                "indicators": [
                    ...
                ]
            },
            {
                ...
            }
        ]
    },
    {
        "name": "CF",
        "areas": [
            {
                "name": "CF.1",
                "assessment": "Partial",
                "indicators": [
                    ...
                ]
            },
            {
                ...
            }
        ]
    }
    ]
}
```


## Focus on just the Metrics part

In [11]:
# This is the data model for Metrics
class Metric(BaseModel):
    name: str
    value: str

Before I actually write the code to the API, let me see how I'd have to filter the dataframe:

In [None]:
# Filter for just the metrics columns
selected_columns = [col for col in df_assessments.columns if col.startswith('metric')]
selected_columns

Filter the data point to include only those columns:

In [None]:
# Get a random data point
data = df_assessments.iloc[0]

data[selected_columns]

How would the Metric object be constructed?

In [None]:
Metric(name="metric EP.2.a.i", value="-25%")

In [None]:
data_as_dict = data[selected_columns].to_dict()

list_metrics = []
for name, value in data_as_dict.items():
    individual_metric = Metric(name=name, value=value)
    list_metrics.append(individual_metric)

list_metrics

## Test the whole new function 

In [48]:
def get_country_metrics(country: str, assessment_year: int):

    selected_row = (
        (df_assessments["Country"] == country) &
        (df_assessments['Assessment date'].dt.year == assessment_year)
    )

    # Filter the data
    data = df_assessments[selected_row]

    if data.empty:
        raise HTTPException(status_code=404, 
                            detail=f"There is no data for country: {country} and year: {assessment_year}")

    # Select just the metrics
    metric_columns = [col for col in df_assessments.columns 
                      if col.startswith('metric')]
    data = data[metric_columns]

    # JSON does not allow for NaN or NULL. 
    # The equivalent is just to leave an empty string instead
    data = data.fillna('')

    remap_area_column_names = {
        col: col.replace('metric ', '')
        for col in metric_columns
    }

    data = data.rename(columns=remap_area_column_names)

    data_as_dict = data.iloc[0].to_dict()

    list_metrics = []
    for name, value in data_as_dict.items():
        individual_metric = Metric(name=name, value=value)
        list_metrics.append(individual_metric)
    # Grab just the first element (there should only be one anyway)
    # and return it as a dictionary
    return list_metrics


In [None]:
get_country_metrics('Italy', 2024)

## How to group together metrics with their relevant indicators?

In [None]:
selected_row = (
        (df_assessments["Country"] == country) &
        (df_assessments['Assessment date'].dt.year == assessment_year)
    )

# Filter the data
data = df_assessments[selected_row]
data


In [None]:
indicator_columns = data.loc[:, data.columns.str.startswith('indicator')]
indicator_columns

In [None]:
metric_columns = data.loc[:, data.columns.str.startswith('metric')]
metric_columns

In [None]:
EP = {col: data[col] for col in data.index if col.startswith("EP")}
CP = {col: data[col] for col in data.index if col.startswith("CP")}
CF = {col: data[col] for col in data.index if col.startswith("CF")}

In [18]:
from typing import List, Optional
from pydantic import BaseModel

class CountryData(BaseModel):
    country: str
    assessment_year: int
    EP_1: str 
    EP_2: str
    EP_3: str
    CP_1: str
    CP_2: str
    CP_3: str
    CP_4: str
    CP_5: str
    CP_6: str
    CF_1: str
    CF_2: str
    CF_3: str
    CF_4: str


class Metric(BaseModel):
    name: str
    value: Optional[str]=None


class Indicator(BaseModel):
    name: str
    assessment: Optional[str]=None
    metrics: Optional[List[Metric]]=None
    source: Optional[str]=None

class Area(BaseModel):
    name: str
    assessment: Optional[str]=None
    indicators: List[Indicator] = []

class Pillar(BaseModel):
    name: str
    area: List[Area] =[]

class Metadata(BaseModel):
    metadata: str
    assessment_year: int

class ResponseData(BaseModel):
    metadata: Metadata
    pillars: List[Pillar]=[]

class ErrorResponse(BaseModel):
    message: str
    details: dict={}

In [23]:
def get_country_metrics(country: str, assessment_year: int):
    selected_row = (
        (df_assessments["Country"] == country) &
        (df_assessments['Assessment date'].dt.year == assessment_year)
    )

    # Filter the data
    data = df_assessments[selected_row]

    if data.empty:
        raise HTTPException(status_code=404, 
                            detail=f"There is no data for country: {country} and year: {assessment_year}")

    row = data.iloc[0]
    pillars = []
    for pillar_name in ["EP", "CP", "CF"]:  # Dynamically process pillar names
        areas = []
        for col_name in row.index:  # Iterate through the columns of the selected row
            if col_name.startswith(f"area {pillar_name}"):  # Match area columns (e.g., "area EP.1")
                area_name = col_name.split(" ")[1]  # Extract area name (e.g., "EP.1")
                assessment = assessment = row[col_name] if pd.notna(row[col_name]) else None  # Handle NaN values
                indicators = []

                # Match indicators related to the area
                for indicator_col in row.index:
                    if indicator_col.startswith(f"indicator {area_name}"):
                        indicator_name = indicator_col.split(" ")[1]  # Extract indicator name (e.g., "EP.1.a")
                        assessment = row[indicator_col]
                        metrics = []

                        # Match metrics related to the indicator
                        for metric_col in row.index:
                            if metric_col.startswith(f"metric {indicator_name}"):
                                metric_name = metric_col.split(" ")[1]  # Extract metric name (e.g., "EP.1.a.1")
                                value = row[metric_col]
                                metrics.append(Metric(name=metric_name, value=value))
                        
                        # Add the indicator
                        indicators.append(Indicator(name=indicator_name, assessment=assessment, metrics=metrics))
                
                # Add the area
                areas.append(Area(name=area_name, assessment=assessment, indicators=indicators))
        
        pillars.append(Pillar(name=pillar_name, area=areas))
    
    # Create metadata
    metadata = Metadata(
        metadata="Country metrics data",
        assessment_year=assessment_year
    )
    
    # Return the response
    return ResponseData(metadata=metadata, pillars=pillars)

In [None]:
get_country_metrics('Italy', 2024)

In [None]:
row = data.iloc[0]
areas = []
for pillar_name in ["EP", "CP", "CF"]:  # Dynamically process pillar names
        for col_name in row.index:  # Iterate through the columns of the selected row
            if col_name.startswith(f"area {pillar_name}"):  # Match area columns (e.g., "area EP.1")
                area_name = col_name.split(" ")[1]  # Extract area name (e.g., "EP.1")
                assessment = row[col_name]   # Handle NaN value
                areas.append((area_name, assessment))

areas

In [None]:
area_columns = data.loc[:, data.columns.str.startswith('area')]
area_columns

In [68]:
row = data.iloc[0]
pillars = []
for pillar_name in ["EP", "CP", "CF"]:  # Dynamically process pillar names
    areas = []
    for col_name in row.index:  # Iterate through the columns of the selected row
        if col_name.startswith(f"area {pillar_name}"):  # Match area columns (e.g., "area EP.1")
            area_name = col_name.split(" ")[1]  # Extract area name (e.g., "EP.1")
            assessment = assessment = row[col_name] if pd.notna(row[col_name]) else None  # Handle NaN values
            indicators = []

            # Match indicators related to the area
            for indicator_col in row.index:
                if indicator_col.startswith(f"indicator {area_name}"):
                    indicator_name = indicator_col.split(" ")[1]  # Extract indicator name (e.g., "EP.1.a")
                    assessment = row[indicator_col]
                    metrics = []

                    # Match metrics related to the indicator
                    for metric_col in row.index:
                        if metric_col.startswith(f"metric {indicator_name}"):
                            metric_name = metric_col.split(" ")[1]  # Extract metric name (e.g., "EP.1.a.1")
                            value = row[metric_col]
                            metrics.append(Metric(name=metric_name, value=value))
                        
                    # Add the indicator
                    indicators.append(Indicator(name=indicator_name, assessment=assessment, metrics=metrics))
                
            # Add the area
            areas.append(Area(name=area_name, assessment=assessment, indicators=indicators))

        
    pillars.append(Pillar(name=pillar_name, area=areas))

pillars

[Pillar(name='EP', area=[Area(name='EP.1', assessment='No', indicators=[Indicator(name='EP.1.a', assessment='Yes', metrics=[], source=None), Indicator(name='EP.1.b', assessment='No', metrics=[], source=None), Indicator(name='EP.1.c', assessment='No', metrics=[], source=None)]), Area(name='EP.2', assessment='No', indicators=[Indicator(name='EP.2.a', assessment='Yes', metrics=[Metric(name='EP.2.a.i', value="'-42%")], source=None), Indicator(name='EP.2.b', assessment='No', metrics=[Metric(name='EP.2.b.i', value='No or unsuitable disclosure')], source=None), Indicator(name='EP.2.c', assessment='No', metrics=[Metric(name='EP.2.c.i', value='37%')], source=None), Indicator(name='EP.2.d', assessment='No', metrics=[Metric(name='EP.2.d.i', value='162%')], source=None)]), Area(name='EP.3', assessment='No', indicators=[Indicator(name='EP.3.a', assessment='Yes', metrics=[Metric(name='EP.3.a.i', value='2050')], source=None), Indicator(name='EP.3.b', assessment='Yes', metrics=[], source=None), Indica

In [None]:
def get_country_metrics(country: str, assessment_year: int):
    selected_row = (
        (df_assessments["Country"] == country) &
        (df_assessments['Assessment date'].dt.year == assessment_year)
    )

    # Filter the data
    data = df_assessments[selected_row]

    if data.empty:
        raise HTTPException(status_code=404, 
                            detail=f"There is no data for country: {country} and year: {assessment_year}")

    row = data.iloc[0]
    pillars = []
    for pillar_name in ["EP", "CP", "CF"]:  # Dynamically process pillar names
        areas = []
        for col_name in row.index:  # Iterate through the columns of the selected row
            if col_name.startswith(f"area {pillar_name}"):  # Match area columns (e.g., "area EP.1")
                area_name = col_name.split(" ")[1]  # Extract area name (e.g., "EP.1")
                assessment = assessment = row[col_name] if pd.notna(row[col_name]) else None  # Handle NaN values
                indicators = []

                # Match indicators related to the area
                for indicator_col in row.index:
                    if indicator_col.startswith(f"indicator {area_name}"):
                        indicator_name = indicator_col.split(" ")[1]  # Extract indicator name (e.g., "EP.1.a")
                        assessment = row[indicator_col]
                        metrics = []

                        # Match metrics related to the indicator
                        for metric_col in row.index:
                            if metric_col.startswith(f"metric {indicator_name}"):
                                metric_name = metric_col.split(" ")[1]  # Extract metric name (e.g., "EP.1.a.1")
                                value = row[metric_col]
                                metrics.append(Metric(name=metric_name, value=value))
                        
                        # Add the indicator
                        indicators.append(Indicator(name=indicator_name, assessment=assessment, metrics=metrics))
                
                # Add the area
                areas.append(Area(name=area_name, assessment=assessment, indicators=indicators))
        
        pillars.append(Pillar(name=pillar_name, area=areas))
    
    # Create metadata
    metadata = Metadata(
        metadata="Country metrics data",
        assessment_year=assessment_year
    )
    
    # Return the response
    return ResponseData(metadata=metadata, pillars=pillars)

get_country_metrics("Italy", assessment_year: int)