# Policy Atlas prototyping

- Generate and save vectors
- Visualise with BERTopic
- Allow for a medium number of clusters and add drop-down menus for filtering the chart

In [3]:
import pandas as pd
import importlib

from src import PROJECT_DIR
from src import viz_landscape
importlib.reload(viz_landscape);

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/karlis.kanders/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Load the UK aid activities table from the project's data folder
uk_aid_path = PROJECT_DIR / "data/2025_01_policy_atlas/uk_aid.csv"
uk_df = pd.read_csv(uk_aid_path)

In [5]:
# Load the sentence-transformers model that is used below to embed the activity text
from sentence_transformers import SentenceTransformer
# Name of the pretrained checkpoint passed to SentenceTransformer
MODEL = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL)


2025-01-17 19:18:57,084 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-01-17 19:18:58,235 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps


In [6]:
# List the columns of the loaded table (the `text` column is the one embedded below)
uk_df.columns

Index(['iati_identifier', 'title_narrative', 'reporting_org_ref',
       'reporting_org_narrative', 'participating_org_ref',
       'participating_org_narrative', 'recipient_country_code',
       'recipient_country_narrative', 'sector_narrative',
       'description_narrative', 'activity_status_code',
       'activity_date_iso_date', 'activity_date_type',
       'activity_date_narrative', 'min_date', 'max_date', 'min_year',
       'max_year', 'activity_status', 'text'],
      dtype='object')

In [7]:
# Embed every entry of the `text` column with the sentence-transformer model
activity_texts = uk_df["text"].tolist()
embeddings = model.encode(activity_texts)

Batches:   0%|          | 0/1188 [00:00<?, ?it/s]

In [8]:
# Keep each row's embedding next to its record, converted via `.tolist()`
embedding_lists = embeddings.tolist()
uk_df["vector"] = embedding_lists

In [10]:
# Build the landscape data from the embedded activities. The call returns the
# frame that is later renamed for plotting and a centroids frame that is later
# passed to `viz_landscape.chart_keywords`.
# NOTE(review): `min_cluster_size` / `nr_topics` are forwarded to the project
# helper — presumably clustering / topic-count settings; confirm in src/viz_landscape.
viz_df, centroids_df = viz_landscape.generate_landscape_viz(
    uk_df,
    min_cluster_size=50,
    nr_topics=20,
    verbose=True,
)

2025-01-17 19:23:06,994 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-01-17 19:23:33,240 - BERTopic - Dimensionality - Completed ✓
2025-01-17 19:23:33,243 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/t

In [26]:
# Give the plotted columns short display names, then add a `url` column that
# points at each activity's page on the IATI Datastore.
display_names = {"title_narrative": "title", "text": "description", "Name": "category"}
uk_viz_df = viz_df.rename(columns=display_names)
uk_viz_df = uk_viz_df.assign(
    url=[
        f"https://datastore.iatistandard.org/activity/{identifier}"
        for identifier in uk_viz_df["iati_identifier"]
    ]
)

In [30]:
import altair as alt

def _dropdown_options(values: pd.Series) -> list:
    """Return the entries for a dropdown: ``None`` followed by the sorted unique values.

    Missing values are dropped before sorting, because sorting a mix of strings
    and NaN raises ``TypeError``. The leading ``None`` is kept as the first
    option, exactly as before.
    """
    return [None] + sorted(values.dropna().unique().tolist())


def chart_landscape(
    viz_df: pd.DataFrame,
    width: int = 900,
    height: int = 750,
    _opacity: float = 0.5,
) -> alt.Chart:
    """Generate the interactive landscape scatter plot of aid activities.

    Each row of ``viz_df`` is drawn as a point at (``umap_x``, ``umap_y``),
    coloured by ``category``. Two dropdown menus (category and activity
    status) control which points are visible: points that do not match both
    selections are drawn with zero opacity. Each point links to its ``url``.

    Args:
        viz_df: Table to plot. The chart reads the columns ``umap_x``,
            ``umap_y``, ``title``, ``description_narrative``,
            ``recipient_country_code``, ``category``, ``min_year``,
            ``max_year``, ``activity_status`` and ``url``.
        width: Chart width passed to ``alt.Chart``.
        height: Chart height passed to ``alt.Chart``.
        _opacity: Opacity of the points that match the current selections.

    Returns:
        The Altair chart with both dropdown selections attached and
        ``.interactive()`` applied.
    """
    # Dropdown menu + selection for the topic category
    name_dropdown = alt.binding_select(
        options=_dropdown_options(viz_df["category"]),
        name="Category:",
    )
    name_selection = alt.selection_point(
        fields=["category"],
        bind=name_dropdown,
        name="SelectName",
    )

    # Dropdown menu + selection for the activity status
    status_dropdown = alt.binding_select(
        options=_dropdown_options(viz_df["activity_status"]),
        name="Activity status:",
    )
    status_selection = alt.selection_point(
        fields=["activity_status"],
        bind=status_dropdown,
        name="ActivityStatus",
    )

    return (
        alt.Chart(viz_df, width=width, height=height)
        .mark_point(size=30, opacity=_opacity)
        .encode(
            x=alt.X("umap_x:Q", axis=None),
            y=alt.Y("umap_y:Q", axis=None),
            tooltip=[
                "title",
                "description_narrative",
                "recipient_country_code",
                "category",
                "min_year",
                "max_year",
                "activity_status",
            ],
            color=alt.Color("category", legend=alt.Legend(title="Category", labelLimit=300)),
            # Hide (rather than remove) points that fail either dropdown filter
            opacity=alt.condition(
                status_selection & name_selection, alt.value(_opacity), alt.value(0.0)
            ),
            href="url",
        )
        .add_params(
            status_selection,
            name_selection,
        )
        .interactive()
    )

In [31]:
# Chart built from the centroids frame by the project helper
# (presumably the keyword labels for each cluster — confirm in src/viz_landscape)
fig_keywords = viz_landscape.chart_keywords(centroids_df)
# Interactive scatter of the activities, using the function defined in this notebook
fig = chart_landscape(uk_viz_df)

# Combine the scatter and the keyword chart into the figure that gets saved
fig_final = viz_landscape.scatter_keyword_chart(fig, fig_keywords)

In [32]:
# Write the plotted table (CSV) and the final chart (HTML) under a shared run name
save_name = 'fcdo_2025_01_17'
table_path = PROJECT_DIR / f"data/2025_01_policy_atlas/table_{save_name}.csv"
output_path = PROJECT_DIR / f'data/2025_01_policy_atlas/landscape_{save_name}.html'

uk_viz_df.to_csv(table_path, index=False)
fig_final.save(str(output_path))

In [42]:
# Generate dummy data with id and text fields, and 100 rows

import random
import string

def random_string(length):
    """Return a string of `length` characters, each picked at random from the lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    # One `random.choice` call per character, in order
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)

# 100 rows: an integer id and a random 100-character string per row
dummy_texts = [random_string(100) for _ in range(100)]
dummy_data = pd.DataFrame({"id": range(100), "text": dummy_texts})
dummy_data.to_csv("text.csv", index=False)