In [102]:
import requests
import os
import polars as pl
import re
from dotenv import load_dotenv, find_dotenv

In [103]:
# Load variables from the nearest .env file. override=True lets values from
# .env replace variables that are already set in the process environment.
load_dotenv(find_dotenv(), override=True)
factal_api_key = os.getenv('FACTAL_API_KEY')

# Fail fast: without a key every later request would be sent as "Token None"
# and the notebook would only break several cells further down.
if not factal_api_key:
    raise RuntimeError(
        "FACTAL_API_KEY is not set. Add it to your .env file or environment "
        "before running the rest of this notebook."
    )

In [104]:
def get_id(country, kind=None, category=None, timeout=30):
    """
    Get the topic ID for a given location name from the Factal API.

    Only the first result returned by the API is used.

    Args:
        country (str): Name of the location to search for (e.g., "West Kordofan, Sudan" or "Sudan")
        kind (str, optional): Topics are categorized predominantly into three kinds: "tag", "arc" and "location". The most common topic kind is "location"
        category (str, optional): Location categories in order of granularity: "POI", "Airport", "Suburb", "Town", "Township", "NaturalFeature", "County", "State", "Colloquial", "Country"
        timeout (float or tuple, optional): Seconds to wait for the API before
            giving up; forwarded to ``requests.get``. Defaults to 30.

    Returns:
        int or None: The topic ID if found. None if the request fails, the
        response is not valid JSON, or no usable result is returned.
    """
    url = "https://www.factal.com/api/v2/topic/"
    # requests leaves out parameters whose value is None, so unset filters
    # are simply not sent.
    params = {
        "name": country,
        "kind": kind,
        "category": category
    }
    headers = {
        'Authorization': f'Token {factal_api_key}'
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
        return None

    # Parsed in its own try block so a malformed body is reported as a JSON
    # problem rather than as a request failure.
    try:
        data = response.json()
    except ValueError as e:
        print(f"Error parsing JSON response: {e}")
        return None

    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        print(f"No results found for {country}")
        return None

    try:
        topic_id = results[0]['id']
    except (KeyError, TypeError, IndexError):
        print(f"Unexpected response format for {country}: first result has no 'id'")
        return None

    print(f"Topic ID for {country}: {topic_id}")
    return topic_id

## Set up the country keyword

In [105]:
# Name of the Factal topic to query. It is used as the default `country`
# argument of the functions defined below, stored in the `country_keyword`
# column, and embedded in the output file name.
country = "Ethiopia"

## Get news items with location code

In [106]:
def get_id(country=country, kind=None, category=None, timeout=30):
    """
    Get the topic ID for a given location name from the Factal API.

    This redefinition replaces the `get_id` defined earlier in the notebook
    and additionally gives `country` a default. Only the first result
    returned by the API is used.

    Args:
        country (str): Name of the location to search for (e.g., "West Kordofan, Sudan" or "Sudan").
            Defaults to the value the notebook-level `country` variable had
            when this cell was executed; re-run the cell after changing it.
        kind (str, optional): Topics are categorized predominantly into three kinds: "tag", "arc" and "location". The most common topic kind is "location"
        category (str, optional): Location categories in order of granularity: "POI", "Airport", "Suburb", "Town", "Township", "NaturalFeature", "County", "State", "Colloquial", "Country"
        timeout (float or tuple, optional): Seconds to wait for the API before
            giving up; forwarded to ``requests.get``. Defaults to 30.

    Returns:
        int or None: The topic ID if found. None if the request fails, the
        response is not valid JSON, or no usable result is returned.
    """
    url = "https://www.factal.com/api/v2/topic/"
    # requests leaves out parameters whose value is None, so unset filters
    # are simply not sent.
    params = {
        "name": country,
        "kind": kind,
        "category": category
    }
    headers = {
        'Authorization': f'Token {factal_api_key}'
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
        return None

    # Parsed in its own try block so a malformed body is reported as a JSON
    # problem rather than as a request failure.
    try:
        data = response.json()
    except ValueError as e:
        print(f"Error parsing JSON response: {e}")
        return None

    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        print(f"No results found for {country}")
        return None

    try:
        topic_id = results[0]['id']
    except (KeyError, TypeError, IndexError):
        print(f"Unexpected response format for {country}: first result has no 'id'")
        return None

    print(f"Topic ID for {country}: {topic_id}")
    return topic_id

def get_items_for_topic(country=country, kind=None, category=None, topic_id=None, page_size=100, limit=None, start_date=None, end_date=None, timeout=30):
    """
    Retrieve items for a given topic from the Factal API, following pagination.

    Args:
        country (str, optional): Name of the topic location to search for (e.g., "Sudan").
            Defaults to the value the notebook-level `country` variable had
            when this cell was executed. Only used to look up the topic ID
            when `topic_id` is not given.
        kind (str): Topics are categorized predominantly into three kinds: "tag", "arc" and "location". The most common topic kind is "location"
        category (str): Location categories in order of granularity: "POI", "Airport", "Suburb", "Town", "Township", "NaturalFeature", "County", "State", "Colloquial", "Country".
            Only forwarded to the items endpoint when `kind` is also set.
        topic_id (int, optional): Topic ID if already known
        page_size (int): Number of items per page (max 100)
        limit (int, optional): Maximum number of items to retrieve
        start_date (str, optional): ISO date format (YYYY-MM-DD) to filter items from
        end_date (str, optional): ISO date format (YYYY-MM-DD) to filter items until.
            Only applied together with `start_date`; a warning is printed otherwise.
        timeout (float or tuple, optional): Seconds to wait for each API
            request before giving up; forwarded to ``requests.get``. Defaults to 30.

    Returns:
        polars.DataFrame: DataFrame containing all retrieved items. Empty if
        the topic cannot be resolved or nothing was retrieved. If a request
        fails part-way through, the items collected so far are returned.
    """

    # If we don't have a topic_id but have a name, get the ID first
    if topic_id is None and country is not None:
        topic_id = get_id(country, kind, category)
        if topic_id is None:
            print(f"Topic ID for {country} not found.")
            return pl.DataFrame()  # Return empty dataframe if topic not found
    elif country is not None and topic_id is not None:
        print("Both country and topic_id provided. Using topic_id.")

    if topic_id is None:
        print("Error: Either country or topic_id must be provided")
        return pl.DataFrame()

    # Base URL for items endpoint
    url = 'https://www.factal.com/api/v2/item/'

    # requests leaves out parameters whose value is None.
    # NOTE(review): in the recorded run of this notebook, page_size=100 returned
    # 50 items per page and the API's `next` link carried limit=50&offset=50,
    # which suggests page_size may not be honoured - confirm against the API docs.
    params = {
        'topics': str(topic_id),
        'kind': str(kind) if kind else None,
        'category': str(category) if kind and category else None,
        'page_size': page_size
    }

    # Add optional date filters
    if start_date:
        params['date__gte'] = start_date
        if end_date:
            params['date__range'] = f"{start_date},{end_date}"
    elif end_date:
        # Previously this case was dropped silently.
        print("Warning: end_date is ignored because start_date was not provided")

    headers = {
        'Authorization': f'Token {factal_api_key}'
    }

    all_results = []
    next_url = url
    items_retrieved = 0

    while next_url:
        print(f"Fetching data from: {next_url}")
        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(next_url, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error making API request: {e}")
            break  # keep whatever was collected so far

        # Parsed in its own try block so a malformed body is reported as a
        # JSON problem rather than as a request failure.
        try:
            data = response.json()
        except ValueError as e:
            print(f"Error parsing JSON response: {e}")
            break  # keep whatever was collected so far

        results = data.get('results', [])
        all_results.extend(results)

        items_retrieved += len(results)
        print(f"Retrieved {len(results)} items. Total: {items_retrieved}")

        # Check if we've reached the limit
        if limit and items_retrieved >= limit:
            all_results = all_results[:limit]
            break

        # Get the next page URL
        next_url = data.get('next')

        # The `next` URL already carries the query string, so the params
        # must not be sent a second time.
        if next_url:
            params = {}

    if not all_results:
        return pl.DataFrame()

    # Scan every row when inferring the schema: by default only the first 100
    # rows are inspected, which can fail when a field is null in all of them
    # but populated later.
    return pl.DataFrame(all_results, infer_schema_length=None)

In [107]:
# Fetch the items for the country topic within the chosen date window.
# The topic ID is looked up from `country` because topic_id is None.
news_items = get_items_for_topic(
    country=country,
    topic_id=None,
    kind="location", # None / "location" / "arc" / "tag"
    category="Country", # None / "POI" / "Country" / "State" / "Town" / "Township" / "Suburb" / "NaturalFeature" / "Colloquial" / "Airport"
    start_date="2025-01-01",
    end_date="2025-06-27", # last day not included
    limit=None # None or int, if set, will limit the number of items returned
) 

Topic ID for Ethiopia: 2269
Fetching data from: https://www.factal.com/api/v2/item/
Retrieved 50 items. Total: 50
Fetching data from: https://www.factal.com/api/v2/item/?category=Country&date__gte=2025-01-01&date__range=2025-01-01%2C2025-06-27&kind=location&limit=50&offset=50&page_size=100&topics=2269
Retrieved 10 items. Total: 60


## Extract topics for each item

In [108]:
# Extract the topics column into a separate DataFrame
def extract_topics_to_df(df, column):
    """
    Flatten a nested list-of-topics column into one row per (item, topic) pair.

    Args:
        df (polars.DataFrame): Items frame. Must contain an `id` column and
            the nested column named by `column`.
        column (str): Name of the column holding, for each item, a list of
            topic entries.

    Returns:
        polars.DataFrame: If the entries carry a nested `topic` field, a frame
        with `item_id` first followed by the fields of that `topic` struct.
        Otherwise the entries themselves plus `item_id`. Empty if `column`
        is missing or no item has any topic.
    """
    # Check if column exists
    if column not in df.columns:
        print(f"Error: '{column}' not found in DataFrame")
        return pl.DataFrame()

    # One dict per topic entry, tagged with the id of the item it belongs to
    all_topics = []
    for row in df.iter_rows(named=True):
        topics_list = row[column]
        if not topics_list:
            continue
        for topic in topics_list:
            topic_dict = dict(topic)
            topic_dict['item_id'] = row['id']
            all_topics.append(topic_dict)

    if not all_topics:
        return pl.DataFrame()

    # Scan every row when inferring the schema; the default only inspects the
    # first 100 rows, which can fail when a field is null in all of them.
    topics_df = pl.DataFrame(all_topics, infer_schema_length=None)

    if 'topic' not in topics_df.columns:
        return topics_df

    # Expand the nested `topic` struct into columns. unnest keeps the dtypes
    # already established for the struct fields instead of re-inferring them
    # from Python dicts.
    return topics_df.select(['item_id', 'topic']).unnest('topic')

# Build the topics DataFrame (one row per item/topic pair) from the nested
# `topics` column of the fetched items, then display it.
topics_df = extract_topics_to_df(news_items, "topics")
topics_df

item_id,id,resource_uri,local_url,permalink,items_resource_uri,related_topics_uri,slug,parents,active,visible,moderation_status,name,symbol,created_on,kind,category,googleid,latitude,longitude,point,bounding_box,description,wikipedia_url,wikipedia_content_date,latest_item_date,item_count,published_count,recent_count,daily_count,trend_rank_last,trend_rank_current,asset_count,subscription_count
i64,i64,str,str,str,str,str,str,list[i64],bool,bool,str,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64
52095941,5349778,"""https://www.factal.com/api/v2/…","""/topic/incident-52095941/""","""https://www.factal.com/topic/i…","""/api/v2/item/?topics=5349778""","""/api/v2/topic/?related_to=5349…","""incident-52095941""",[],false,false,"""approved""","""Emerging incident in Ethiopia""",,"""2025-06-10T22:38:54.365739Z""","""arc""","""Incident""",,,,,,"""""","""""",,"""2025-06-10T22:38:54.170012Z""",1,1,0,0,0,0,0,0
52095941,2269,"""https://www.factal.com/api/v2/…","""/topic/ethiopia/""","""https://www.factal.com/topic/e…","""/api/v2/item/?topics=2269""","""/api/v2/topic/?related_to=2269""","""ethiopia""","[1489717, 851848, 247481]",true,true,"""approved""","""Ethiopia""",,"""2017-08-23T20:26:41.041976Z""","""location""","""Country""","""ChIJK_1s3c7QNRYRWsVIU3-m9ns""",9.145,40.489673,"""SRID=4326;POINT (40.489673 9.1…","""SRID=4326;POLYGON ((32.997734 …","""<p><b>Ethiopia</b>, officially…","""https://en.wikipedia.org/wiki/…","""2025-06-22T15:30:25.517692Z""","""2025-06-10T22:38:54.170012Z""",2568,2568,0,0,0,0,3,205
52095941,1489717,"""https://www.factal.com/api/v2/…","""/topic/africa/""","""https://www.factal.com/topic/a…","""/api/v2/item/?topics=1489717""","""/api/v2/topic/?related_to=1489…","""africa""",[],true,true,"""approved""","""Africa""",,"""2021-05-28T16:37:44.498420Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""",,"""2025-06-26T15:18:54.079447Z""",43626,43626,0,0,0,0,0,152
52095941,851848,"""https://www.factal.com/api/v2/…","""/topic/eastern-africa/""","""https://www.factal.com/topic/e…","""/api/v2/item/?topics=851848""","""/api/v2/topic/?related_to=8518…","""eastern-africa""",[],true,true,"""approved""","""Eastern Africa""",,"""2020-02-18T21:27:34.518844Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""",,"""2025-06-26T15:06:16.206876Z""",19052,19052,0,0,0,0,0,107
52095941,247481,"""https://www.factal.com/api/v2/…","""/topic/emea/""","""https://www.factal.com/topic/e…","""/api/v2/item/?topics=247481""","""/api/v2/topic/?related_to=2474…","""emea""",[],true,true,"""approved""","""EMEA""",,"""2018-08-31T17:12:40.464968Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""","""2021-05-26T09:41:35.862595Z""","""2025-06-26T15:18:54.079447Z""",313829,313829,10,41,0,0,0,178
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
46322657,1489717,"""https://www.factal.com/api/v2/…","""/topic/africa/""","""https://www.factal.com/topic/a…","""/api/v2/item/?topics=1489717""","""/api/v2/topic/?related_to=1489…","""africa""",[],true,true,"""approved""","""Africa""",,"""2021-05-28T16:37:44.498420Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""",,"""2025-06-26T15:18:54.079447Z""",43626,43626,0,0,0,0,0,152
46322657,851848,"""https://www.factal.com/api/v2/…","""/topic/eastern-africa/""","""https://www.factal.com/topic/e…","""/api/v2/item/?topics=851848""","""/api/v2/topic/?related_to=8518…","""eastern-africa""",[],true,true,"""approved""","""Eastern Africa""",,"""2020-02-18T21:27:34.518844Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""",,"""2025-06-26T15:06:16.206876Z""",19052,19052,0,0,0,0,0,107
46322657,247481,"""https://www.factal.com/api/v2/…","""/topic/emea/""","""https://www.factal.com/topic/e…","""/api/v2/item/?topics=247481""","""/api/v2/topic/?related_to=2474…","""emea""",[],true,true,"""approved""","""EMEA""",,"""2018-08-31T17:12:40.464968Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""","""2021-05-26T09:41:35.862595Z""","""2025-06-26T15:18:54.079447Z""",313829,313829,10,41,0,0,0,178
46322657,4492009,"""https://www.factal.com/api/v2/…","""/topic/african-union-support-a…","""https://www.factal.com/topic/a…","""/api/v2/item/?topics=4492009""","""/api/v2/topic/?related_to=4492…","""african-union-support-and-stab…",[],true,true,"""needs_review""","""African Union Support and Stab…",,"""2024-10-09T09:08:12.779673Z""","""tag""","""Event""",,,,,,"""""","""""",,"""2025-01-03T00:36:57.862300Z""",1,1,0,0,0,0,0,0


## Merge DFs

In [109]:
# Attach every topic row to its item; the left join keeps items that have
# no topics at all.
items_merged = news_items.join(
    topics_df,
    how='left',
    left_on='id',
    right_on='item_id',
)

## Fill missing url, get full Twitter url

In [110]:
# Both columns are derived independently, so they are computed in a single pass.
items_merged = items_merged.with_columns([
    # Rebuild the link for x.com items whose url is empty:
    # <source>/status/<tweet_id>. All other rows keep their url.
    pl.when((pl.col('url_domain') == 'x.com') & (pl.col('url') == ''))
    .then(pl.col('source') + pl.lit('/status/') + pl.col('tweet_id').cast(pl.Utf8))
    .otherwise(pl.col('url'))
    .alias('url'),
    # Source-prefixed string identifier for each item.
    (pl.lit("factal_") + pl.col("id").cast(pl.Utf8)).alias("item_id"),
])

## Drop some columns

In [111]:
# Keep only the columns needed downstream, renaming where the final name differs.
clean_df = items_merged.select([
    'item_id',
    'url',
    pl.col('content').alias('text'),
    pl.col('source').alias('domain'),
    # The date arrives as a string: parse it to a datetime, then keep the date part.
    pl.col('date').str.to_datetime().dt.date(),
    'severity',
    pl.col('name').alias('topic'),
    'kind',
    'category',
    pl.col('description').alias('topic_summary'),
])

In [112]:
def fill_categories(df):
    """
    Spread the `topic` value into one column per topic category / kind.

    Each new column holds the row's `topic` when the row matches the given
    `category` or `kind` value, and null otherwise:

        category == "Country" -> country
        category == "State"   -> state
        category == "Town"    -> town
        category == "POI"     -> location
        kind == "arc"         -> topic2 (with plus-code fragments removed)
        kind == "vertical"    -> theme
        kind == "tag"         -> tag

    Args:
        df (polars.DataFrame): Frame with `topic`, `category` and `kind` columns.

    Returns:
        polars.DataFrame: `df` with the seven columns above added.
    """
    def topic_where(field, value):
        # The topic name where `field` equals `value`, null elsewhere.
        return pl.when(pl.col(field) == value).then(pl.col("topic")).otherwise(None)

    # The expressions only read topic/category/kind, so they can all be
    # evaluated together. The "state" column used to be computed twice with
    # identical logic; it is now computed once.
    return df.with_columns([
        topic_where("category", "Country").alias("country"),
        topic_where("category", "State").alias("state"),
        topic_where("category", "Town").alias("town"),
        # Names such as "2QQ2+257, Muglad, Sudan" are kept as-is here.
        topic_where("category", "POI").alias("location"),
        (
            topic_where("kind", "arc")
            # Drop a plus code that follows " at " ...
            .str.replace(r" at \w+\+\w+, ", " at ")
            # ... or one that starts the name.
            .str.replace(r"^\w+\+\w+\s", "")
            .alias("topic2")
        ),
        topic_where("kind", "vertical").alias("theme"),
        topic_where("kind", "tag").alias("tag"),
    ])

# Add the per-category / per-kind columns. Note that this overwrites
# `clean_df` with the widened frame.
clean_df = fill_categories(clean_df)

In [113]:
def group_by_factal_id(df):
    """
    Group the dataframe by item_id and combine relevant columns.

    For every item:
      * url, text, domain, date and severity are taken from its first row;
      * country, state, town, location, topic2, theme and tag each become the
        first non-null value found among its rows (None if there is none);
      * if the state contains a comma, the text after the last comma
        replaces the country (e.g. "West Kordofan, Sudan" -> "Sudan");
      * `topics` is the list of distinct topic names in order of appearance;
      * `topic_summary` is the first non-null summary.

    Items are returned in order of first appearance, so the result is
    deterministic for a given input.

    Args:
        df: polars DataFrame with multiple rows per item_id

    Returns:
        polars DataFrame with one row per item_id
    """
    def first_non_null(series):
        # First non-null value of the series, or None when there is none.
        values = series.drop_nulls().head(1).to_list()
        return values[0] if values else None

    result_rows = []

    # partition_by splits the frame in a single pass instead of filtering the
    # whole frame once per id.
    for id_data in df.partition_by("item_id", maintain_order=True):
        # Values that are the same for all rows with this ID
        first_row = id_data.row(0, named=True)

        new_row = {
            "item_id": first_row["item_id"],
            "url": first_row["url"],
            "text": first_row["text"],
            "domain": first_row["domain"],
            "date": first_row["date"],
            "severity": first_row["severity"],
        }

        # Combine categorical fields (non-null values only)
        for col in ["country", "state", "town", "location", "topic2", "theme", "tag"]:
            new_row[col] = first_non_null(id_data[col])

        # Extract country from state if possible; otherwise keep the country
        # found above.
        state_val = new_row["state"]
        if isinstance(state_val, str) and ',' in state_val:
            match = re.search(r',\s*([^,]+)$', state_val)
            if match:
                new_row["country"] = match.group(1).strip()

        # All distinct topics, in order of appearance
        new_row["topics"] = id_data["topic"].unique(maintain_order=True).to_list()

        # Topic summary (use the first non-null value).
        # NOTE(review): the sample topic rows show empty-string descriptions,
        # which count as non-null here - confirm that is intended.
        new_row["topic_summary"] = first_non_null(id_data["topic_summary"])

        result_rows.append(new_row)

    if not result_rows:
        return pl.DataFrame()

    # Scan every row when inferring the schema: by default only the first 100
    # rows are inspected, which can fail when a column such as `theme` is
    # null in all of them but populated later.
    return pl.DataFrame(result_rows, infer_schema_length=None)

# Collapse the per-topic rows of clean_df into one row per item_id
grouped_df = group_by_factal_id(clean_df)

In [114]:
# Tag every row with the search keyword, expose `topic2` under its final
# name `topic`, and move the keyword column to the front.
grouped_df = (
    grouped_df
    .with_columns(pl.lit(country).alias("country_keyword"))
    .rename({"topic2": "topic"})
    .select(["country_keyword", pl.exclude("country_keyword")])
)

In [115]:
# Save the result as a Parquet file named after the country keyword; the
# path is relative, so the file lands in the current working directory.
grouped_df.write_parquet(f"Factal_{country}.parquet")