In [17]:
import requests
import os
import polars as pl
import re
from dotenv import load_dotenv, find_dotenv

In [2]:
# Load the .env file located by find_dotenv(); override=True lets its values
# replace variables that are already set in the process environment.
load_dotenv(dotenv_path=find_dotenv(), override=True)
# Factal API token used in the Authorization header; None if FACTAL_API_KEY is unset.
factal_api_key = os.environ.get('FACTAL_API_KEY')

In [5]:
def get_id(topic_name, kind=None, category=None, timeout=30):
    """
    Look up a topic by name in the Factal API and return the ID of the first match.

    Args:
        topic_name (str): Name of the topic to search for (e.g., "West Kordofan, Sudan" or "Sudan")
        kind (str, optional): Topics are categorized predominantly into three kinds: "tag", "arc" and "location". The most common topic kind is "location"
        category (str, optional): Location categories in order of granularity: "POI", "Airport", "Suburb", "Town", "Township", "NaturalFeature", "County", "State", "Colloquial", "Country"
        timeout (float, optional): Seconds to wait for the API before giving up (default 30)

    Returns:
        The "id" of the first entry in the response's "results" list, or None when
        there are no results, the request fails, or the body is not valid JSON.
    """
    url = "https://www.factal.com/api/v2/topic/"
    # requests leaves None-valued entries out of the query string, so unset
    # filters are simply not sent.
    params = {
        "name": topic_name,
        "kind": kind,
        "category": category
    }
    headers = {
        'Authorization': f'Token {factal_api_key}'
    }

    try:
        # Without a timeout, requests can wait forever on a stalled connection.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
        return None
    except ValueError as e:
        print(f"Error parsing JSON response: {e}")
        return None

    # Tolerate a missing or null "results" field instead of raising on len(None).
    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        print(f"No results found for {topic_name}")
        return None

    topic_id = results[0]['id']
    print(f"Topic ID for {topic_name}: {topic_id}")
    return topic_id

In [19]:
# Example: resolve the country-level location topic for Sudan
topic_id = get_id("Sudan", kind="location", category="Country")

Topic ID for Sudan: 1443


## Set up the country keyword

In [73]:
# Country name that drives the notebook: it is the topic searched for, the value
# of the "country_keyword" output column and part of the output file name.
# NOTE: the identifier is misspelled ("coutnry"); other cells reference it by
# this exact name, so renaming it requires updating all of them together.
coutnry_keyword = "Sudan"

## Get news items with location code

In [74]:
def get_id(topic_name=coutnry_keyword, kind=None, category=None, timeout=30):
    """
    Look up a topic by name in the Factal API and return the ID of the first match.

    Args:
        topic_name (str): Name of the topic to search for (e.g., "West Kordofan, Sudan" or "Sudan").
            Defaults to the notebook-level country keyword.
        kind (str, optional): Topics are categorized predominantly into three kinds: "tag", "arc" and "location". The most common topic kind is "location"
        category (str, optional): Location categories in order of granularity: "POI", "Airport", "Suburb", "Town", "Township", "NaturalFeature", "County", "State", "Colloquial", "Country"
        timeout (float, optional): Seconds to wait for the API before giving up (default 30)

    Returns:
        The "id" of the first entry in the response's "results" list, or None when
        there are no results, the request fails, or the body is not valid JSON.
    """
    url = "https://www.factal.com/api/v2/topic/"
    # Query with the caller's topic_name. Previously the global keyword was sent
    # here, so any other name passed in was silently ignored while the log
    # messages still reported the requested name.
    # requests leaves None-valued entries out of the query string.
    params = {
        "name": topic_name,
        "kind": kind,
        "category": category
    }
    headers = {
        'Authorization': f'Token {factal_api_key}'
    }

    try:
        # Without a timeout, requests can wait forever on a stalled connection.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
        return None
    except ValueError as e:
        print(f"Error parsing JSON response: {e}")
        return None

    # Tolerate a missing or null "results" field instead of raising on len(None).
    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        print(f"No results found for {topic_name}")
        return None

    topic_id = results[0]['id']
    print(f"Topic ID for {topic_name}: {topic_id}")
    return topic_id

def get_items_for_topic(topic_name=coutnry_keyword, kind=None, category=None, topic_id=None, page_size=100, limit=None, start_date=None, end_date=None, timeout=30):
    """
    Retrieve items for a given topic from the Factal API, following pagination.

    Args:
        topic_name (str, optional): Name of the topic location to search for (e.g., "Sudan").
            Only used to look up the ID when topic_id is not given.
        kind (str): Topics are categorized predominantly into three kinds: "tag", "arc" and "location". The most common topic kind is "location"
        category (str): Location categories in order of granularity: "POI", "Airport", "Suburb", "Town", "Township", "NaturalFeature", "County", "State", "Colloquial", "Country".
            Forwarded to the item endpoint only when kind is also set.
        topic_id (int, optional): Topic ID if already known; takes precedence over topic_name
        page_size (int): Number of items per page (max 100)
        limit (int, optional): Maximum number of items to retrieve; a falsy value means no limit
        start_date (str, optional): ISO date format (YYYY-MM-DD) to filter items from
        end_date (str, optional): ISO date format (YYYY-MM-DD) to filter items until.
            Only applied together with start_date.
        timeout (float, optional): Seconds to wait for each API request (default 30)

    Returns:
        polars.DataFrame: One row per item. Empty when the topic cannot be resolved.
        If a request or JSON decode fails part-way, the items fetched so far are returned.
    """
    # If we don't have a topic_id but have a name, get the ID first
    if topic_id is None and topic_name is not None:
        topic_id = get_id(topic_name, kind, category)
        if topic_id is None:
            print(f"Topic ID for {topic_name} not found.")
            return pl.DataFrame()  # Return empty dataframe if topic not found
    elif topic_name is not None and topic_id is not None:
        print("Both topic_name and topic_id provided. Using topic_id.")

    if topic_id is None:
        print("Error: Either topic_name or topic_id must be provided")
        return pl.DataFrame()

    url = 'https://www.factal.com/api/v2/item/'

    # requests leaves None-valued entries out of the query string.
    params = {
        'topics': str(topic_id),
        'kind': str(kind) if kind else None,
        'category': str(category) if kind and category else None,
        'page_size': page_size
    }

    if start_date:
        params['date__gte'] = start_date
        if end_date:
            params['date__range'] = f"{start_date},{end_date}"
    elif end_date:
        # The range filter needs both bounds; say so instead of silently
        # returning unfiltered data.
        print("Warning: end_date is ignored because start_date was not provided")

    headers = {
        'Authorization': f'Token {factal_api_key}'
    }

    all_results = []
    next_url = url
    items_retrieved = 0

    try:
        while next_url:
            print(f"Fetching data from: {next_url}")
            # Without a timeout, requests can wait forever on a stalled connection.
            response = requests.get(next_url, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()

            data = response.json()
            results = data.get('results', [])
            all_results.extend(results)

            items_retrieved += len(results)
            print(f"Retrieved {len(results)} items. Total: {items_retrieved}")

            # Stop as soon as the limit is reached, trimming any overshoot
            if limit and items_retrieved >= limit:
                all_results = all_results[:limit]
                break

            next_url = data.get('next')

            # The "next" URL already carries the query string, so the
            # parameters must not be sent a second time.
            if next_url:
                params = {}
    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
    except ValueError as e:
        print(f"Error parsing JSON response: {e}")

    # Reached on success and after a handled error alike: return whatever was
    # collected (an empty list yields an empty DataFrame).
    return pl.DataFrame(all_results)

In [78]:
# Fetch at most 50 items for the country keyword, restricted to the
# start_date / end_date window below. topic_id=None forces a lookup by name.
news_items = get_items_for_topic(
    topic_name=coutnry_keyword,
    topic_id=None,
    kind="location", # None / "location" / "arc" / "tag"
    category="Country", # None / "POI" / "Country" / "State" / "Town" / "Township" / "Suburb" / "NaturalFeature" / "Colloquial" / "Airport"
    start_date="2025-03-25",
    end_date="2025-06-26", # last day not included
    limit=50
) 

Topic ID for Sudan: 1443
Fetching data from: https://www.factal.com/api/v2/item/
Retrieved 50 items. Total: 50


## Extract topics for each item

In [79]:
# Flatten a list-of-topics column into its own DataFrame
def extract_topics_to_df(df, column):
    """
    Explode the list stored in `column` into one row per (item, list entry).

    Each entry is copied to a dict and tagged with the parent row's 'id' as
    'item_id'. If the entries carry a nested 'topic' mapping, the result is
    built from those mappings, with 'item_id' as the first column. Otherwise
    the tagged entries are returned as they are.

    Args:
        df: polars DataFrame with an 'id' column and the list column to expand
        column (str): name of the list column

    Returns:
        polars DataFrame; empty if `column` is missing or no row has entries.
    """
    if column not in df.columns:
        print(f"Error: '{column}' not found in DataFrame")
        return pl.DataFrame()

    # One dict per list entry; rows whose list is empty or null contribute nothing
    tagged_entries = [
        {**dict(entry), 'item_id': record['id']}
        for record in df.iter_rows(named=True)
        for entry in (record[column] or [])
    ]
    if not tagged_entries:
        return pl.DataFrame()

    topics_df = pl.DataFrame(tagged_entries)
    if 'topic' not in topics_df.columns:
        return topics_df

    # polars has no json_normalize: rebuild a frame from the nested mappings
    # and reattach item_id (row order is unchanged, so the columns line up)
    topics_flat = pl.DataFrame(topics_df['topic'].to_list()).with_columns(topics_df['item_id'])
    remaining = [name for name in topics_flat.columns if name != 'item_id']
    return topics_flat.select(['item_id'] + remaining)

# Build the per-topic table from the "topics" column of the fetched items
topics_df = extract_topics_to_df(news_items, column="topics")
topics_df

item_id,id,resource_uri,local_url,permalink,items_resource_uri,related_topics_uri,slug,parents,active,visible,moderation_status,name,symbol,created_on,kind,category,googleid,latitude,longitude,point,bounding_box,description,wikipedia_url,wikipedia_content_date,latest_item_date,item_count,published_count,recent_count,daily_count,trend_rank_last,trend_rank_current,asset_count,subscription_count
i64,i64,str,str,str,str,str,str,list[i64],bool,bool,str,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64
52802111,326230,"""https://www.factal.com/api/v2/…","""/topic/incident-2550729/""","""https://www.factal.com/topic/i…","""/api/v2/item/?topics=326230""","""/api/v2/topic/?related_to=3262…","""incident-2550729""",[60969],true,true,"""approved""","""Sudan civil war""",,"""2018-12-19T18:34:03.627365Z""","""arc""","""Ongoing""",,,,,,"""On April 15, 2023, Sudan's mil…","""""",,"""2025-06-25T20:23:22.576406Z""",3982,3982,0,2,0,0,0,244
52802111,3226516,"""https://www.factal.com/api/v2/…","""/topic/dilling-sudan/""","""https://www.factal.com/topic/d…","""/api/v2/item/?topics=3226516""","""/api/v2/topic/?related_to=3226…","""dilling-sudan""","[1443, 1489717, … 851841]",true,true,"""needs_review""","""Dilling, Sudan""",,"""2023-10-28T03:53:46.224084Z""","""location""","""Town""","""ChIJMVkkOQ6R6BYRh8phn1yhs1U""",12.052724,29.655711,"""SRID=4326;POINT (29.655711 12.…","""SRID=4326;POLYGON ((29.6240329…","""<p><b>Dalang</b> (Arabic الدلن…","""https://en.wikipedia.org/wiki/…","""2023-10-28T03:54:39.567300Z""","""2025-06-25T20:23:22.576406Z""",10,10,0,0,0,0,1,0
52802111,1443,"""https://www.factal.com/api/v2/…","""/topic/sudan/""","""https://www.factal.com/topic/s…","""/api/v2/item/?topics=1443""","""/api/v2/topic/?related_to=1443""","""sudan""","[1489717, 247481, … 851841]",true,true,"""approved""","""Sudan""",,"""2017-08-23T04:38:32.786955Z""","""location""","""Country""","""ChIJlbFyEMQc2RURNythKkZwv9I""",12.862807,30.217636,"""SRID=4326;POINT (30.217636 12.…","""SRID=4326;POLYGON ((21.8146344…","""<p><b>Sudan</b>, officially th…","""https://en.wikipedia.org/wiki/…","""2025-06-22T20:30:16.783694Z""","""2025-06-26T04:35:56.472115Z""",5764,5764,0,2,0,0,5,154
52802111,406308,"""https://www.factal.com/api/v2/…","""/topic/south-kordofan-sudan/""","""https://www.factal.com/topic/s…","""/api/v2/item/?topics=406308""","""/api/v2/topic/?related_to=4063…","""south-kordofan-sudan""","[1443, 1489717, … 851841]",true,true,"""approved""","""South Kordofan, Sudan""",,"""2019-04-17T15:50:41.332970Z""","""location""","""State""","""ChIJwS3ocpKq5RYR89BzE3Jsdzo""",11.036544,30.895824,"""SRID=4326;POINT (30.8958242 11…","""SRID=4326;POLYGON ((27.2556731…","""<p><b>South Kordofan</b> (Arab…","""https://en.wikipedia.org/wiki/…","""2025-06-21T05:30:26.203768Z""","""2025-06-25T20:23:22.576406Z""",96,96,0,0,0,0,45,5
52802111,1489717,"""https://www.factal.com/api/v2/…","""/topic/africa/""","""https://www.factal.com/topic/a…","""/api/v2/item/?topics=1489717""","""/api/v2/topic/?related_to=1489…","""africa""",[],true,true,"""approved""","""Africa""",,"""2021-05-28T16:37:44.498420Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""",,"""2025-06-26T08:54:20.141164Z""",43615,43615,0,0,0,0,0,152
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
51427433,1489712,"""https://www.factal.com/api/v2/…","""/topic/mena-region/""","""https://www.factal.com/topic/m…","""/api/v2/item/?topics=1489712""","""/api/v2/topic/?related_to=1489…","""mena-region""",[],true,true,"""approved""","""MENA""",,"""2021-05-28T16:36:08.453938Z""","""region""","""Region""",,,,,"""SRID=4326;POLYGON ((24.696775 …","""Countries in this region:<br /…","""""",,"""2025-06-26T09:00:26.976276Z""",101184,101184,0,0,0,0,0,144
51427433,851841,"""https://www.factal.com/api/v2/…","""/topic/northern-africa/""","""https://www.factal.com/topic/n…","""/api/v2/item/?topics=851841""","""/api/v2/topic/?related_to=8518…","""northern-africa""",[],true,true,"""approved""","""Northern Africa""",,"""2020-02-18T21:25:03.586177Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""",,"""2025-06-26T05:54:36.361036Z""",20255,20255,0,0,0,0,0,90
51427433,247481,"""https://www.factal.com/api/v2/…","""/topic/emea/""","""https://www.factal.com/topic/e…","""/api/v2/item/?topics=247481""","""/api/v2/topic/?related_to=2474…","""emea""",[],true,true,"""approved""","""EMEA""",,"""2018-08-31T17:12:40.464968Z""","""region""","""Region""",,,,,,"""Countries in this region:<br /…","""""","""2021-05-26T09:41:35.862595Z""","""2025-06-26T09:59:22.536731Z""",313781,313781,10,41,0,0,0,178
51427433,77834,"""https://www.factal.com/api/v2/…","""/topic/floods/""","""https://www.factal.com/topic/f…","""/api/v2/item/?topics=77834""","""/api/v2/topic/?related_to=7783…","""floods""",[67],true,true,"""approved""","""floods""","""💦""","""2017-12-25T18:41:48.125975Z""","""tag""","""Subvertical""",,,,,,"""<p>A <b>flood</b> is an overfl…","""https://en.wikipedia.org/wiki/…","""2025-06-23T00:30:17.341339Z""","""2025-06-26T09:17:28.132950Z""",28339,28339,0,8,0,0,0,288


## Merge DFs

In [80]:
# Left join: every news item is kept and repeated once per attached topic.
# NOTE(review): columns that exist in both frames (e.g. "id") are presumably
# suffixed "_right" on the topic side by polars — confirm that the later
# select picks the intended item/topic fields.
items_merged = news_items.join(topics_df, left_on='id', right_on='item_id', how='left')

## Fill missing URLs and build the full Twitter/X URL

In [81]:
items_merged = items_merged.with_columns(
    # Rows from x.com with an empty url get one assembled from
    # source + "/status/" + tweet_id; every other row keeps its url.
    # (assumes `source` holds the account URL — TODO confirm)
    pl.when((pl.col('url_domain') == 'x.com') & (pl.col('url') == ''))
    .then(pl.col('source') + pl.lit('/status/') + pl.col('tweet_id').cast(pl.Utf8))
    .otherwise(pl.col('url'))
    .alias('url'),
    # Namespaced string identifier derived from the numeric item id
    (pl.lit("factal_") + pl.col("id").cast(pl.Utf8)).alias("item_id"),
)

## Select and rename the columns to keep

In [82]:
# Keep only the fields needed downstream, renaming item/topic fields to
# clearer names along the way.
clean_df = items_merged.select([
    'item_id',
    'url',
    pl.col('content').alias('text'),
    pl.col('source').alias('domain'),
    # Parse the timestamp string, then keep just the calendar date
    pl.col('date').str.to_datetime().dt.date(),
    'severity',
    pl.col('name').alias('topic'),
    'kind',
    'category',
    pl.col('description').alias('topic_summary'),
])

In [85]:
def fill_categories(df):
    """
    Spread the `topic` value into one column per category / kind.

    For every row, the topic name is copied into the column that matches the
    row's `category` or `kind`; all other new columns are null on that row:

        category == "Country" -> country
        category == "State"   -> state
        category == "Town"    -> town
        category == "POI"     -> location
        kind == "arc"         -> topic2 (with `word+word` code tokens removed)
        kind == "vertical"    -> theme
        kind == "tag"         -> tag

    Args:
        df: polars DataFrame with string columns `topic`, `category` and `kind`

    Returns:
        polars DataFrame: `df` plus the seven columns above, in that order.
    """
    def _topic_where(column, value):
        # Topic name where `column` equals `value`, null everywhere else
        return pl.when(pl.col(column) == value).then(pl.col("topic")).otherwise(None)

    # All expressions read only the input columns, so they can be evaluated in
    # a single pass (the original repeated the "state" step twice).
    return df.with_columns(
        _topic_where("category", "Country").alias("country"),
        _topic_where("category", "State").alias("state"),
        _topic_where("category", "Town").alias("town"),
        _topic_where("category", "POI").alias("location"),
        (
            _topic_where("kind", "arc")
            # Strip tokens such as "2QQ2+257" (presumably Plus Codes): first
            # one following " at ", then one leading the string.
            .str.replace(r" at \w+\+\w+, ", " at ")
            .str.replace(r"^\w+\+\w+\s", "")
            .alias("topic2")
        ),
        # NOTE(review): no row with kind "vertical" appears in the sample
        # output — confirm the API actually emits this kind.
        _topic_where("kind", "vertical").alias("theme"),
        _topic_where("kind", "tag").alias("tag"),
    )

# Add the per-category / per-kind columns (country, state, town, location,
# topic2, theme, tag) to the cleaned frame.
clean_df = fill_categories(clean_df)

In [86]:
def _country_from_state(state, fallback):
    """Return the text after the last comma of `state` ("South Kordofan, Sudan" -> "Sudan"), else `fallback`."""
    if isinstance(state, str) and ',' in state:
        match = re.search(r',\s*([^,]+)$', state)
        if match:
            return match.group(1).strip()
    return fallback


def group_by_factal_id(df):
    """
    Group the dataframe by item_id and combine relevant columns.

    Item-level fields (url, text, domain, date, severity) are taken from the
    first row of each group. For each label column (country, state, town,
    location, topic2, theme, tag) the first non-null value in row order is
    kept. When a state of the form "<state>, <country>" is present, the country
    is derived from it and overrides the country label. `topics` collects the
    distinct topic names in row order; `topic_summary` is the first non-null
    summary.

    Args:
        df: polars DataFrame with multiple rows per item_id

    Returns:
        polars DataFrame with one row per item_id, ordered by first appearance
        of each item_id in `df`.
    """
    if df.height == 0:
        return pl.DataFrame()

    item_fields = ("item_id", "url", "text", "domain", "date", "severity")
    label_columns = ["country", "state", "town", "location", "topic2", "theme", "tag"]

    result_rows = []

    # partition_by splits the frame in one pass (instead of filtering the whole
    # frame once per id) and, with maintain_order, yields groups in a
    # reproducible first-appearance order.
    for id_data in df.partition_by("item_id", maintain_order=True):
        first_row = id_data.row(0, named=True)
        new_row = {field: first_row[field] for field in item_fields}

        # maintain_order makes "the first value" deterministic when an item
        # carries several labels of the same type (e.g. several tags).
        for col in label_columns:
            values = id_data.get_column(col).drop_nulls().unique(maintain_order=True).to_list()
            new_row[col] = values[0] if values else None

        new_row["country"] = _country_from_state(new_row["state"], new_row["country"])

        new_row["topics"] = id_data.get_column("topic").unique(maintain_order=True).to_list()

        summaries = id_data.get_column("topic_summary").drop_nulls().to_list()
        new_row["topic_summary"] = summaries[0] if summaries else None

        result_rows.append(new_row)

    # Scan every row for the schema: a label column that is null for the first
    # rows must not be fixed to a null type before a string shows up.
    return pl.DataFrame(result_rows, infer_schema_length=None)

# Collapse the one-row-per-topic frame into one row per item_id
grouped_df = group_by_factal_id(clean_df)

In [87]:
# Tag every row with the keyword the data was fetched for
grouped_df = grouped_df.with_columns(
    pl.lit(coutnry_keyword).alias("country_keyword")
)

# Guarded so the cell can be re-run: once "topic2" has been renamed, a second
# rename would fail because the column no longer exists.
if "topic2" in grouped_df.columns:
    grouped_df = grouped_df.rename({"topic2": "topic"})

# Move country_keyword to the front, leaving the other columns in place
new_col_order = ["country_keyword"] + [col for col in grouped_df.columns if col != "country_keyword"]
grouped_df = grouped_df.select(new_col_order)

In [89]:
# Persist the final per-item table as Parquet. The file name embeds the country
# keyword (e.g. "Factal_Sudan.parquet") and the path is relative, so the file
# lands in the current working directory.
grouped_df.write_parquet(f"Factal_{coutnry_keyword}.parquet")