In [1]:
# gdeltngrams supplies the ingestion() and multiprocess() helpers called in this notebook.
import gdeltngrams as gdn
# Print the installed package version so the run is tied to a specific release.
print(gdn.__version__)

1.0.1


In [2]:
# Display the signature and docstring of gdn.ingestion before calling it.
help(gdn.ingestion)

Help on function ingestion in module gdeltngrams.ingestion:

ingestion(dates: Union[int, str, List[Union[int, str]]], hours: Union[int, str, List[Union[int, str]], NoneType] = None, output_dir: str = '.', language_filter: Optional[str] = None, url_filter: Optional[str] = None) -> None
    Import data from the GDELT Web News NGrams 3.0 Dataset.

    Args:
        dates (int, str, list of int or str): One or more dates in YYYYMMDD format.
        hours (int, str, list of int or str, optional): One or more hours in HH format. Defaults to None (all hours).
        output_dir (str, optional): Directory to save output files. Defaults to ".".
        language_filter (str, optional): ISO 639-1 two-letter language code to filter articles. Defaults to None.
        url_filter (str, optional): Substring to filter URLs. Defaults to None.

    Returns:
        None: Writes JSON files to the specified output directory.



In [3]:
# Import GDELT Web News NGrams data for date 20250101, hour 00, restricted to
# English ("en") articles. The filtered JSON files are written into
# ingestion_folder; no URL substring filter is supplied.
gdn.ingestion(
    dates="20250101",
    hours="00",
    output_dir="ingestion_folder",
    language_filter="en",
    url_filter=None,
)

2025-08-13 19:57:40,870 INFO Starting ingestion process...
2025-08-13 19:58:14,733 INFO Webngrams for timestamp 20250101000100 filtered and saved to ingestion_folder/20250101000100.webngrams.json
2025-08-13 19:58:38,989 INFO Webngrams for timestamp 20250101001600 filtered and saved to ingestion_folder/20250101001600.webngrams.json
2025-08-13 19:58:56,585 INFO Webngrams for timestamp 20250101003100 filtered and saved to ingestion_folder/20250101003100.webngrams.json
2025-08-13 19:58:57,658 INFO Webngrams for timestamp 20250101003200 filtered and saved to ingestion_folder/20250101003200.webngrams.json
2025-08-13 19:59:26,483 INFO Webngrams for timestamp 20250101004600 filtered and saved to ingestion_folder/20250101004600.webngrams.json
2025-08-13 19:59:28,149 INFO Webngrams for timestamp 20250101004700 filtered and saved to ingestion_folder/20250101004700.webngrams.json
2025-08-13 19:59:30,129 INFO Ingestion process completed.


In [4]:
# Display the signature and docstring of gdn.multiprocess before calling it.
help(gdn.multiprocess)

Help on function multiprocess in module gdeltngrams.multiprocess:

multiprocess(input_path: str, output_file: str, language_filter: str = 'en', url_filter: Optional[str] = None, num_processes: Optional[int] = None, keywords: Union[str, List[str], NoneType] = None, text_condition: Optional[Callable[[str], bool]] = None) -> None
    Process JSON files in parallel using multiprocessing.

    Args:
        input_path (str): Path to a JSONL file or directory with JSONL files.
        output_file (str): Path where processed results are saved.
        language_filter (str, optional): ISO 639-1 two-letter language code to filter articles. Defaults to "en".
        url_filter (str, optional): URL substring filter. Defaults to None.
        num_processes (int, optional): Number of processes to use. Defaults to None (all cores).
        keywords (str or list of str, optional): Keyword(s) to filter text content. Defaults to None.
        text_condition (Callable[[str], bool], optional): Function t

In [5]:
# Process the files saved in ingestion_folder in parallel and write the
# results to a single CSV file.
gdn.multiprocess(
    input_path="ingestion_folder",
    output_file="example.gdeltnews.webngrams.csv",
    language_filter="en",
    url_filter=None,
    # None leaves the number of worker processes to the library (all cores).
    num_processes=None,
    # Keyword(s) used to filter the text content.
    keywords=["Trump", "der Leyen"],
    # Extra predicate on the text: more than 300 characters long and
    # containing "israel" in any letter case.
    text_condition=lambda text: len(text) > 300 and "israel" in text.lower(),
)

2025-08-13 19:59:30,265 INFO Loading and filtering ingestion_folder/20250101000100.webngrams.json...
2025-08-13 19:59:51,709 INFO Using 4 logical cores to process 145/3095 articles from ingestion_folder/20250101000100.webngrams.json...
Processing articles: 100%|████████████████████| 145/145 [02:47<00:00,  1.16s/it]
2025-08-13 20:02:39,683 INFO Loading and filtering ingestion_folder/20250101001600.webngrams.json...
2025-08-13 20:03:00,611 INFO Using 4 logical cores to process 117/2948 articles from ingestion_folder/20250101001600.webngrams.json...
Processing articles: 100%|████████████████████| 117/117 [03:09<00:00,  1.62s/it]
2025-08-13 20:06:10,141 INFO Loading and filtering ingestion_folder/20250101003100.webngrams.json...
2025-08-13 20:06:30,356 INFO Using 4 logical cores to process 68/2329 articles from ingestion_folder/20250101003100.webngrams.json...
Processing articles: 100%|██████████████████████| 68/68 [03:27<00:00,  3.05s/it]
2025-08-13 20:09:58,223 INFO Loading and filtering