In [1]:
# gdeltngrams supplies the ingestion() and multiprocess() steps called in this notebook.
import gdeltngrams as gdgrams
# Record the package version this notebook was run with.
print(gdgrams.__version__)

1.0.0


In [2]:
# Display the signature and docstring of the ingestion step before calling it.
help(gdgrams.ingestion)

Help on function ingestion in module gdeltngrams.ingestion:

ingestion(
    dates: Union[str, List[str]],
    hours: Optional[str] = None,
    output_dir='.',
    language_filter: Optional[str] = None,
    url_filter: Optional[str] = None
) -> None
    This function imports data from the GDELT Web News NGrams 3.0 Dataset, which provides near-real-time global news content.

    Parameters:
    - dates (str or List[str]): a single date (YYYYMMDD) or a list of such elements.
    - hours (str or List[str], optional): a single hour (HH) or list of hours to filter. Default is None (no hour filtering).
    - output_dir (str, optional): directory to save the GDELT JSON data. Default is "." (current directory).
    - language_filter (str, optional): language code to filter articles. Default is None (no language filtering).
    - url_filter (str, optional): substring that must appear in the source URL. Default is None (no URL filtering).
    Returns:
    - None. Writes output to given JSON and C

In [3]:
# Step 1: fetch GDELT Web News NGrams 3.0 data for one day/hour and save the
# filtered JSON files into a local folder (one file per timestamp, see log below).
gdgrams.ingestion(
    dates="20250101",               # single date, YYYYMMDD
    hours="00",                     # single hour, HH
    output_dir="ingestion_folder",  # directory where the GDELT JSON data is saved
    language_filter="en",           # language code used to filter articles
    url_filter=None,                # None = no URL filtering
)

2025-07-07 14:18:03,143 INFO Starting ingestion process...
2025-07-07 14:18:50,940 INFO Webngrams for timestamp 20250101000100 filtered and saved to ingestion_folder/20250101000100.webngrams.json
2025-07-07 14:19:25,320 INFO Webngrams for timestamp 20250101001600 filtered and saved to ingestion_folder/20250101001600.webngrams.json
2025-07-07 14:19:50,150 INFO Webngrams for timestamp 20250101003100 filtered and saved to ingestion_folder/20250101003100.webngrams.json
2025-07-07 14:19:53,083 INFO Webngrams for timestamp 20250101003200 filtered and saved to ingestion_folder/20250101003200.webngrams.json
2025-07-07 14:20:28,291 INFO Webngrams for timestamp 20250101004600 filtered and saved to ingestion_folder/20250101004600.webngrams.json
2025-07-07 14:20:29,607 INFO Webngrams for timestamp 20250101004700 filtered and saved to ingestion_folder/20250101004700.webngrams.json
2025-07-07 14:20:32,306 INFO Ingestion process completed.


In [4]:
# Display the signature and docstring of the multiprocessing step before calling it.
help(gdgrams.multiprocess)

Help on function multiprocess in module gdeltngrams.multiprocess:

multiprocess(
    input_path: str,
    output_file: str,
    language_filter: str = 'en',
    url_filter: Optional[str] = None,
    num_processes: Optional[int] = None,
    keywords: Union[str, List[str], NoneType] = None,
    text_condition: Optional[Callable[[str], bool]] = None
) -> None
    Reads one or more line-based JSON files (a single file or a folder) and processes
    articles in parallel using multiprocessing. All results are written to a single output file.

    Parameters:
        input_path (str): Path to a JSONL file or a directory containing multiple JSONL files.
        output_file (str): Path to the output file where results will be saved.
        language_filter (str, optional): Language code to filter articles. Default is "en".
        url_filter (str, optional): Substring to filter articles by URL. Default is None (no filtering).
        num_processes (int, optional): Number of processes for multip

In [5]:
# Step 2: process every JSON file in the ingestion folder in parallel and write
# all results to a single output file.
gdgrams.multiprocess(
    input_path="ingestion_folder",                  # directory of line-based JSON files
    output_file="example.gdeltnews.webngrams.csv",  # single file receiving all results
    language_filter="en",                           # language code used to filter articles
    url_filter=None,                                # None = no URL filtering
    num_processes=None,                             # left unset; this run logged 4 processes
    # NOTE(review): presumably articles must match these keywords to be kept;
    # the help output above is truncated before documenting this - confirm.
    keywords=["Trump", "der Leyen"],
    # Predicate on a text string: True only when it is longer than 300
    # characters and contains "israel" (case-insensitive).
    text_condition=lambda text: len(text) > 300 and "israel" in text.lower(),
)

2025-07-07 14:20:32,421 INFO Loading and filtering ingestion_folder/20250101000100.webngrams.json using 4 processes...
2025-07-07 14:20:53,498 INFO Processing 145/3095 articles from ingestion_folder/20250101000100.webngrams.json...
Processing articles: 100%|████████████████████| 145/145 [02:44<00:00,  1.13s/it]
2025-07-07 14:23:37,771 INFO Loading and filtering ingestion_folder/20250101001600.webngrams.json using 4 processes...
2025-07-07 14:23:58,314 INFO Processing 117/2948 articles from ingestion_folder/20250101001600.webngrams.json...
Processing articles: 100%|████████████████████| 117/117 [03:12<00:00,  1.65s/it]
2025-07-07 14:27:11,212 INFO Loading and filtering ingestion_folder/20250101003100.webngrams.json using 4 processes...
2025-07-07 14:27:30,043 INFO Processing 68/2329 articles from ingestion_folder/20250101003100.webngrams.json...
Processing articles: 100%|██████████████████████| 68/68 [03:48<00:00,  3.37s/it]
2025-07-07 14:31:19,229 INFO Loading and filtering ingestion_f