# Process the OpenSanctions (OS) datasets

Goals:
- figure out how to integrate the data and what is in the datasets
- get some basic dataset statistics

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm

## Get the datasets
The dataset index is fetched from https://data.opensanctions.org/datasets/latest/index.json

In [2]:
INDEX_URL = "https://data.opensanctions.org/datasets/latest/index.json"

In [3]:
import requests
import json

In [4]:
def parse_jsonl(text: str) -> "list | None":
    """Parse JSON Lines text into a list of decoded objects.

    Each non-blank line of *text* is decoded with ``json.loads``; blank and
    whitespace-only lines are skipped.

    Args:
        text: The full JSONL payload, one JSON document per line.

    Returns:
        A list with one decoded value per non-blank line (empty when *text*
        has no content), or ``None`` if parsing fails. In the failure case a
        message is printed instead of raising.
    """
    records = []
    try:
        # Split on "\n" only: str.splitlines() would also break on characters
        # such as U+2028, which may appear unescaped inside JSON strings.
        for line in text.split("\n"):
            stripped = line.strip()
            if stripped:
                records.append(json.loads(stripped))
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None
    except Exception as e:
        # Best-effort contract: any other failure (e.g. a non-string input)
        # is reported and signalled with None rather than propagated.
        print(f"An error occurred: {e}")
        return None
    return records
        

def get_json_from_url(url: str, timeout: float = 60.0) -> "dict | list | None":
    """Download *url* and decode its body as JSON, falling back to JSON Lines.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.

    Returns:
        The decoded JSON document when the body is a single JSON value
        (typically a dict or list), otherwise whatever ``parse_jsonl`` returns
        for the response text. ``None`` is returned, after printing a message,
        when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return None

        try:
            return response.json()
        except ValueError:
            # Every JSON decode error that Response.json() can raise (stdlib
            # json, simplejson, or requests' own wrapper) is a ValueError.
            # Catching only json.JSONDecodeError can miss the simplejson
            # variants and skip the JSONL fallback.
            return parse_jsonl(response.text)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

In [5]:
index_json = get_json_from_url(INDEX_URL)

In [6]:
def clean_json(json_dict: dict) -> dict:
    """Return a copy of *json_dict* reduced to the fields kept for analysis.

    Only ``id``, ``caption``, ``schema`` and ``properties`` are retained, in
    that order. A ``KeyError`` is raised if any of them is missing.
    """
    kept_fields = ('id', 'caption', 'schema', 'properties')
    return {field: json_dict[field] for field in kept_fields}

In [None]:
TARGET_PATH = "../data/sources/"
FTM_MIME_TYPE = "application/json+ftm"

# Download every dataset listed in the index, keep only its Person entities
# and write them to one JSONL file per dataset under TARGET_PATH.
# NOTE: `pdata` is rebound on every iteration, so after the loop it only holds
# the Person records of the most recent dataset that reached the filter step.
for dataset in tqdm(index_json["datasets"]):
    data_url = "Undefined"
    try:
        # Not every index entry publishes an FtM export; skip those explicitly
        # instead of failing with "list index out of range".
        ftm_urls = [
            res["url"]
            for res in dataset.get("resources", [])
            if res.get("mime_type") == FTM_MIME_TYPE
        ]
        if not ftm_urls:
            print(f"Skipped dataset {dataset['title']}: no {FTM_MIME_TYPE} resource.")
            continue
        data_url = ftm_urls[0]

        json_data = get_json_from_url(data_url)
        if json_data is None:
            # get_json_from_url has already printed the underlying error.
            print(f"Failed to get dataset {dataset['title']} from {data_url}: no data returned")
            continue

        # reduce to only persons and clean
        pdata = [clean_json(d) for d in json_data if d.get("schema") == "Person"]
        filename = TARGET_PATH + f"{dataset['title'].replace(' ', '_')}.jsonl"
        with open(filename, 'w', encoding="utf-8") as jsonl_file:
            jsonl_file.writelines(json.dumps(d) + '\n' for d in pdata)
        print(f"Wrote data to file {filename}.")
    except Exception as e:
        # Per-dataset boundary: report the failure and carry on with the rest.
        print(f"Failed to get dataset {dataset['title']} from {data_url} {e}")

  0%|          | 0/171 [00:00<?, ?it/s]

Wrote data to file ../data/sources/US_Health_and_Human_Sciences_Inspector_General_Exclusions.jsonl.
Wrote data to file ../data/sources/Colombian_Administrative_Department_of_Public_Service_PEP_Declarations.jsonl.
Wrote data to file ../data/sources/Indonesia_2018_Regional_Head_Election_Results.jsonl.
Failed to get dataset Custom research data from Undefined list index out of range
Wrote data to file ../data/sources/French_Senators.jsonl.
Wrote data to file ../data/sources/DPRK_Reports.jsonl.


In [1]:
# NOTE(review): `data` is never assigned at notebook top level in the cells
# shown (it is only a local variable inside parse_jsonl), so this raises the
# NameError recorded below.
len(data)

NameError: name 'data' is not defined

In [2]:
# Number of Person records currently held in `pdata`. `pdata` is only assigned
# inside the download loop, so that cell must have run in the current kernel
# session; otherwise this raises the NameError recorded below.
len(pdata)

NameError: name 'pdata' is not defined

In [12]:
# Inspect the first cleaned Person record in `pdata`.
pdata[0]

{'id': 'Q1000053',
 'caption': 'Vasili Nebenzja',
 'schema': 'Person',
 'properties': {'gender': ['male'],
  'name': ['Vasilij Nebenzia',
   'Васіль Аляксеевіч Нябензя',
   'Василий Алексеевич Небензя',
   'Vassili Nebenzia',
   'וסילי נבנזיה',
   'Vasily Nebenzya',
   'Vassili Nebenzja',
   'Василий Небензя',
   'Vasili Nebenzia',
   'Vasilij Alekseevič Nebenzja',
   'Wassili Alexejewitsch Nebensja',
   '瓦西里·涅边贾',
   'فاسيلي نيبينزيا',
   'Vasilij Alexejevič Něbenzja',
   'واسیلی نبنزیا',
   'ワシーリー・ネベンジャ',
   'Vasili Nebenzja',
   'Небензя Василь Олексійович'],
  'birthPlace': ['Volgograd'],
  'topics': ['role.pep', 'role.diplo'],
  'alias': ['Василий Алексеевич Небензя',
   'ワシリー・ネベンジャ',
   'Vasily Alekseevich Nebenzya',
   'Vasilij Něbenzja',
   'Василий Небензя',
   'فاسيلي ألكسيفيتش نيبينزيا',
   'Vasili Alekséyevich Nebenzia',
   'Vasilij Aleksejevitj Nebenzia',
   'Vasilij Nebenzja',
   'Небензя, Василий Алексеевич',
   'ヴァシリー・ネベンジャ'],
  'fatherName': ['Alexeyevich'],
  'wikidat

In [13]:
# Inspect the cleaned Person record at index 186125 of `pdata`.
pdata[186125]

{'id': 'Q7045717',
 'caption': 'Noble Ellington',
 'schema': 'Person',
 'properties': {'education': ['Louisiana Tech University'],
  'lastName': ['Ellington'],
  'gender': ['male'],
  'nationality': ['us'],
  'modifiedAt': ['2023-12-06'],
  'position': ['member of the Louisiana House of Representatives',
   'member of the State Senate of Louisiana'],
  'birthDate': ['1942-05-25'],
  'notes': ['American politician'],
  'wikidataId': ['Q7045717'],
  'topics': ['role.pep'],
  'alias': ['Noble Ellington'],
  'name': ['Noble Ellington'],
  'firstName': ['Noble']}}

## Basic statistics

In [2]:
from collections import Counter

# genders
# Use the first listed gender of each person; a missing or empty list is
# counted as "unknown".
gender_data = [(p["properties"].get("gender") or ["unknown"])[0] for p in pdata]
# Count the occurrences in a single pass (list.count per distinct value would
# rescan the whole list each time). Keys appear in first-seen order.
gender_counts = dict(Counter(gender_data))

# Extract data for plotting
genders = list(gender_counts.keys())
counts = list(gender_counts.values())

# Create a bar plot
plt.bar(genders, counts, color='skyblue')

plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Gender Distribution')

plt.show()

NameError: name 'pdata' is not defined

In [None]:
from collections import Counter

# nationality
# Use the first listed nationality of each person; a missing or empty list is
# counted as "unknown".
nationality_data = [(p["properties"].get("nationality") or ["unknown"])[0] for p in pdata]
# Count the occurrences in a single pass (list.count per distinct value would
# rescan the whole list each time). Keys appear in first-seen order.
nationality_counts = dict(Counter(nationality_data))

# Extract data for plotting
nationalities = list(nationality_counts.keys())
counts = list(nationality_counts.values())

In [None]:
# Bar chart of person counts per nationality, on a wide canvas so that every
# nationality gets its own bar.
plt.figure(figsize=(28, 8))
plt.bar(x=nationalities, height=counts, color='skyblue')

# Title and axis annotations; tick labels are rotated so they do not overlap.
plt.title('Nationality Distribution')
plt.xticks(rotation=90)
plt.ylabel('Count')
plt.xlabel('Nationality')

plt.show()

In [None]:
# filter unknown nationality
# Rebuild both lists from `nationality_counts` instead of popping in place.
# This keeps the cell re-runnable, works when there is no "unknown" entry, and
# does not depend on `counts`, which the gender cell also assigns.
nationalities = [n for n in nationality_counts if n != "unknown"]
counts = [nationality_counts[n] for n in nationalities]

plt.figure(figsize=(28, 8))
plt.bar(nationalities, counts, color='skyblue')

plt.xlabel('Nationality')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.title('Nationality Distribution')

plt.show()