# Process the OpenSanctions (OS) datasets

Goals:
- figure out how to integrate the data and what is in the datasets
- get some basic dataset statistics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm

## Get the datasets
The dataset index is downloaded from https://data.opensanctions.org/datasets/latest/index.json

In [None]:
# URL of the JSON index; the download loop below reads its "datasets" list
# and each dataset's "resources".
INDEX_URL = "https://data.opensanctions.org/datasets/latest/index.json"

In [None]:
import requests
import json

In [None]:
def parse_jsonl(text: str) -> "list | None":
    """Parse JSON Lines text into a list of decoded objects.

    Every non-blank line of *text* is stripped of surrounding whitespace and
    decoded with ``json.loads``.

    Args:
        text: JSONL content, one JSON document per line.

    Returns:
        The decoded objects in input order (an empty list for blank input),
        or ``None`` after printing the error if any line is not valid JSON.
    """
    records = []
    # Split on "\n" only: str.splitlines() would also break on characters such
    # as U+2028, which may appear unescaped inside a JSON string.
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Best effort: report the problem and signal failure with None.
            print(f"Error decoding JSON: {e}")
            return None
    return records
        

def get_json_from_url(url: str, timeout: float = 60.0) -> "dict | list | None":
    """Download *url* and return its body parsed as JSON or JSON Lines.

    The body is first decoded as a single JSON document; if that fails it is
    handed to ``parse_jsonl`` and decoded line by line.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the notebook forever.

    Returns:
        The decoded JSON value, the list produced by ``parse_jsonl``, or
        ``None`` (after printing a message) when the request fails or the
        status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        # Depending on the requests version and whether simplejson is
        # installed, the decode error is not always a json.JSONDecodeError,
        # but every variant is a ValueError subclass.
        return parse_jsonl(response.text)

In [None]:
# Fetch and parse the dataset index. get_json_from_url prints a message and
# returns None when the request fails or the status code is not 200.
index_json = get_json_from_url(INDEX_URL)

In [None]:
def clean_json(json_dict: dict) -> dict:
    """Build a new dict holding only the four entity fields kept downstream.

    The result contains ``id``, ``caption``, ``schema`` and ``properties``,
    in that order; values are taken as-is (not copied) from *json_dict*.

    Raises:
        KeyError: If *json_dict* lacks any of the four fields.
    """
    kept_fields = ('id', 'caption', 'schema', 'properties')
    return {field: json_dict[field] for field in kept_fields}

In [None]:
# Download every dataset listed in the index, keep only its Person entities
# (reduced by clean_json) and write them to one JSONL file per dataset.
# NOTE: `pdata` is reassigned on every iteration, so after the loop it holds
# the persons of the last dataset that was parsed; later cells read it.
import os

TARGET_PATH = "../data/sources/"
FTM_MIME_TYPE = "application/json+ftm"

# Create the output directory up front so the first write cannot fail on it.
os.makedirs(TARGET_PATH, exist_ok=True)

for dataset in tqdm(index_json["datasets"]):
    data_url = "Undefined"
    # Looked up outside the try so the error message below can always use it.
    title = dataset.get("title", "untitled")
    try:
        resources = dataset["resources"]
        ftm_resources = [res for res in resources if res.get("mime_type") == FTM_MIME_TYPE]
        if not ftm_resources:
            print(f"Skipping dataset {title}: no {FTM_MIME_TYPE} resource.")
            continue
        data_url = ftm_resources[0]["url"]
        json_data = get_json_from_url(data_url)
        if json_data is None:
            # get_json_from_url has already printed the underlying error.
            print(f"Failed to get dataset {title} from {data_url}")
            continue
        if isinstance(json_data, dict):
            # A one-line JSONL body parses as a single JSON object; wrap it so
            # the comprehension iterates entities rather than dict keys.
            json_data = [json_data]
        # reduce to only persons and clean
        pdata = [clean_json(d) for d in json_data if d.get("schema") == "Person"]
        # "/" in a title would otherwise be read as a directory separator.
        safe_title = title.replace(' ', '_').replace('/', '_')
        filename = TARGET_PATH + f"{safe_title}.jsonl"
        with open(filename, 'w', encoding="utf-8") as jsonl_file:
            jsonl_file.writelines(json.dumps(d) + '\n' for d in pdata)
        print(f"Wrote data to file {filename}.")
    except Exception as e:
        # Deliberate best effort: one broken dataset must not stop the run.
        print(f"Failed to get dataset {title} from {data_url} {e}")

In [None]:
# NOTE(review): `data` is not assigned in any visible cell (it only exists as
# a local inside parse_jsonl), so this raises NameError on a fresh run.
# Presumably a leftover from an earlier version of the download cell - confirm.
len(data)

In [None]:
# Number of cleaned Person records currently in `pdata`, i.e. those of the
# most recent dataset for which the download loop assigned it.
len(pdata)

In [None]:
# Inspect the first cleaned Person record.
pdata[0]

In [None]:
# Inspect one specific record. The index is hard-coded, so this raises
# IndexError whenever `pdata` has 186125 entries or fewer.
pdata[186125]

## Basic statistics

In [None]:
# genders
from collections import Counter

# Take the first listed gender of each person; a missing or empty "gender"
# property is counted as "unknown".
gender_data = [(p["properties"].get("gender") or ["unknown"])[0] for p in pdata]
# Count the occurrences in a single pass; keys appear in first-seen order,
# which keeps the bar order stable between runs.
gender_counts = dict(Counter(gender_data))

# Extract data for plotting
genders = list(gender_counts.keys())
counts = list(gender_counts.values())

# Create a bar plot
plt.bar(genders, counts, color='skyblue')

plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Gender Distribution')

plt.show()

In [None]:
# nationality
from collections import Counter

# Take the first listed nationality of each person; a missing or empty
# "nationality" property is counted as "unknown".
nationality_data = [(p["properties"].get("nationality") or ["unknown"])[0] for p in pdata]
# Count the occurrences in a single pass instead of re-scanning the list once
# per distinct nationality; keys appear in first-seen order.
nationality_counts = dict(Counter(nationality_data))

# Extract data for plotting
nationalities = list(nationality_counts.keys())
counts = list(nationality_counts.values())

In [None]:
# Draw the per-nationality counts as one wide bar chart.
fig, ax = plt.subplots(figsize=(28, 8))
ax.bar(nationalities, counts, color='skyblue')
ax.set(xlabel='Nationality', ylabel='Count', title='Nationality Distribution')
# Rotate the x tick labels by 90 degrees.
ax.tick_params(axis='x', labelrotation=90)

plt.show()

In [None]:
# filter unknown nationality
# Rebuild both lists without the "unknown" bucket instead of popping in place:
# this is a no-op when "unknown" is absent, and the cell can be re-run safely.
known_pairs = [(nat, cnt) for nat, cnt in zip(nationalities, counts) if nat != "unknown"]
nationalities = [nat for nat, _ in known_pairs]
counts = [cnt for _, cnt in known_pairs]

plt.figure(figsize=(28, 8))
plt.bar(nationalities, counts, color='skyblue')

plt.xlabel('Nationality')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.title('Nationality Distribution')

plt.show()