In [1]:
import json
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Root folders: the CSV inputs are read from under `data`, and the JSON
# summaries produced by this notebook are written under `metadata`.
path_to_data, path_to_metadata = 'data', 'metadata'

# Per-source sub-folders; each source uses the same sub-folder name under both roots.
path_to_wgi_data, path_to_wgi_metadata = f'{path_to_data}/wgi', f'{path_to_metadata}/wgi'
path_to_cpi_data, path_to_cpi_metadata = f'{path_to_data}/cpi', f'{path_to_metadata}/cpi'

In [3]:
# Dataset name -> CSV location for every WGI file.
# The dataset name is reused as the stem of the JSON summary written by `explore_data`.
wgi_paths = dict(
    country_metadata=f'{path_to_wgi_data}/country_metadata.csv',
    series_metadata=f'{path_to_wgi_data}/series_metadata.csv',
    wgi_sample_data=f'{path_to_wgi_data}/wgi_sample_data.csv',
    wgi_sample_metadata=f'{path_to_wgi_data}/wgi_sample_metadata.csv',
)

# Dataset name -> CSV location for every CPI file.
cpi_paths = dict(
    global_scores=f'{path_to_cpi_data}/global_scores.csv',
    historical_scores=f'{path_to_cpi_data}/historical_scores.csv',
    time_series=f'{path_to_cpi_data}/time_series.csv',
)

In [4]:
def _nan_to_none(value):
    """Return a copy of `value` with every float NaN replaced by None.

    Walks dicts, lists and tuples recursively (tuples come back as lists,
    which is also how `json` serialises them). Anything else is returned
    unchanged; infinities are deliberately left as they are.
    """
    if isinstance(value, dict):
        return {key: _nan_to_none(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_nan_to_none(item) for item in value]
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return None
    return value


def explore_data(name, path, dir, sample=False):
    """Summarise a CSV file and write the summary to `<dir>/<name>.json`.

    The summary always contains the keys 'Dataset', 'Shape', 'Columns' and
    'Describe' (the output of `DataFrame.describe()`). With `sample=True`
    it also contains 'Head' and 'Tail' (the output of `DataFrame.head()` /
    `DataFrame.tail()`).

    Missing values (NaN) are written as JSON `null`. Without this,
    `json.dump` emits the bare token `NaN`, which is not valid JSON and is
    rejected by strict parsers. NaN shows up routinely here, e.g. as the
    `std` of a column with a single observation or as an empty CSV cell in
    the head/tail sample.

    Parameters
    ----------
    name : str
        Dataset name; stored under 'Dataset' and used as the output file stem.
    path : str or path-like
        CSV file to read; its first row is used as the header.
    dir : str or path-like
        Output directory; created (including parents) if it does not exist.
    sample : bool, default False
        Whether to include the first and last rows in the summary.

    Returns
    -------
    None
        The function is called for its side effect of writing the JSON file.

    Errors raised by `pd.read_csv` (for example for a missing file) and by
    writing the output file propagate unchanged.
    """
    data = pd.read_csv(path, header=0)
    result = {
        'Dataset': name,
        'Shape': data.shape,
        'Columns': data.columns.tolist(),
        'Describe': data.describe().to_dict()
    }

    if sample:
        result.update({
            'Head': data.head().to_dict(),
            'Tail': data.tail().to_dict()
        })

    # Use a separate local instead of rebinding the `dir` parameter.
    out_dir = Path(dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Explicit encoding so the output does not depend on the platform default.
    with open(out_dir / f'{name}.json', 'w', encoding='utf-8') as f:
        json.dump(_nan_to_none(result), f, indent=4)

In [10]:
# Write one JSON summary per WGI CSV into the WGI metadata folder.
for name, path in wgi_paths.items():
    explore_data(name=name, path=path, dir=path_to_wgi_metadata)

In [11]:
# Write one JSON summary per CPI CSV into the CPI metadata folder.
for name, path in cpi_paths.items():
    explore_data(name=name, path=path, dir=path_to_cpi_metadata)

In [12]:
# Column names of interest, grouped by the table they belong to.
# NOTE(review): presumably passed to `subset` as match strings; confirm at the call site.

# CPI tables.
cpi_columns = ["Country / Territory", "ISO3", "CPI score"]

# WGI indicator table (only the ": Estimate" columns are listed).
wgi_columns = [
    "Control of Corruption: Estimate",
    "Government Effectiveness: Estimate",
    "Rule of Law: Estimate",
    "Political Stability and Absence of Violence/Terrorism: Estimate",
]

# WGI country metadata table.
wgi_metadata_columns = ["ISO3", "Income Group", "Region"]

In [13]:
def subset(data, substrings):
    """Return the columns of `data` whose label contains any of `substrings`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to select columns from; it is not modified.
    substrings : str or iterable of str
        Text fragments to look for in the column labels. A single string is
        treated as one fragment (not as an iterable of characters).

    Returns
    -------
    pandas.DataFrame
        The matching columns in their original order. If nothing matches
        (or `substrings` is empty) the result has no columns.

    Notes
    -----
    Matching is case-sensitive. Non-string column labels are compared via
    their `str()` form so that they do not raise `TypeError`.
    """
    # Iterating a bare string would test each character separately and
    # match almost every column, so wrap it into a one-element tuple.
    if isinstance(substrings, str):
        substrings = (substrings,)
    else:
        # Materialise once: a generator would be exhausted after the first column.
        substrings = tuple(substrings)

    # Select by position rather than by label so that duplicated column
    # labels are not returned more than once each.
    positions = [
        index
        for index, col in enumerate(data.columns)
        if any(sub in (col if isinstance(col, str) else str(col)) for sub in substrings)
    ]
    return data.iloc[:, positions]