In [1]:
import sys
sys.path.append("/Users/alex/Downloads/stats/")

import pandas as pd
import os
from db import connection
from db_utils import DBUtils
import xlrd
from tqdm import tqdm
import datetime
import json
from glob import glob

## Preparing CSVs

# Datasets

In [2]:
# Single-row lookup table describing the dataset imported by this notebook.
names = ["IHME SDG"]
ids = [0]

# Column order (id, name) is what ends up in datasets.csv.
datasets = pd.DataFrame({'id': ids, 'name': names})
datasets.to_csv('datasets.csv', index=False)

In [3]:
# Preview the datasets table that was written to datasets.csv.
datasets

Unnamed: 0,id,name
0,0,IHME SDG


# Sources

In [4]:
# Describe the single source of this dataset and persist it to sources.csv.
source_name = "IHME SDG"
dataset_id = 0

# Provenance metadata; serialised to a JSON string in the `description` column.
# Key order is kept stable because it determines the serialised text.
description = {
    'dataPublishedBy': "Global Burden of Disease Study 2017 (GBD 2017) Health-related Sustainable Development Goals (SDG) Indicators 1990-2030",
    'dataPublisherSource': None,
    'link': 'http://ghdx.healthdata.org/record/ihme-data/gbd-2017-health-related-sdgs-1990-2030',
    # Stamped with the date on which this cell is executed.
    'retrievedDate': datetime.datetime.now().strftime("%d-%b-%Y"),
    'additionalInfo': None,
}

names = [source_name]
desc = [json.dumps(description)]
d_ids = [dataset_id]

res = pd.DataFrame({'name': names, 'description': desc, 'dataset_id': d_ids})
res.to_csv("sources.csv", index=False)

In [5]:
# Preview the sources table that was written to sources.csv.
res

Unnamed: 0,name,description,dataset_id
0,IHME SDG,"{""dataPublishedBy"": ""Global Burden of Disease ...",0


# Variables

In [6]:
# Build the variable catalogue: one row per distinct (name, unit) pair found in
# the raw CSVs under data/. A variable name has the form
#   "<ihme_indicator_description> - <Estimate_type> - <Scaled|Unscaled>".
UNSCALED_FILE = "IHME_GBD_2017_HEALTH_SDG_1990_2030_UNSCALED_Y2019M02D06.CSV"

var_frames = []
# sorted(): glob order is filesystem-dependent, and the ids assigned below are
# row positions, so a stable file order keeps the ids reproducible between runs.
for x in tqdm(sorted(glob("data/*.CSV"))):

    # The codebook describes the columns; it holds no indicator rows.
    if "CODEBOOK" in x:
        continue
    data = pd.read_csv(x)

    scaling = "Unscaled" if UNSCALED_FILE in x else "Scaled"

    # Vectorised name construction (same expression the datapoints cell uses)
    # instead of appending to Python lists row by row.
    var_frames.append(pd.DataFrame({
        'name': data['ihme_indicator_description'] + " - " + data['estimate_type'].str.capitalize() + " - " + scaling,
        'unit': data['indicator_unit'],
    }))

if var_frames:
    variables = pd.concat(var_frames, ignore_index=True)
else:
    # No input files: still produce an empty table with the expected columns.
    variables = pd.DataFrame(columns=['name', 'unit'])

variables['dataset_id'] = 0
# Ids are assigned BEFORE de-duplication, so each surviving variable keeps the
# position of its first occurrence; ids are unique but not contiguous.
variables['id'] = range(len(variables))

variables = variables.drop_duplicates(subset=['name', 'unit'])

variables.to_csv("variables.csv", index=False)

100%|██████████| 4/4 [01:08<00:00, 17.17s/it]


# Datapoints

In [13]:

# Write one CSV per variable (datapoints/datapoints_<id>.csv) holding the
# year, country and value columns for that variable.
UNSCALED_FILE = "IHME_GBD_2017_HEALTH_SDG_1990_2030_UNSCALED_Y2019M02D06.CSV"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('datapoints', exist_ok=True)

# Resolve variable name -> id once; the first row wins for a repeated name,
# which matches taking `.values[0]` of a filtered frame.
variable_ids_by_name = (
    variables.drop_duplicates(subset='name').set_index('name')['id'].to_dict()
)

# sorted(): process the files in a stable order regardless of the filesystem.
for x in tqdm(sorted(glob("data/*.CSV"))):

    # The codebook describes the columns; it holds no indicator rows.
    if "CODEBOOK" in x:
        continue
    data = pd.read_csv(x)

    scaling = "Unscaled" if UNSCALED_FILE in x else "Scaled"
    # Each raw file carries its values in a column named after its scaling.
    value_column = 'unscaled_value' if scaling == "Unscaled" else 'scaled_value'

    data['var_val'] = data['ihme_indicator_description'] + " - " + data['estimate_type'].str.capitalize() + " - " + scaling

    # One pass over the frame instead of one boolean filter per variable;
    # sort=False keeps the groups in order of first appearance.
    for var_name, sub_data in data.groupby('var_val', sort=False):

        var_id = variable_ids_by_name[var_name]

        res = pd.DataFrame()
        res['year'] = sub_data['year_id']
        res['country'] = sub_data['location_name']
        res['value'] = sub_data[value_column]
        res.to_csv('datapoints/datapoints_%s.csv' % str(var_id), index=False)


100%|██████████| 4/4 [00:09<00:00,  2.49s/it]


## Get country names

In [79]:
# NOTE(review): this cell is disabled (fully commented out). As written it
# collects the distinct `country` values from every datapoints CSV and writes
# them to distinct_countries_standardized.csv with a single `name` column.
# The "Insert db" cell reads a `db_entity_id` column from that same file, so
# re-running this code would overwrite it without that column -- presumably
# the column is added by a separate standardisation step; confirm before
# re-enabling.
# countries = set()

# for x in tqdm(glob('datapoints/*.csv')):
    
   
#     data = pd.read_csv(x)
#     for j in data['country'].values:
#         countries.add(j)
# res = pd.DataFrame()
# res['name'] = list(countries)
# res.to_csv("distinct_countries_standardized.csv", index=False)

100%|██████████| 168/168 [00:00<00:00, 316.01it/s]


In [80]:
# NOTE(review): the code that assigns the country list to `res` is commented
# out, so on a fresh run this shows whatever an earlier cell last bound to
# `res`, not the country table in the saved output.
res

Unnamed: 0,name
0,Sri Lanka
1,Democratic Republic of the Congo
2,Bangladesh
3,American Samoa
4,Tonga
...,...
190,Turkmenistan
191,South Africa
192,Slovakia
193,Ukraine


## Insert into the database

In [14]:
# Push the prepared CSVs into the database in dependency order:
# entities -> datasets -> sources -> variables -> data values.
with connection as c:
    db = DBUtils(c)

    entities = pd.read_csv("distinct_countries_standardized.csv")
    datasets = pd.read_csv("datasets.csv")
    sources = pd.read_csv("sources.csv")
    variables = pd.read_csv('variables.csv')

    # Countries without a db_entity_id get one via get_or_create_entity; the id
    # is filled into the in-memory table only (the CSV is not rewritten).
    new_entities = entities[entities['db_entity_id'].isnull()]
    for entity_id, entity in new_entities.iterrows():
        entities.loc[entity_id, 'db_entity_id'] = db.get_or_create_entity(entity['name'])

    # Lookups built once instead of scanning a DataFrame for every row.
    # drop_duplicates keeps the first match, like `.values[0]` on a filter.
    dataset_names_by_id = datasets.drop_duplicates(subset='id').set_index('id')['name'].to_dict()
    entity_ids_by_name = entities.drop_duplicates(subset='name').set_index('name')['db_entity_id'].to_dict()
    variable_names_by_file_id = variables.drop_duplicates(subset='id').set_index('id')['name'].to_dict()

    # upsert datasets
    # NOTE(review): namespace "unwpp" and user_id 15 are hard-coded; "unwpp"
    # does not obviously relate to the IHME SDG dataset -- confirm both.
    dataset_name_ids = {}
    for i, row in tqdm(datasets.iterrows(), total=len(datasets)):
        dataset_id = db.upsert_dataset(name=row['name'], namespace="unwpp", user_id=15)
        dataset_name_ids[row['name']] = dataset_id

    # upsert sources
    dataset_to_source_ids = {}
    for i, row in tqdm(sources.iterrows(), total=len(sources)):

        dataset_name = dataset_names_by_id[row['dataset_id']]
        # `description` was serialised with json.dumps when sources.csv was
        # written and is read back as that JSON text, so it is passed through
        # as-is; dumping it again would store a quoted, escaped string instead
        # of the JSON object.
        source_id = db.upsert_source(name=row['name'], description=row['description'], dataset_id=dataset_name_ids[dataset_name])

        dataset_to_source_ids[dataset_name] = source_id

    # upsert variables
    names_to_ids = {}
    for i, row in tqdm(variables.iterrows(), total=len(variables)):

        dataset_name = dataset_names_by_id[row['dataset_id']]
        dataset_id = dataset_name_ids[dataset_name]
        source_id = dataset_to_source_ids[dataset_name]

        # A missing unit is read back from the CSV as NaN; store it as "".
        unit = row['unit'] if pd.notnull(row['unit']) else ""

        variable_id = db.upsert_variable(
                                        name=row['name'],
                                        code=None,
                                        unit=unit,
                                        short_unit=None,
                                        source_id=source_id,
                                        dataset_id=dataset_id,
                                        description=None,
                                        timespan='',
                                        coverage='',
                                        display={}
                                        )
        names_to_ids[row['name']] = variable_id

    # Inserting datapoints
    datapoints_files = glob("datapoints/*.csv")
    for x in tqdm(datapoints_files):
        # Files are named datapoints_<id>.csv, where <id> is the `id` column of
        # variables.csv. Parse the basename only, so an underscore elsewhere in
        # the path cannot shift the split.
        v_id = int(os.path.splitext(os.path.basename(x))[0].rsplit("_", 1)[1])

        # CSV-local variable id -> variable name -> id assigned by the database.
        variable_name = variable_names_by_file_id[v_id]
        variable_id = names_to_ids[variable_name]
        data = pd.read_csv(x)

        # Iterate plain Python scalars column-wise; this avoids building a
        # Series per row as iterrows does.
        for year, country, val in zip(data['year'].tolist(), data['country'].tolist(), data['value'].tolist()):
            # Raises KeyError naming the country if it is missing from the
            # standardised entity list.
            entity_id = entity_ids_by_name[country]

            db.upsert_one("""
                INSERT INTO data_values
                    (value, year, entityId, variableId)
                VALUES
                    (%s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                    value = VALUES(value),
                    year = VALUES(year),
                    entityId = VALUES(entityId),
                    variableId = VALUES(variableId)
            """, [val, int(year), str(int(entity_id)), str(variable_id)])
    

1it [00:00, 10.83it/s]
1it [00:00, 19.82it/s]
168it [00:00, 436.89it/s]
100%|██████████| 168/168 [13:43<00:00,  4.61s/it]
