In [1]:
import sys
sys.path.append("/Users/aermolaev/Downloads/data_processing/")

import pandas as pd
import os
from db import connection
from db_utils import DBUtils
import xlrd
from tqdm import tqdm
import datetime
import json
from glob import glob

# povstats

# excel_filename =  "data_povstats/PovStatsEXCEL.xlsx"
# prefix = "World Bank Poverty and Equity database"
# url = "https://data.worldbank.org/data-catalog/poverty-and-equity-database" 
# files_folder = "files_povstats/"
# datapoints_folder = "datapoints_povstats/"

# edstats

# excel_filename = "data_edstats/EdStatsEXCEL.xlsx" 
# prefix = "World Bank EdStats"
# url = "https://datacatalog.worldbank.org/dataset/education-statistics"
# files_folder = "files_edstats/"
# datapoints_folder = "datapoints_edstats/"

# wdi
# Active configuration: input workbook, naming prefix, source link and output folders
# used by every cell below.
# NOTE(review): `prefix` is the same string as the commented-out povstats prefix above,
# which looks like a copy-paste leftover for the WDI run — confirm the intended value.

excel_filename =  "data_wdi/WDIEXCEL.xlsx"
prefix = "World Bank Poverty and Equity database"
url = "https://datacatalog.worldbank.org/dataset/world-development-indicators" 
files_folder = "files_wdi/"
datapoints_folder = "datapoints_wdi/"

## Preparing CSVs

# Datasets

In [2]:
# The "Series" sheet describes every indicator; its distinct topics become datasets.
data = pd.read_excel(excel_filename, sheet_name="Series")

# Dataset names are "<prefix> - <topic>", in order of first appearance.
names = [prefix + " - " + topic for topic in data['Topic'].unique()]

# Dataset ids are simply the positions in that list.
datasets = pd.DataFrame({'id': range(len(names)), 'name': names})

datasets.to_csv(files_folder + 'datasets.csv', index=False)

# Sources

In [3]:
# Optional "Series" columns appended to `additionalInfo` when filled in; each one is
# written under a heading made of the column name followed by a colon.
extra_info_columns = [
    "Limitations and exceptions",
    "Notes from original source",
    "General comments",
    "Statistical concept and methodology",
    "Related source links",
    "Other web links",
]

names, desc, d_ids = [], [], []

# One source per distinct indicator name, described by the first matching "Series" row.
for indicator in data['Indicator Name'].unique():

    rows = data[data['Indicator Name'] == indicator]

    # Id of the dataset built from this indicator's topic.
    topic_dataset_name = prefix + " - " + rows['Topic'].values[0]
    dataset_id = datasets[datasets['name'] == topic_dataset_name]['id'].values[0]

    retrieved = datetime.datetime.now().strftime("%d-%b-%Y")

    # NOTE(review): this link points at the povstats country-info file whichever
    # workbook is configured — confirm that is intended.
    additional_info = "Definitions and characteristics of countries and other territories: " + "https://ourworldindata.org/grapher/povstats/POVSTATS_Country_info.xls\r\n"
    for column in extra_info_columns:
        value = rows[column].values[0]
        if pd.notnull(value):
            additional_info += column + ":\n" + value + "\n"

    # Key order is kept stable because the dict is written to the CSV as text.
    description = {
        'dataPublishedBy': prefix,
        'link': url,
        'retrievedDate': retrieved,
        'additionalInfo': additional_info,
        'dataPublisherSource': rows['Source'].values[0],
    }

    names.append(prefix + ": " + indicator)
    desc.append(description)
    d_ids.append(dataset_id)

res = pd.DataFrame({'name': names, 'description': desc, 'dataset_id': d_ids})
res.to_csv(files_folder + "sources.csv", index=False)

# Variables

In [4]:
# One variable per row of the "Series" sheet; missing names/units become empty strings.
var_names = [name if pd.notnull(name) else "" for name in data['Indicator Name']]
var_units = [unit if pd.notnull(unit) else "" for unit in data['Unit of measure']]

# Normalised series codes, used later to match rows of the "Data" sheet.
var_codes = [code.lower().strip() for code in data['Series Code']]

# Id of the dataset that was built from each row's topic.
var_ids = [
    datasets.loc[datasets['name'] == prefix + " - " + topic, 'id'].values[0]
    for topic in data['Topic']
]

variables = pd.DataFrame({
    'name': var_names,
    'unit': var_units,
    'dataset_id': var_ids,
    'id': range(len(var_units)),
})
variables.to_csv(files_folder + "variables.csv", index=False)

# Added only after the CSV is written, so the code column exists in memory but not
# in variables.csv.
variables['Indicator Code'] = var_codes



# Datapoints

In [5]:

# Load the "Data" sheet. This rebinds `data`, which held the "Series" sheet until now.
data = pd.read_excel(excel_filename, sheet_name="Data")



In [6]:
def normalize_country(row):
    """Strip trailing whitespace and non-letter characters from country names.

    Despite the parameter name, ``row`` is treated as a whole DataFrame: its
    ``country`` column is processed through the ``.str`` accessor.

    Args:
        row: DataFrame with a string ``country`` column. It is modified in place.

    Returns:
        The same object that was passed in, with ``country`` cleaned.
    """
    # regex=True is explicit because pandas >= 2.0 makes ``Series.str.replace`` match
    # literally by default, which would silently leave every name unchanged.
    row['country'] = row['country'].str.replace(r'\s*[^A-Za-z\s]*$', '', regex=True)
    return row

In [7]:
# Write one long-format CSV (year, country, value) per indicator found in `data`.
for x in data['Indicator Code'].unique():
    try:
        # Id given to this indicator when the `variables` frame was built.
        var_id = variables[variables['Indicator Code'] == x.lower().strip()]['id'].values[0]

        subdata = data[data['Indicator Code'] == x].drop(["Indicator Code", "Indicator Name", "Country Code"], axis=1)

        # Wide -> long: the remaining column headers become the "year" column.
        res = subdata.set_index('Country Name').unstack().reset_index()
        # Keep only the cells that actually hold a value.
        res = res.dropna(subset=[0], how='all')
        res = res.rename(columns={"level_0": "year", "Country Name": "country", 0: "value"})
        res = normalize_country(res)
        res.to_csv(datapoints_folder + 'datapoints_%s.csv' % str(var_id), index=False)

    except Exception as exc:
        # Report which indicator failed and why, then stop as before. Catching
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        print(x, repr(exc))
        break

# Get country names

In [8]:
# Gather every distinct country name used across the per-variable datapoint CSVs.
countries = set()

for csv_path in tqdm(glob(datapoints_folder + "*.csv")):
    data = pd.read_csv(csv_path)
    countries.update(data['country'].values)

res = pd.DataFrame({'name': list(countries)})
res.to_csv(files_folder + "distinct_countries.csv", index=False)

100%|██████████| 3665/3665 [00:07<00:00, 483.91it/s]


In [8]:
import ast  # cell-local: only needed to decode the descriptions stored in sources.csv


def _description_to_json(raw):
    """Return JSON text for a ``description`` cell read back from sources.csv.

    The cell arrives as text. Serializing that text directly would store a quoted
    JSON string instead of an object, so it is decoded first: as JSON, then as a
    Python literal (the form a dict takes when written by ``to_csv``). Text that
    cannot be decoded either way is serialized unchanged, as before.
    """
    if not isinstance(raw, str):
        return json.dumps(raw)
    for decode in (json.loads, ast.literal_eval):
        try:
            return json.dumps(decode(raw))
        except (ValueError, SyntaxError):
            continue
    return json.dumps(raw)


with connection as c:
    db = DBUtils(c)

    entities = pd.read_csv(files_folder + "distinct_countries_standardized.csv")
    datasets = pd.read_csv(files_folder + "datasets.csv")
    sources = pd.read_csv(files_folder + "sources.csv")
    variables = pd.read_csv(files_folder + 'variables.csv')

    # Give every country that has no database id yet an entity, and record its id.
    new_entities = entities[entities['db_entity_id'].isnull()]
    for _, entity in new_entities.iterrows():
        entity_id = entity.name  # index label of the row in `entities`
        entity_name = entity['name']
        db_entity_id = db.get_or_create_entity(entity_name)
        entities.loc[entity_id, 'db_entity_id'] = db_entity_id

    # upsert datasets
    # NOTE(review): namespace "unwpp" looks inherited from another importer — confirm
    # it is the intended namespace for this data.
    dataset_name_ids = {}
    for i, row in tqdm(datasets.iterrows()):
        dataset_id = db.upsert_dataset(name=row['name'], namespace="unwpp", user_id=15)
        dataset_name_ids[row['name']] = dataset_id

    # upsert sources
    dataset_to_source_ids = {}  # last source seen per dataset (fallback only)
    source_name_to_id = {}      # every source, keyed by its own name
    for i, row in tqdm(sources.iterrows()):

        dataset_name = datasets[datasets['id'] == row['dataset_id']]['name'].values[0]
        source_id = db.upsert_source(name=row['name'], description=_description_to_json(row['description']), dataset_id=dataset_name_ids[dataset_name])

        source_name_to_id[row['name']] = source_id
        dataset_to_source_ids[dataset_name] = source_id

    # upsert variables
    names_to_ids = {}
    for i, row in tqdm(variables.iterrows()):

        dataset_name = datasets[datasets['id'] == row['dataset_id']]['name'].values[0]
        dataset_id = dataset_name_ids[dataset_name]

        # Sources are named "<prefix>: <indicator name>", so each variable is linked to
        # its own source; the dataset-level source is only used when none matches.
        own_source_name = "%s: %s" % (prefix, row['name'])
        source_id = source_name_to_id.get(own_source_name, dataset_to_source_ids[dataset_name])

        unit = row['unit'] if pd.notnull(row['unit']) else ""

        variable_id = db.upsert_variable(
                                        name=row['name'], 
                                        code=None, 
                                        unit=unit, 
                                        short_unit=None, 
                                        source_id=source_id, 
                                        dataset_id=dataset_id, 
                                        description=None, 
                                        timespan='', 
                                        coverage='', 
                                        display={}
                                        )
        names_to_ids[row['name']] = variable_id

    # Inserting datapoints

    # Country name -> database entity id, built once instead of filtering `entities`
    # for every datapoint row. The first row per name wins, as with the mask lookup.
    entity_ids = entities.drop_duplicates('name').set_index('name')['db_entity_id'].to_dict()

    datapoints_files = glob(datapoints_folder + "*.csv")
    for x in tqdm(datapoints_files): 
        # Files are named "datapoints_<csv variable id>.csv". Only the base name is
        # parsed, so underscores elsewhere in the path cannot shift the id's position.
        v_id = int(os.path.splitext(os.path.basename(x))[0].split("_")[-1])

        # to get variable name
        variable_name = variables[variables['id']==v_id]['name'].values[0]

        # to get variable id from db
        variable_id = names_to_ids[variable_name]
        data = pd.read_csv(x)

        values_to_insert = []

        for i, row in data.iterrows():

            entity_id = entity_ids[row['country']]

            year = row['year']
            val = row['value']
            values_to_insert.append((val, int(year), str(int(entity_id)), str(variable_id)))

        db.upsert_many("""
            INSERT INTO data_values
                (value, year, entityId, variableId)
            VALUES
                (%s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                value = VALUES(value),
                year = VALUES(year),
                entityId = VALUES(entityId),
                variableId = VALUES(variableId)
        """, values_to_insert)
    

88it [00:00, 1189.35it/s]
1429it [00:02, 629.61it/s]
1429it [00:02, 670.56it/s]
  0%|          | 3/1429 [00:08<1:07:50,  2.85s/it]


KeyboardInterrupt: 