Data Exploration

# FROM DATA TO INSIGHTS

## Introduction
This notebook is designed so that it can be run from top to bottom in one go.
Python and pip must already be installed.

In [8]:
# Shell escape: print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.7.9


## Install the required packages

In [1]:
!pip install pandas
!pip install requests




In [9]:
import pandas as pd
import json
import os
import requests
from pathlib import Path

In [26]:
# Local file name under which the raw JSON data set is stored and read.
DATA_FILE = "global_cities_data_set.json"
# Download location of the raw data set.
# NOTE(review): the query string looks like a time-limited signed access token
# (se=2021-10-07...), so the link has presumably expired, and tokens like this
# should not live in a notebook — confirm and move it to configuration.
URL_FILE = "https://iisbvicmidlprdsa.blob.core.windows.net/fileshare/DATA_SET_DS_USE_CASE/global_cities_data_set.json?sv=2019-02-02&st=2021-08-06T08%3A18%3A35Z&se=2021-10-07T08%3A18%3A00Z&sr=b&sp=r&sig=vMOCDzuXhxSM%2BT02Wv3Zm2oW7BsXME2mZCk%2F%2BI5uMSU%3D"
# When True, the data set is downloaded from URL_FILE again before it is loaded.
START_FROM_SCRATCH = False
# Region (databank) code the data is narrowed down to in the filtering step.
REGION_FILTER = 'EUREG'
# Year the data is narrowed down to in the filtering step.
YEAR_FILTER = 2021
# Directory that receives all derived CSV files.
DATA_DIR_NAME = "data2"


In [27]:
# Fetch the raw data set when a fresh start is requested or when no local copy
# exists yet (otherwise the open() below fails on a clean checkout).
if START_FROM_SCRATCH or not Path(DATA_FILE).exists():
    r = requests.get(URL_FILE)
    # Fail loudly on an HTTP error instead of saving an error page as the data file.
    r.raise_for_status()
    # The context manager flushes and closes the file even if the write fails.
    with open(DATA_FILE, 'wb') as download_target:
        download_target.write(r.content)

# Parse the JSON; the handle is closed as soon as parsing is done.
# `file_object` stays defined (closed) afterwards, so a later close() is a no-op.
with open(DATA_FILE, encoding='utf8') as file_object:
    data = json.load(file_object)


In [28]:
# Ensure the output directory for derived data exists (no error if it already does).
os.makedirs(DATA_DIR_NAME, exist_ok=True)

## Filtering

The current setup can only visualize data for the EU region (databank `EUREG`),
so the data is filtered down to that region.
It is also filtered down to a single year.

In [31]:
# Flatten the records stored under the top-level 'data' key into a table.
df = pd.json_normalize(data['data'])

print("df.shape: (all): ", df.shape)
# Make sure the year field is an integer so it can be compared with YEAR_FILTER.
df['year'] = df['year'].astype('int32')

# Keep only the configured region and year. The region used to be the literal
# 'EUREG', which would silently disagree with REGION_FILTER (and with the
# label printed below) as soon as the constant is changed.
df = df[(df['databank'] == REGION_FILTER) & (df['year'] == YEAR_FILTER)]
print("df.shape: (" + REGION_FILTER +  " & " + str(YEAR_FILTER) +  "): ", df.shape)

df.shape: (all):  (894942, 9)
df.shape: (EUREG & 2021):  (61535, 9)


## Indicators

The provided file contains a number of different types of data, as can be seen in the `indicator_name` field.
Some indicators belong together, for example Population per age range.
These indicator groups are handled separately.

Indicators that do not belong to a group are each written to their own file.

In [32]:

# Indicators whose names start with one of these prefixes come in several
# bands and are exported together, one file per prefix.
indicator_groups = [
    'Household numbers by income band',
    'Population',
    'Consumer spending by product'
]

# str.startswith() needs a tuple to test several prefixes at once.
indicator_groups_strings = tuple(indicator_groups)

# Every indicator that matches none of the group prefixes stands on its own.
other_indicators = [
    name
    for name in df['indicator_name'].unique()
    if not name.startswith(indicator_groups_strings)
]

# Spaces, commas and slashes become "_" in the file names of single indicators.
file_name_table = str.maketrans(" ,/", "___")

# One CSV file per stand-alone indicator.
for indicator in other_indicators:
    csv_path = DATA_DIR_NAME + os.path.sep + indicator.translate(file_name_table) + '.csv'
    df[df['indicator_name'] == indicator].to_csv(csv_path, sep=";", encoding="utf-8")

# One CSV file per indicator group; the file name is the unmodified group name.
for indicator_group in indicator_groups:
    in_group = df['indicator_name'].str.startswith(indicator_group)
    df[in_group].to_csv(
        DATA_DIR_NAME + os.path.sep + indicator_group + '.csv',
        sep=";",
        encoding="utf-8")

# The complete filtered set in a single file.
df.to_csv(DATA_DIR_NAME + os.path.sep + "total_set.csv", sep=";", encoding="utf-8")

In [33]:
# Close the handle on the raw JSON file that was opened when the data was loaded.
file_object.close()

## Indicator groups

Now process the indicator groups. Different bands of the same kind of data are put into one file for further processing.

As the `value_unit` might not be the same, we can't compare the data in its original form.
For each band a ratio is calculated to indicate what proportion of the total this band represents.
This makes it possible to compare the data no matter the country.

In [51]:
# Base names (without the ".csv" extension) of the grouped indicator files
# that the following cells read from DATA_DIR_NAME, in processing order.
file_list = [
    'Consumer spending by product',
    'Population',
    'Household numbers by income band'
]

In [52]:
# For every grouped indicator file: add a `ratio` column holding the share of
# each row's value in the total of its (year, geographyid) pair, and write the
# result to "<name>_ext.csv".
for file_item in file_list:
    df_data = pd.read_csv(
        DATA_DIR_NAME + os.path.sep + file_item + ".csv",
        sep=";",
        encoding="utf8")

    print("shape: ", df_data.shape)

    # Total of all bands per (year, geographyid), aligned row by row with
    # df_data. This replaces a helper that was redefined on every iteration
    # and filtered the whole summary frame once per row (quadratic work) with
    # a single vectorised group-wise sum.
    group_total = df_data.groupby(['year', 'geographyid'])['value'].transform('sum')

    # Proportion of the pair's total that this band represents.
    df_data['ratio'] = df_data['value'] / group_total

    print("shape: ", df_data.shape)

    df_data.to_csv(DATA_DIR_NAME + os.path.sep + file_item + "_ext.csv",
        sep=";",
        encoding="utf8")

    print("End " + file_item)

print("End cell")

shape:  (10325, 10)
shape:  (10325, 11)
End Consumer spending by product
shape:  (22245, 10)
shape:  (22245, 11)
End Population
shape:  (18238, 10)
shape:  (18238, 11)
End Household numbers by income band
End cell


In [63]:
# Ordered (old, new) substitutions that turn an indicator name into a column
# name. The order matters: later rules operate on the output of earlier ones.
COLUMN_NAME_REPLACEMENTS = (
    ("resident", ""),
    ("based", ""),
    ("current", ""),
    ("prices", ""),
    ("(", ""),
    (")", ""),
    ("Consumer spending by product / service - ", ""),
    ("Household numbers by income band - ", ""),
    (",", ""),
    (" ", "_"),
    ("-", "_"),
    ("____", ""),
    ("__", "_"),
)


def to_column_name(indicator_name):
    """Return the column name derived from an indicator name.

    Applies COLUMN_NAME_REPLACEMENTS in order and lower-cases the result.

    Args:
        indicator_name: value of the `indicator_name` field (a string).

    Returns:
        The cleaned, lower-case column name.
    """
    column_name = indicator_name
    for old, new in COLUMN_NAME_REPLACEMENTS:
        column_name = column_name.replace(old, new)
    return column_name.lower()


# For every grouped indicator file: turn the long "<name>_ext.csv" table (one
# row per indicator band) into a wide table with one ratio column per band,
# keyed by (geographyid, year), and write it to "<name>_ext2.csv".
for file_item in file_list:
    df_data = pd.read_csv(
        DATA_DIR_NAME + os.path.sep + file_item + "_ext.csv",
        sep=";",
        encoding="utf8")

    print("shape: ", df_data.shape)

    # TODO (carried over from the original author): "something weird
    # happening" — not explained further; truncating the input with .head()
    # had been tried here.
    # NOTE(review): if a (geographyid, year) pair occurs more than once for an
    # indicator, the outer merge below multiplies rows — possibly the cause;
    # confirm against the data.

    column_names = []
    df_data_ext = pd.DataFrame()

    # Create one column of ratios per indicator band.
    for indicator_name in df_data.indicator_name.unique():
        column_name = to_column_name(indicator_name)
        print("column_name: ", column_name)
        column_names.append(column_name)

        # Select rows and columns in a single .loc call and rename afterwards.
        # Adding a column to a filtered slice (as done before) triggers
        # pandas' SettingWithCopyWarning.
        df_select = (
            df_data
            .loc[df_data.indicator_name == indicator_name,
                 ["geographyid", "year", "ratio"]]
            .rename(columns={"ratio": column_name})
        )

        if len(df_data_ext) == 0:
            # First band: it becomes the base table.
            df_data_ext = df_select
        else:
            # Outer merge keeps pairs that are missing for some bands.
            df_data_ext = df_data_ext.merge(
                right=df_select,
                on=["geographyid", "year"],
                how="outer")

        print("Shape: ", df_data_ext.shape)

    df_data_ext.to_csv(DATA_DIR_NAME + os.path.sep + file_item + "_ext2.csv",
        sep=";",
        encoding="utf8")

    print("End " + file_item)

print("End cell")

shape:  (10325, 12)
column_name:  furniture_and_furnishings_carpets_and_other_floor_coverings
Shape:  (1475, 3)
column_name:  household_and_garden_tools_and_equipment
Shape:  (1475, 4)
column_name:  household_appliances
Shape:  (1475, 5)
column_name:  household_furnishings_household_equipment_and_other_housing_expenditure__total
Shape:  (1475, 6)
column_name:  household_glassware_tableware_and_household_utensils
Shape:  (1475, 7)
column_name:  household_textiles
Shape:  (1475, 8)
column_name:  routine_household_maintenance_goods_and_services
Shape:  (1475, 9)
End Consumer spending by product
shape:  (22245, 12)
column_name:  population_25_29
Shape:  (1483, 3)
column_name:  population_10_14
Shape:  (1483, 4)
column_name:  population_70_74
Shape:  (1483, 5)
column_name:  population_65_69
Shape:  (1483, 6)
column_name:  population_55_59
Shape:  (1483, 7)
column_name:  population_60_64
Shape:  (1483, 8)
column_name:  population_80+
Shape:  (1483, 9)
column_name:  population_40_44
Shape:  (