### Imports

In [1]:
import psycopg2
import pandas as pd
import numpy as np
import warnings

from config import config
from datetime import datetime, timedelta

warnings.filterwarnings("ignore")


### Define connect method to connect to the postgres database and return the connection

source: https://www.postgresqltutorial.com/postgresql-python/connect/

In [2]:
def connect():
    """Open a connection to the PostgreSQL server described by ``config()``.

    Returns:
        The connection object returned by ``psycopg2.connect`` on success.
        If anything fails, the error is printed and ``None`` is returned.
    """
    try:
        # Connection keyword arguments come from the project's config helper.
        connection_kwargs = config()

        print('Connecting to the PostgreSQL database...')
        return psycopg2.connect(**connection_kwargs)
    except (Exception, psycopg2.DatabaseError) as exc:
        # Best-effort: report the problem and let the caller see None.
        print(exc)
    return None

### Connect to the database

In [3]:
conn = connect()

# connect() prints the error and returns None on failure. Stop here with a clear
# message instead of failing later with an AttributeError on `conn.cursor()`.
if conn is None:
    raise RuntimeError("Could not connect to the PostgreSQL database (see the error printed above).")

Connecting to the PostgreSQL database...


### Create the database tables

In [41]:
# Create the seven dimension tables and the fact table that references them.
# If any statement fails, the transaction is rolled back so the connection stays
# usable, and the cursor is always closed. Nothing is committed here.
cur = conn.cursor()

try:
    # Country Table
    cur.execute("""CREATE TABLE Country (
                    Country_key int PRIMARY KEY,
                    Name varchar(255),
                    Region varchar(255),
                    Continent varchar(255),
                    Currency varchar(255),
                    Capital varchar(255),
                    Total_population int,
                    Birth_rate float,
                    Gross_national_income float,
                    Life_expectancy_at_birth float,
                    Labor_force_total int,
                    Human_capital_index int,
                    Population_grown_annual float
                );""")

    # Month Table
    cur.execute("""CREATE TABLE Month (
                    Month_Key int PRIMARY KEY,
                    Name varchar(255),
                    Quarter int,
                    Year int,
                    Decade int
                );""")

    # Education Table
    cur.execute("""CREATE TABLE Education (
                    Education_Key int PRIMARY KEY,
                    Total_Literacy_Rate float,
                    Male_Literacy_Rate float,
                    Female_Literacy_Rate float,
                    Primary_School_Enrollment float,
                    Post_Secondary_School_Enrollment float,
                    Public_Education_Spending float,
                    Pop_compuslory_school_age_total int,
                    Pop_offical_entrance_age_primary_total int,
                    Pop_offical_entrance_age_secondary_total int,
                    Teachers_primary_total int,
                    Teachers_secondary_total int
                );""")

    # Health Table
    cur.execute("""CREATE TABLE Health (
                    Health_Key int PRIMARY KEY,
                    Domestic_Health_Expenditure float,
                    Hospital_Beds float,
                    Immunization_attr_Hep int,
                    Immunization_attr_DPT int,
                    Immunization_attr_Measles int,
                    Immunization_attr_Polio int,
                    Num_Surgical_procedures int,
                    Num_Death_infant int,
                    Num_Death_stilbirths int,
                    Num_Death_Elderly int,
                    Num_health_professionals_Nurses float,
                    Num_health_professionals_Physicians float,
                    Prevalence_health_condition_overweight float,
                    Prevalence_health_condition_diabetes float,
                    Prevalence_health_condition_hiv float,
                    Adults_HIV_15up float,
                    Adults_new_HIV_15up float,
                    Children_HIV_under15 float,
                    Children_new_HIV_under15 float,
                    Homelessness_rate_male float,
                    Homelessness_rate_female float,
                    Homelessness_rate_total float,
                    CrimeRate float,
                    Cost_of_living_index float
                );""")

    # Quality_of_Life Table
    cur.execute("""CREATE TABLE Quality_of_Life(
                    Quality_of_Life_Key int PRIMARY KEY,
                    Access_to_Drinking_Water float,
                    Access_to_Sanitation float,
                    Access_to_Basic_Handwashing_Facilities float,
                    Unemployment_Rate_F float,
                    Unemployment_Rate_M float,
                    Unemployment_Rate_T float,
                    Access_to_Electricity_Total float,
                    Access_to_Electricity_Urban float,
                    Access_to_Electricity_Rural float,
                    Part_Time_Employment_T float, 
                    Part_Time_Employment_F float,    
                    Part_Time_Employment_M float
                );""")

    # Population Table
    cur.execute("""CREATE TABLE Population(
                    Population_Key int PRIMARY KEY,
                    Life_Expectancy_At_Birth_F float,
                    Life_Expectancy_At_Birth_M float,
                    Life_Expectancy_At_Birth_T float,
                    Net_Migration int,
                    Population_ages_0_15 int,
                    Population_ages_16_30 int,
                    Population_ages_31_64 int,
                    Population_ages_65_up int,
                    Rural_Population float,
                    Rural_Population_Growth_Rate float,
                    Rural_Poverty_Rate float,
                    Urban_Population float,
                    Urban_Population_Growth_Rate float,
                    Urban_Poverty_Rate float
                );""")

    # Event Table
    cur.execute("""CREATE TABLE Event (
                    Event_key int PRIMARY KEY,
                    Name varchar(255),
                    Disaster_Type varchar(255),
                    Start_Day int,
                    End_Day int,
                    Start_Month int,
                    End_Month int,
                    Start_Year int,
                    End_Year int,
                    Disaster_Subgroup varchar(255),
                    Total_Deaths int,
                    No_Injured int,
                    No_Affected int
                );""")

    # Fact Table (created last: its foreign keys reference every table above)
    cur.execute("""CREATE TABLE Fact_Table (
                    Month_Key int,
                    Country_Key int,
                    Education_Key int,
                    Population_Key int,
                    Quality_of_Life_Key int,
                    Health_Key int,
                    Event_Key int,
                    Quality_of_life int,
                    Development_Index int,
                    Human_Development_Index int,
                    CONSTRAINT fk_month FOREIGN KEY(Month_Key) REFERENCES Month(Month_Key),
                    CONSTRAINT fk_country FOREIGN KEY(Country_Key) REFERENCES Country(Country_Key),
                    CONSTRAINT fk_education FOREIGN KEY(Education_Key) REFERENCES Education(Education_Key),
                    CONSTRAINT fk_population FOREIGN KEY(Population_Key) REFERENCES Population(Population_Key),
                    CONSTRAINT fk_quality_of_life FOREIGN KEY(Quality_of_Life_Key) REFERENCES Quality_Of_Life(Quality_of_Life_Key),
                    CONSTRAINT fk_health FOREIGN KEY(Health_Key) REFERENCES Health(Health_Key),
                    CONSTRAINT fk_event FOREIGN KEY(Event_Key) REFERENCES Event(Event_Key)
                );""")
except Exception:
    # A failed statement aborts the whole transaction; roll it back so the
    # connection can be used again, then surface the original error.
    conn.rollback()
    raise
finally:
    cur.close()


Drop all the tables in case we need to redefine them

In [40]:
# Destructive schema reset. Set the flag to True only when the tables need to be
# re-created; with the default (False) this cell changes nothing in the database.
DROP_EXISTING_TABLES = False

cur = conn.cursor()

try:
    if DROP_EXISTING_TABLES:
        # Fact_Table is dropped first because it holds foreign keys into every
        # dimension table.
        for table_name in (
            "Fact_Table",
            "Country",
            "Month",
            "Education",
            "Health",
            "Quality_of_Life",
            "Population",
            "Event",
        ):
            cur.execute(f"DROP TABLE {table_name}")
finally:
    cur.close()


Roll back the current transaction in case one of the statements above fails

In [23]:
# Roll back through the connection: the cursor `cur` is closed at the end of the
# cells above, and a closed cursor cannot execute statements.
conn.rollback()

### Read the data from our CSVs using pandas

In [4]:
# Load every raw source file from the relative "Data/" folder using pandas' default CSV settings.
country_info_data = pd.read_csv("Data/CountryInfo.csv")
development_index_data = pd.read_csv("Data/Development Index.csv")
# Source of the Event dimension columns (disaster type, start/end dates, deaths, ...).
emdat_data = pd.read_csv("Data/emdat_public_2022_03_14_query_uid-tJR2bL.csv")
hdi_data = pd.read_csv("Data/human-development-index-escosura.csv")
qol_index_data = pd.read_csv("Data/Quality Of Life Index.csv")
wb_education_data = pd.read_csv("Data/WorldBankEducationStatisticsAllIndicators_Data.csv")
# Filtered by 'Series Name' below to build the Population, Quality of Life and Health frames.
wb_hnps_data = pd.read_csv("Data/WorldBankHealthNutritionAndPopulationStatistics_Data.csv")
wb_poverty_and_equity_data = pd.read_csv("Data/WorldBankPovertyAndEquity_Data.csv")
# Supplies the electricity-access and part-time-employment series of the Quality of Life frame.
wb_world_development_indicators_data = pd.read_csv("Data/WorldBankWorldDevelopmentIndicators_Data.csv")

### Define our list of countries

In [14]:
# Countries covered by this notebook. convert_dataframe_structure compares these
# names against the 'Country Name' column, so the spelling must match the data.
countries = [
    "Canada",
    "United States",
    "Mexico",
    "Indonesia",
    "Angola",
    "Cambodia",
    "Thailand",
    "South Africa",
    "Zimbabwe"
]

### A function to convert the data frame structure

This function reshapes a DataFrame loaded from a CSV (one row per series, one column per year) into the layout we want to load into SQL: one row per year, one column per series, plus a `Country` column.

In [6]:
def convert_dataframe_structure(df, country_names=None):
    """Pivot a series-per-row frame into a year-per-row frame, country by country.

    ``df`` must contain a 'Country Name' column and a 'Series Name' column; every
    other column is treated as one observation period (e.g. one year). For each
    requested country the matching rows are transposed so that the remaining
    column labels become the index and the series names become the columns, and
    a constant 'Country' column is appended.

    Args:
        df: Source frame with 'Country Name', 'Series Name' and period columns.
        country_names: Iterable of country names to extract, in output order.
            Defaults to the module-level ``countries`` list.

    Returns:
        The per-country frames stacked with ``pd.concat``. The index holds the
        period labels (repeated once per country) and is therefore not unique.

    Raises:
        KeyError: If 'Country Name' or 'Series Name' is missing from ``df``.
        ValueError: If ``country_names`` is empty (nothing to concatenate).
    """
    if country_names is None:
        country_names = countries

    frames = []
    for country in country_names:
        country_rows = df.loc[df['Country Name'] == country]

        reshaped = (
            country_rows
            .drop(columns=['Country Name'])
            # Select the header column by name rather than by position.
            .set_index('Series Name')
            # Keep cells exactly as loaded (no numeric coercion on transpose).
            .astype(object)
            .T
            # assign() builds a new frame, so no write lands on a slice.
            .assign(Country=country)
        )

        frames.append(reshaped)

    return pd.concat(frames)

### Populate the Event DataFrame

In [125]:
# Keep only the EM-DAT columns needed for the Event dimension. The explicit
# .copy() makes this an independent frame, so the assignment below neither
# writes through to `emdat_data` nor triggers a SettingWithCopyWarning.
event_dimension_df = emdat_data[[
    "Disaster Subgroup",
    "Event Name",
    "Country",
    "Disaster Type", 
    "Start Year", 
    "Start Month", 
    "Start Day", 
    "End Year", 
    "End Month", 
    "End Day", 
    "Total Deaths",
    "No Injured",
    "No Affected"]].copy()

# Normalize the country label so it matches the name used in `countries`.
event_dimension_df['Country'] = event_dimension_df['Country'].replace(['United States of America (the)'], 'United States')

event_dimension_df.head()

Unnamed: 0,Disaster Subgroup,Event Name,Country,Disaster Type,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,No Injured,No Affected
0,Hydrological,,Angola,Flood,2005,1.0,27.0,2005,3.0,28.0,,,
1,Technological,,Angola,Transport accident,2005,2.0,8.0,2005,2.0,8.0,20.0,70.0,
2,Geophysical,,Indonesia,Earthquake,2005,3.0,28.0,2005,3.0,28.0,915.0,1146.0,104167.0
3,Geophysical,,Indonesia,Earthquake,2005,1.0,23.0,2005,1.0,23.0,1.0,4.0,680.0
4,Hydrological,,Indonesia,Landslide,2005,2.0,21.0,2005,2.0,21.0,143.0,,


### Handle data quality issues for Event data frame

The missing values here follow no pattern, so we mark them as Null and ignore them

In [126]:
# Replace every missing cell with the literal string "Null"; all other cells
# are kept as they are.
event_dimension_df = event_dimension_df.mask(event_dimension_df.isnull(), "Null")

event_dimension_df.head()

Unnamed: 0,Disaster Subgroup,Event Name,Country,Disaster Type,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,No Injured,No Affected
0,Hydrological,Null,Angola,Flood,2005,1.0,27.0,2005,3.0,28.0,Null,Null,Null
1,Technological,Null,Angola,Transport accident,2005,2.0,8.0,2005,2.0,8.0,20.0,70.0,Null
2,Geophysical,Null,Indonesia,Earthquake,2005,3.0,28.0,2005,3.0,28.0,915.0,1146.0,104167.0
3,Geophysical,Null,Indonesia,Earthquake,2005,1.0,23.0,2005,1.0,23.0,1.0,4.0,680.0
4,Hydrological,Null,Indonesia,Landslide,2005,2.0,21.0,2005,2.0,21.0,143.0,Null,Null


### Populate the Population DataFrame

In [23]:
# World Bank series that feed the Population dimension.
population_series_names = [
    "Life expectancy at birth, female (years)",
    "Life expectancy at birth, male (years)",
    "Life expectancy at birth, total (years)",
    "Net migration",
    "Population ages 00-14, total",
    "Population ages 15-19, female",
    "Population ages 15-19, male",
    "Population ages 20-24, female",
    "Population ages 20-24, male",
    "Population ages 25-29, female",
    "Population ages 25-29, male",
    "Population ages 30-34, female",
    "Population ages 30-34, male",
    "Population ages 35-39, female",
    "Population ages 35-39, male",
    "Population ages 40-44, female",
    "Population ages 40-44, male",
    "Population ages 45-49, female",
    "Population ages 45-49, male",
    "Population ages 50-54, female",
    "Population ages 50-54, male",
    "Population ages 55-59, female",
    "Population ages 55-59, male",
    "Population ages 60-64, female",
    "Population ages 60-64, male",
    "Population ages 65 and above, female",
    "Population ages 65 and above, male",
    "Rural population (% of total population)",
    "Rural population growth (annual %)",
    "Rural poverty headcount ratio at national poverty lines (% of rural population)",
    "Urban population (% of total population)",
    "Urban population growth (annual %)",
    "Urban poverty headcount ratio at national poverty lines (% of urban population)",
]

# Age bands that are only published split by sex; a combined total is derived below.
age_bands = [
    "15-19",
    "20-24",
    "25-29",
    "30-34",
    "35-39",
    "40-44",
    "45-49",
    "50-54",
    "55-59",
    "60-64",
    "65 and above",
]

wb_hnps_population_data = wb_hnps_data.loc[
    wb_hnps_data['Series Name'].isin(population_series_names)
].drop(columns=['Series Code', 'Country Code'])

population_dimension_df = convert_dataframe_structure(wb_hnps_population_data)

# Combine the male and female populations to get total populations for each age group.
# Each total is appended as the last column, in age-band order.
for age_band in age_bands:
    female_column = f'Population ages {age_band}, female'
    male_column = f'Population ages {age_band}, male'
    population_dimension_df[f'Population ages {age_band}, total'] = (
        pd.to_numeric(population_dimension_df[female_column])
        + pd.to_numeric(population_dimension_df[male_column])
    )

# The per-sex columns are no longer needed once the totals exist.
gendered_columns = [
    f'Population ages {age_band}, {sex}'
    for age_band in age_bands
    for sex in ('female', 'male')
]
population_dimension_df = population_dimension_df.drop(columns=gendered_columns)

# The index holds the year labels (repeated per country): keep them as a
# column, then replace the index with a plain 0..n-1 range.
population_dimension_df["Year"] = population_dimension_df.index
population_dimension_df = population_dimension_df.reset_index(drop=True)

population_dimension_df.head()



Series Name,"Life expectancy at birth, total (years)","Life expectancy at birth, female (years)","Life expectancy at birth, male (years)",Net migration,"Population ages 00-14, total",Rural population (% of total population),Rural population growth (annual %),Rural poverty headcount ratio at national poverty lines (% of rural population),Urban poverty headcount ratio at national poverty lines (% of urban population),Urban population (% of total population),...,"Population ages 25-29, total","Population ages 30-34, total","Population ages 35-39, total","Population ages 40-44, total","Population ages 45-49, total","Population ages 50-54, total","Population ages 55-59, total","Population ages 60-64, total","Population ages 65 and above, total",Year
0,80.1926829268293,82.6,77.9,..,5699388,19.878,0.557844254173616,..,..,80.122,...,2162471,2182321,2355672,2746155,2616579,2312163,2005420,1533102,4229595,2005 [YR2005]
1,80.3439024390244,82.7,78.1,..,5667703,19.787,0.551491248878675,..,..,80.213,...,2201714,2190448,2307589,2705963,2664884,2366995,2069935,1609828,4321770,2006 [YR2006]
2,80.5439024390244,82.9,78.3,1326431,5635757,19.604,0.0419781487099549,..,..,80.396,...,2246798,2204519,2279262,2634544,2712998,2422688,2122127,1695397,4419171,2007 [YR2007]
3,80.6951219512195,83.0,78.5,..,5616013,19.422,0.150191983656897,..,..,80.578,...,2298129,2227389,2271998,2548658,2757265,2481678,2169817,1786582,4532190,2008 [YR2008]
4,80.9951219512195,83.3,78.8,..,5608093,19.242,0.210657043137183,..,..,80.758,...,2350113,2258947,2277249,2469160,2782789,2541508,2219902,1874437,4663591,2009 [YR2009]


### Handle data quality issues for Population data frame

The only gaps in the life expectancy series are the 2020 values, for which no data exists.
We reuse the 2019 values for 2020, since life expectancy changes little from year to year.


Net migration is only reported every 5 years for each country.
For the other years we take the value from the closest reported year (e.g. 2009 takes the 2007 value and 2010 takes the 2012 value).

The rural and urban poverty headcount ratios at national poverty lines are completely empty, so we set them to Null so that we know to ignore them.

In [24]:
# handle life expectancy

def _copy_year_values(frame, source_year, columns, offsets):
    """Copy values from each ``source_year`` row into neighbouring rows, in place.

    For every row of ``frame`` whose 'Year' equals ``source_year``, the value of
    each column in ``columns`` is written to the rows whose index label is the
    source label plus each entry of ``offsets``. A target is skipped when it
    does not exist or belongs to a different 'Country', so a fill can never
    append a new row or spill into another country's block.

    Assumes integer index labels with each country's years stored in
    consecutive rows.
    """
    source_rows = frame[frame['Year'] == source_year]
    for label, row in source_rows.iterrows():
        for offset in offsets:
            target = label + offset
            if target not in frame.index:
                continue
            if frame.at[target, 'Country'] != row['Country']:
                continue
            for column in columns:
                frame.loc[target, column] = row[column]


# handle life expectancy: reuse the 2019 values for the following (2020) row

_copy_year_values(
    population_dimension_df,
    '2019 [YR2019]',
    [
        'Life expectancy at birth, total (years)',
        'Life expectancy at birth, female (years)',
        'Life expectancy at birth, male (years)',
    ],
    [1],
)

# handle Net migration: spread each reported year over its neighbouring years

_copy_year_values(population_dimension_df, '2007 [YR2007]', ['Net migration'], [-2, -1, 1, 2])
_copy_year_values(population_dimension_df, '2012 [YR2012]', ['Net migration'], [-2, -1, 1, 2])
# 2017 is the last source year used here, so it also covers the row three years later.
_copy_year_values(population_dimension_df, '2017 [YR2017]', ['Net migration'], [-2, -1, 1, 2, 3])

# Handle Rural and urban poverty headcount ratio at national poverty lines
# (and any other cell still holding the ".." placeholder)

population_dimension_df = population_dimension_df.applymap(lambda x: "Null" if x==".." else x)


population_dimension_df.head()

Series Name,"Life expectancy at birth, total (years)","Life expectancy at birth, female (years)","Life expectancy at birth, male (years)",Net migration,"Population ages 00-14, total",Rural population (% of total population),Rural population growth (annual %),Rural poverty headcount ratio at national poverty lines (% of rural population),Urban poverty headcount ratio at national poverty lines (% of urban population),Urban population (% of total population),...,"Population ages 25-29, total","Population ages 30-34, total","Population ages 35-39, total","Population ages 40-44, total","Population ages 45-49, total","Population ages 50-54, total","Population ages 55-59, total","Population ages 60-64, total","Population ages 65 and above, total",Year
0,80.1926829268293,82.6,77.9,1326431,5699388,19.878,0.557844254173616,Null,Null,80.122,...,2162471,2182321,2355672,2746155,2616579,2312163,2005420,1533102,4229595,2005 [YR2005]
1,80.3439024390244,82.7,78.1,1326431,5667703,19.787,0.551491248878675,Null,Null,80.213,...,2201714,2190448,2307589,2705963,2664884,2366995,2069935,1609828,4321770,2006 [YR2006]
2,80.5439024390244,82.9,78.3,1326431,5635757,19.604,0.0419781487099549,Null,Null,80.396,...,2246798,2204519,2279262,2634544,2712998,2422688,2122127,1695397,4419171,2007 [YR2007]
3,80.6951219512195,83.0,78.5,1326431,5616013,19.422,0.150191983656897,Null,Null,80.578,...,2298129,2227389,2271998,2548658,2757265,2481678,2169817,1786582,4532190,2008 [YR2008]
4,80.9951219512195,83.3,78.8,1326431,5608093,19.242,0.210657043137183,Null,Null,80.758,...,2350113,2258947,2277249,2469160,2782789,2541508,2219902,1874437,4663591,2009 [YR2009]


### Populate the Quality of Life DataFrame

In [27]:
# Series taken from the Health, Nutrition and Population statistics.
hnps_qol_series = [
    "People using at least basic drinking water services (% of population)",
    "People using at least basic sanitation services (% of population)",
    "People with basic handwashing facilities including soap and water (% of population)",
    "Unemployment, female (% of female labor force)",
    "Unemployment, male (% of male labor force)",
    "Unemployment, total (% of total labor force)",
    "Maternal leave benefits (% of wages paid in covered period)",
]

# Series taken from the World Development Indicators.
wdi_qol_series = [
    "Access to electricity (% of population)",
    "Access to electricity, urban (% of urban population)",
    "Access to electricity, rural (% of rural population)",
    "Part time employment, total (% of total employment)",
    "Part time employment, male (% of total male employment)",
    "Part time employment, female (% of total female employment)",
]

wb_hnps_qol_data = wb_hnps_data.loc[
    wb_hnps_data['Series Name'].isin(hnps_qol_series)
].drop(columns=['Series Code', 'Country Code'])

wb_world_development_indicators_qol_data = wb_world_development_indicators_data.loc[
    wb_world_development_indicators_data['Series Name'].isin(wdi_qol_series)
].drop(columns=['Series Code', 'Country Code'])

# Stack both sources, pivot to one row per country/year, then move the year
# labels from the index into a "Year" column.
quality_of_life_dimension_df = (
    convert_dataframe_structure(
        pd.concat([wb_hnps_qol_data, wb_world_development_indicators_qol_data])
    )
    .assign(Year=lambda frame: frame.index)
    .reset_index()
    .drop(columns=['index'])
)

quality_of_life_dimension_df.head()

Series Name,People using at least basic drinking water services (% of population),People using at least basic sanitation services (% of population),People with basic handwashing facilities including soap and water (% of population),"Unemployment, female (% of female labor force)","Unemployment, total (% of total labor force)","Unemployment, male (% of male labor force)",Maternal leave benefits (% of wages paid in covered period),"Part time employment, male (% of total male employment)","Part time employment, female (% of total female employment)","Part time employment, total (% of total employment)",Access to electricity (% of population),"Access to electricity, urban (% of urban population)","Access to electricity, rural (% of rural population)",Country,Year
0,99.23464422,99.80519484,..,6.46600008010864,6.76000022888184,7.01700019836426,..,28.8299999237061,48.8499984741211,38.2000007629395,100,100,100,Canada,2005 [YR2005]
1,99.23461957,99.75033789,..,6.09200000762939,6.32000017166138,6.52199983596802,..,30.3600006103516,50.1100006103516,39.6500015258789,100,100,100,Canada,2006 [YR2006]
2,99.23549716,99.69595411,..,5.65299987792969,6.03999996185303,6.38399982452393,..,28.8899993896484,48.689998626709,38.25,100,100,100,Canada,2007 [YR2007]
3,99.23607708,99.6414571,..,5.65199995040894,6.1399998664856,6.57200002670288,..,30.2299995422363,49.7200012207031,39.4500007629395,100,100,100,Canada,2008 [YR2008]
4,99.23635504,99.5868457,..,7.00500011444092,8.34000015258789,9.53299999237061,40,33.6500015258789,52.3300018310547,42.6100006103516,100,100,100,Canada,2009 [YR2009]


### Handle data quality issues for Quality of Life data frame

For "Access to electricity, rural (% of rural population)", the only gaps are the 2020 values, for which no data exists.
We reuse the 2019 values for 2020, since the data changes little from year to year.

For "People with basic handwashing facilities including soap and water (% of population)", "Maternal leave benefits (% of wages paid in covered period)", and "Part time employment" (male, female, and total), a lot of data is missing and the gaps follow no pattern, so we replace the missing values with Null so that we can ignore them.

In [28]:
# Access to electricity: total, urban and rural.
# The gaps are the missing 2020 values, so each one is filled from the same
# country's 2019 row, since the data doesn't change much from year to year.
# (Previously all three loops copied the total column only, so the urban and
# rural columns were never filled.)
electricity_columns = [
    'Access to electricity (% of population)',
    'Access to electricity, urban (% of urban population)',
    'Access to electricity, rural (% of rural population)',
]

rows_2019 = quality_of_life_dimension_df[quality_of_life_dimension_df['Year']=='2019 [YR2019]']

for index, row in rows_2019.iterrows():
    target = index + 1

    # Only touch an existing row that belongs to the same country.
    if target not in quality_of_life_dimension_df.index:
        continue
    if quality_of_life_dimension_df.at[target, 'Country'] != row['Country']:
        continue

    for column in electricity_columns:
        current = quality_of_life_dimension_df.at[target, column]
        # Fill gaps only; a real value in the target row is never overwritten.
        if pd.isna(current) or current == "..":
            quality_of_life_dimension_df.loc[target, column] = row[column]


# Handle People with basic handwashing facilities including soap and water (% of population) and
# Maternal leave benefits (% of wages paid in covered period) and
# Part time employment, male, female, and total (% of total male, female, and total employment):
# every remaining ".." placeholder becomes the string "Null".

quality_of_life_dimension_df = quality_of_life_dimension_df.applymap(lambda x: "Null" if x==".." else x)

quality_of_life_dimension_df.head()

Series Name,People using at least basic drinking water services (% of population),People using at least basic sanitation services (% of population),People with basic handwashing facilities including soap and water (% of population),"Unemployment, female (% of female labor force)","Unemployment, total (% of total labor force)","Unemployment, male (% of male labor force)",Maternal leave benefits (% of wages paid in covered period),"Part time employment, male (% of total male employment)","Part time employment, female (% of total female employment)","Part time employment, total (% of total employment)",Access to electricity (% of population),"Access to electricity, urban (% of urban population)","Access to electricity, rural (% of rural population)",Country,Year
0,99.23464422,99.80519484,Null,6.46600008010864,6.76000022888184,7.01700019836426,Null,28.8299999237061,48.8499984741211,38.2000007629395,100,100,100,Canada,2005 [YR2005]
1,99.23461957,99.75033789,Null,6.09200000762939,6.32000017166138,6.52199983596802,Null,30.3600006103516,50.1100006103516,39.6500015258789,100,100,100,Canada,2006 [YR2006]
2,99.23549716,99.69595411,Null,5.65299987792969,6.03999996185303,6.38399982452393,Null,28.8899993896484,48.689998626709,38.25,100,100,100,Canada,2007 [YR2007]
3,99.23607708,99.6414571,Null,5.65199995040894,6.1399998664856,6.57200002670288,Null,30.2299995422363,49.7200012207031,39.4500007629395,100,100,100,Canada,2008 [YR2008]
4,99.23635504,99.5868457,Null,7.00500011444092,8.34000015258789,9.53299999237061,40,33.6500015258789,52.3300018310547,42.6100006103516,100,100,100,Canada,2009 [YR2009]


### Populate the Health DataFrame

In [29]:
# World Bank series that feed the Health dimension.
# NOTE(review): the Health table defines Num_Death_Elderly, but the only
# age-specific death series selected here is "Number of deaths ages 20-24 years"
# — confirm this is the intended series.
health_series_names = [
    "Domestic general government health expenditure (% of GDP)",
    "Hospital beds (per 1,000 people)",
    "Immunization, HepB3 (% of one-year-old children)",
    "Immunization, DPT (% of children ages 12-23 months)",
    "Immunization, measles (% of children ages 12-23 months)",
    "Immunization, Pol3 (% of one-year-old children)",
    "Number of surgical procedures (per 100,000 population)",
    "Number of infant deaths",
    "Number of stillbirths",
    "Number of deaths ages 20-24 years",
    "Nurses and midwives (per 1,000 people)",
    "Physicians (per 1,000 people)",
    "Prevalence of overweight (% of adults)",
    "Diabetes prevalence (% of population ages 20 to 79)",
    "Prevalence of HIV, total (% of population ages 15-49)",
    "Adults (ages 15+) living with HIV",
    "Adults (ages 15-49) newly infected with HIV",
    "Children (0-14) living with HIV",
    "Children (ages 0-14) newly infected with HIV",
]

wb_hnps_health_data = wb_hnps_data.loc[
    wb_hnps_data['Series Name'].isin(health_series_names)
].drop(columns=['Series Code', 'Country Code'])

# Pivot to one row per country/year, then move the year labels from the index
# into a "Year" column.
health_dimension_df = (
    convert_dataframe_structure(wb_hnps_health_data)
    .assign(Year=lambda frame: frame.index)
    .reset_index()
    .drop(columns=['index'])
)

health_dimension_df.head()

Series Name,Domestic general government health expenditure (% of GDP),"Hospital beds (per 1,000 people)","Immunization, Pol3 (% of one-year-old children)","Immunization, measles (% of children ages 12-23 months)","Immunization, HepB3 (% of one-year-old children)","Immunization, DPT (% of children ages 12-23 months)","Number of surgical procedures (per 100,000 population)",Number of infant deaths,Number of stillbirths,Number of deaths ages 20-24 years,...,"Nurses and midwives (per 1,000 people)",Prevalence of overweight (% of adults),Diabetes prevalence (% of population ages 20 to 79),"Prevalence of HIV, total (% of population ages 15-49)",Adults (ages 15+) living with HIV,Adults (ages 15-49) newly infected with HIV,Children (0-14) living with HIV,Children (ages 0-14) newly infected with HIV,Country,Year
0,6.59905815,3.1,93,94,14,93,..,1800,1037,1307,...,9.9467,58.6,..,..,..,..,..,..,Canada,2005 [YR2005]
1,6.4866991,3.02,95,93,14,95,..,1825,1083,1311,...,10.0236,59.2,..,..,..,..,..,..,Canada,2006 [YR2006]
2,6.57271051,2.96,99,94,14,94,..,1845,1129,1307,...,10.1208,59.7,..,..,..,..,..,..,Canada,2007 [YR2007]
3,6.70117044,2.85,96,93,28,92,..,1860,1138,1298,...,10.289,60.2,..,..,..,..,..,..,Canada,2008 [YR2008]
4,7.52118206,2.8,93,91,42,91,..,1869,1126,1288,...,10.3743,60.7,..,..,..,..,..,..,Canada,2009 [YR2009]


### Handle data quality issues for Health data frame
Immunization, Pol3 (% of one-year-old children), Immunization, measles (% of children ages 12-23 months), Immunization, HepB3 (% of one-year-old children), and Immunization, DPT (% of children ages 12-23 months) all had issues because data was missing for the year 2020. For these we replaced the empty 2020 data with the data from 2019, since the data did not change much from year to year and this should not affect our analysis.

Everything else has missing data with no pattern to which values are missing, so we will replace those missing values with null.


In [30]:
# The immunization series have no data for 2020, so each country's 2020 value
# is replaced with that country's 2019 value.
immunization_columns = [
    'Immunization, Pol3 (% of one-year-old children)',
    'Immunization, measles (% of children ages 12-23 months)',
    'Immunization, HepB3 (% of one-year-old children)',
    'Immunization, DPT (% of children ages 12-23 months)',
]

is_2019 = health_dimension_df['Year'] == '2019 [YR2019]'
is_2020 = health_dimension_df['Year'] == '2020 [YR2020]'

for _, row_2019 in health_dimension_df[is_2019].iterrows():
    # Target the 2020 row of the same country explicitly instead of assuming it
    # sits at index + 1; that assumption could overwrite another row or
    # silently create a new one.
    target_rows = is_2020 & (health_dimension_df['Country'] == row_2019['Country'])
    for column in immunization_columns:
        health_dimension_df.loc[target_rows, column] = row_2019[column]

# Everything else: swap the ".." missing-value placeholder for "Null".

health_dimension_df = health_dimension_df.applymap(lambda x: "Null" if x==".." else x)

health_dimension_df.head()

Series Name,Domestic general government health expenditure (% of GDP),"Hospital beds (per 1,000 people)","Immunization, Pol3 (% of one-year-old children)","Immunization, measles (% of children ages 12-23 months)","Immunization, HepB3 (% of one-year-old children)","Immunization, DPT (% of children ages 12-23 months)","Number of surgical procedures (per 100,000 population)",Number of infant deaths,Number of stillbirths,Number of deaths ages 20-24 years,...,"Nurses and midwives (per 1,000 people)",Prevalence of overweight (% of adults),Diabetes prevalence (% of population ages 20 to 79),"Prevalence of HIV, total (% of population ages 15-49)",Adults (ages 15+) living with HIV,Adults (ages 15-49) newly infected with HIV,Children (0-14) living with HIV,Children (ages 0-14) newly infected with HIV,Country,Year
0,6.59905815,3.1,93,94,14,93,Null,1800,1037,1307,...,9.9467,58.6,Null,Null,Null,Null,Null,Null,Canada,2005 [YR2005]
1,6.4866991,3.02,95,93,14,95,Null,1825,1083,1311,...,10.0236,59.2,Null,Null,Null,Null,Null,Null,Canada,2006 [YR2006]
2,6.57271051,2.96,99,94,14,94,Null,1845,1129,1307,...,10.1208,59.7,Null,Null,Null,Null,Null,Null,Canada,2007 [YR2007]
3,6.70117044,2.85,96,93,28,92,Null,1860,1138,1298,...,10.289,60.2,Null,Null,Null,Null,Null,Null,Canada,2008 [YR2008]
4,7.52118206,2.8,93,91,42,91,Null,1869,1126,1288,...,10.3743,60.7,Null,Null,Null,Null,Null,Null,Canada,2009 [YR2009]


### Populate the Education DataFrame

In [31]:
# Education series taken from the HNPS data set.
hnps_education_series = [
    "Literacy rate, adult total (% of people ages 15 and above)",
    "Literacy rate, adult male (% of males ages 15 and above)",
    "Literacy rate, adult female (% of females ages 15 and above)",
    "School enrollment, primary (% gross)",
    "School enrollment, secondary (% gross)",
    "School enrollment, tertiary (% gross)",
    "Public spending on education, total (% of GDP)",
]
wb_hnps_education_data = wb_hnps_data.loc[
    wb_hnps_data['Series Name'].isin(hnps_education_series)
].drop(columns=['Series Code', 'Country Code'])

# Education series taken from the education data set, whose series column is
# called "Series"; it is renamed so both sources can be concatenated.
education_series = [
    "Population of compulsory school age, both sexes (number)",
    "Population of the official entrance age to primary education, both sexes (number)",
    "Population of the official entrance age to secondary general education, both sexes (number)",
    "Teachers in primary education, both sexes (number)",
    "Teachers in secondary education, both sexes (number)",
]
wb_education_data_ = (
    wb_education_data.loc[wb_education_data['Series'].isin(education_series)]
    .drop(columns=['Series Code', 'Country Code'])
    .rename(columns={"Series" : "Series Name"})
)

# Combine both sources, reshape, then expose the year labels (held in the index)
# as a regular "Year" column.
combined_education_data = pd.concat([wb_hnps_education_data, wb_education_data_])
education_dimension_df = convert_dataframe_structure(combined_education_data)
education_dimension_df["Year"] = education_dimension_df.index
education_dimension_df = education_dimension_df.reset_index().drop(columns=['index'])

education_dimension_df.head()

Series Name,"Literacy rate, adult total (% of people ages 15 and above)","Literacy rate, adult male (% of males ages 15 and above)","Literacy rate, adult female (% of females ages 15 and above)","School enrollment, tertiary (% gross)","School enrollment, secondary (% gross)","School enrollment, primary (% gross)","Public spending on education, total (% of GDP)","Population of compulsory school age, both sexes (number)","Population of the official entrance age to primary education, both sexes (number)","Population of the official entrance age to secondary general education, both sexes (number)","Teachers in primary education, both sexes (number)","Teachers in secondary education, both sexes (number)",Country,Year
0,..,..,..,..,101.391189575195,97.3645706176758,4.76588010787964,4092573,368975,426980,..,..,Canada,2005 [YR2005]
1,..,..,..,..,101.728248596191,98.8806304931641,..,4054588,361128,427609,..,..,Canada,2006 [YR2006]
2,..,..,..,63.6000595092773,101.90348815918,98.9664764404297,4.7664098739624,4012842,356032,421977,..,..,Canada,2007 [YR2007]
3,..,..,..,63.7673988342285,102.344436645508,98.013313293457,4.62612009048462,3961374,353599,412521,..,..,Canada,2008 [YR2008]
4,..,..,..,63.0657691955566,102.734443664551,99.0087966918945,4.84057998657227,3905068,353547,403277,..,..,Canada,2009 [YR2009]


### Handle data quality issues for Education data frame

A lot of data is missing here, with no pattern to which values are missing, so we will replace the missing values with null so that we can ignore them.

In [32]:
# Swap the ".." missing-value placeholder for "Null" in every cell of the frame.
education_dimension_df = education_dimension_df.applymap(
    lambda cell: cell if cell != ".." else "Null"
)

education_dimension_df.head()

Series Name,"Literacy rate, adult total (% of people ages 15 and above)","Literacy rate, adult male (% of males ages 15 and above)","Literacy rate, adult female (% of females ages 15 and above)","School enrollment, tertiary (% gross)","School enrollment, secondary (% gross)","School enrollment, primary (% gross)","Public spending on education, total (% of GDP)","Population of compulsory school age, both sexes (number)","Population of the official entrance age to primary education, both sexes (number)","Population of the official entrance age to secondary general education, both sexes (number)","Teachers in primary education, both sexes (number)","Teachers in secondary education, both sexes (number)",Country,Year
0,Null,Null,Null,Null,101.391189575195,97.3645706176758,4.76588010787964,4092573,368975,426980,Null,Null,Canada,2005 [YR2005]
1,Null,Null,Null,Null,101.728248596191,98.8806304931641,Null,4054588,361128,427609,Null,Null,Canada,2006 [YR2006]
2,Null,Null,Null,63.6000595092773,101.90348815918,98.9664764404297,4.7664098739624,4012842,356032,421977,Null,Null,Canada,2007 [YR2007]
3,Null,Null,Null,63.7673988342285,102.344436645508,98.013313293457,4.62612009048462,3961374,353599,412521,Null,Null,Canada,2008 [YR2008]
4,Null,Null,Null,63.0657691955566,102.734443664551,99.0087966918945,4.84057998657227,3905068,353547,403277,Null,Null,Canada,2009 [YR2009]


### Populate the Month Dimension

There will be no need to worry about data quality issues for this dimension

In [122]:
# Quarter of the year that each month falls in.
quarters = {
    'January' : 1,
    'February' : 1,
    'March' : 1,
    'April' : 2,
    'May' : 2,
    'June' : 2,
    'July' : 3,
    'August' : 3,
    'September' : 3,
    'October' : 4,
    'November' : 4,
    'December' : 4
}

# Month names in calendar order; a month's 1-based position is its month number.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

month_dimension_columns = ["Name", "Month", "Quarter", "Year", "Decade"]

# One record per (year, month) for every year from 2005 to 2020 inclusive.
# All records are collected first and the DataFrame is built once: growing a
# frame row by row with DataFrame.append is quadratic and the method no longer
# exists in pandas 2.x. This also avoids shadowing the builtin `dict`.
month_dimension_rows = [
    {
        "Name": month,
        "Month": month_number,
        "Quarter": quarters[month],
        "Year": year,
        # 1-based decade count, e.g. 2005 -> 201 and 2020 -> 203.
        "Decade": year // 10 + 1,
    }
    for year in range(2005, 2021)
    for month_number, month in enumerate(months, start=1)
]

month_dimension_df = pd.DataFrame(month_dimension_rows, columns=month_dimension_columns)

month_dimension_df.head()


Unnamed: 0,Name,Month,Quarter,Year,Decade
0,January,1,1,2005,201
1,February,2,1,2005,201
2,March,3,1,2005,201
3,April,4,2,2005,201
4,May,5,2,2005,201


### Populate the Country Dimension

In [34]:
# Total population comes from the poverty-and-equity data set.
wb_poverty_and_equity_country_data = wb_poverty_and_equity_data.loc[
    (wb_poverty_and_equity_data['Series Name'] == "Population, total")
].drop(columns=['Series Code', 'Country Code']).rename(columns={"Series" : "Series Name"})

# The remaining country indicators come from the HNPS data set.
hnps_country_series = [
    "Birth rate, crude (per 1,000 people)",
    "GNI per capita, Atlas method (current US$)",
    "Life expectancy at birth, total (years)",
    "Labor force, total",
    "Human capital index (HCI) (scale 0-1)",
    "Population growth (annual %)",
]
wb_hnps_country_data = wb_hnps_data.loc[
    wb_hnps_data['Series Name'].isin(hnps_country_series)
].drop(columns=['Series Code', 'Country Code'])

country_dimension_temp_df = convert_dataframe_structure(pd.concat([wb_poverty_and_equity_country_data, wb_hnps_country_data]))

# Add the extra country information from the countryinfo data.
# Build a name -> attributes lookup once (first occurrence wins) instead of
# rescanning country_info_data for every row.
country_info_columns = ['Region', 'Continent', 'Currency', 'Capital']
country_info_by_name = {}
for _, info_row in country_info_data.iterrows():
    country_info_by_name.setdefault(
        info_row['Name'],
        {column: info_row[column] for column in country_info_columns},
    )

# Collect every row first and build the frame once; appending to a DataFrame
# inside a loop is quadratic and DataFrame.append no longer exists in pandas 2.x.
country_dimension_rows = []
for year_label, row in country_dimension_temp_df.iterrows():
    country_info = country_info_by_name.get(row['Country'])
    if country_info is None:
        # Rows whose country has no countryinfo entry are left out.
        continue

    data = row.to_dict()
    # The year label stays in the 'Series Name' column: the fact-table cell
    # looks it up under that name. The earlier rename to "Year" discarded its
    # result and never took effect, so it has been removed.
    data['Series Name'] = year_label
    data.update(country_info)
    country_dimension_rows.append(data)

country_dimension_df = pd.DataFrame(country_dimension_rows)

country_dimension_df.head()


Unnamed: 0,"Population, total","Life expectancy at birth, total (years)",Human capital index (HCI) (scale 0-1),"Labor force, total","GNI per capita, Atlas method (current US$)","Birth rate, crude (per 1,000 people)",Population growth (annual %),Country,Series Name,Region,Continent,Currency,Capital
0,32243753,80.1926829268293,..,20788261,34810,10.6,0.944466927450734,Canada,2005 [YR2005],North America,North America,Canadian Dollar,Ottawa
1,32571174,80.3439024390244,..,20492683,38510,10.9,1.01033450270045,Canada,2006 [YR2006],North America,North America,Canadian Dollar,Ottawa
2,32889025,80.5439024390244,..,20396697,43090,11.2,0.971135141368055,Canada,2007 [YR2007],North America,North America,Canadian Dollar,Ottawa
3,33247118,80.6951219512195,..,20218270,45650,11.3,1.08290711607014,Canada,2008 [YR2008],North America,North America,Canadian Dollar,Ottawa
4,33628895,80.9951219512195,..,19955331,43230,11.3,1.14175809912162,Canada,2009 [YR2009],North America,North America,Canadian Dollar,Ottawa


### Handle data quality issues for Country data frame

A lot of data is missing here, with no pattern to which values are missing, so we will replace the missing values with null so that we can ignore them.

In [50]:
# Swap the ".." missing-value placeholder for "Null" in every cell of the frame.
country_dimension_df = country_dimension_df.applymap(
    lambda cell: cell if cell != ".." else "Null"
)

country_dimension_df.head()

Unnamed: 0,"Population, total","Life expectancy at birth, total (years)",Human capital index (HCI) (scale 0-1),"Labor force, total","GNI per capita, Atlas method (current US$)","Birth rate, crude (per 1,000 people)",Population growth (annual %),Country,Series Name,Region,Continent,Currency,Capital
0,32243753,80.1926829268293,Null,20788261,34810,10.6,0.944466927450734,Canada,2005 [YR2005],North America,North America,Canadian Dollar,Ottawa
1,32571174,80.3439024390244,Null,20492683,38510,10.9,1.01033450270045,Canada,2006 [YR2006],North America,North America,Canadian Dollar,Ottawa
2,32889025,80.5439024390244,Null,20396697,43090,11.2,0.971135141368055,Canada,2007 [YR2007],North America,North America,Canadian Dollar,Ottawa
3,33247118,80.6951219512195,Null,20218270,45650,11.3,1.08290711607014,Canada,2008 [YR2008],North America,North America,Canadian Dollar,Ottawa
4,33628895,80.9951219512195,Null,19955331,43230,11.3,1.14175809912162,Canada,2009 [YR2009],North America,North America,Canadian Dollar,Ottawa


### Now we build our fact table Data Frame


In [138]:
fact_table_columns = [
    "Month_Key",
    "Country_Key",
    "Education_Key",
    "Population_Key",
    "Quality_of_Life_Key",
    "Health_Key",
    "Event_Key",
    "Quality_of_Life",
    "Development_Index",
    "Human_Development_Index"
]


def _dimension_key(dimension_df, country, year_label, year_column='Year'):
    """Return the index label of the first row matching a country and year.

    Parameters:
        dimension_df: dimension DataFrame with a 'Country' column.
        country: value to match in the 'Country' column.
        year_label: year label to match, e.g. "2005 [YR2005]".
        year_column: name of the column holding the year label.

    Raises:
        IndexError: if no row matches.
    """
    matches = dimension_df[
        (dimension_df['Country'] == country) & (dimension_df[year_column] == year_label)
    ]
    return matches.index[0]


# Rows are collected in a list and the frame is built once at the end; appending
# to a DataFrame inside the loop is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
fact_table_rows = []

# For each country
for country in countries:
    # For each month in the month dimension
    for i, r in month_dimension_df.iterrows():
        year = str(r.Year) + " [YR" + str(r.Year) + "]"

        data = {}
        data["Month_Key"] = i
        # The country dimension keeps its year label in the 'Series Name' column.
        data["Country_Key"] = _dimension_key(country_dimension_df, country, year, year_column='Series Name')
        data["Education_Key"] = _dimension_key(education_dimension_df, country, year)
        data["Population_Key"] = _dimension_key(population_dimension_df, country, year)
        data["Quality_of_Life_Key"] = _dimension_key(quality_of_life_dimension_df, country, year)
        data["Health_Key"] = _dimension_key(health_dimension_df, country, year)

        # In case there is a country/month without an event we use None instead.
        # The empty match is checked explicitly rather than with a bare `except`,
        # so genuine errors (e.g. a missing column) are not silently swallowed.
        event_matches = event_dimension_df[
            (event_dimension_df['Country']==country)
            & (event_dimension_df['Start Year']==int(r.Year))
            & (event_dimension_df['Start Month']==int(r.Month))
        ].index
        data["Event_Key"] = event_matches[0] if len(event_matches) > 0 else None

        # TODO: the measures are placeholders until their calculation is implemented.
        data["Quality_of_Life"] = 0
        data["Development_Index"] = 0
        data["Human_Development_Index"] = 0

        fact_table_rows.append(data)

# dtype=object keeps integer keys and a None Event_Key as-is instead of
# coercing that column to float/NaN.
fact_table_df = pd.DataFrame(fact_table_rows, columns=fact_table_columns, dtype=object)

print(len(month_dimension_df))


192
