### Cost of Living Index Data Extraction

In [None]:
# import modules
import pandas as pd
import numpy as np

# Import SQL Alchemy
from sqlalchemy import create_engine

# Import URI
#from config import database

In [None]:
# download the Numbeo cost-of-living ranking page for every year and keep one
# table per year, printing its row count so the yearly sizes can be compared
years = (2015, 2016, 2017, 2018, 2019)

all_COLI_data = []

for year in years:
    url = f"https://www.numbeo.com/cost-of-living/rankings_by_country.jsp?title={year}"
    print(url)
    COLI_data = pd.read_html(url)
    # of the tables parsed from the page, the one at index 2 is kept
    COLI_table = COLI_data[2]
    print(len(COLI_table))
    all_COLI_data.append(COLI_table)


### Cost of Living Index Data Transformation

In [None]:
# build one prepared dataframe per year from the scraped tables:
#   - "Year" records which yearly page the rows came from
#   - "Ranking" is the row's 1-based position within that year's table
#   - the scraped "Rank" column is dropped in favour of "Ranking"
# One loop over `years` replaces the five copy-pasted statements per step, so
# the year values come from a single place. assign() returns a new dataframe,
# so the scraped tables in all_COLI_data are left untouched.
COLI_frames = [
    table.assign(Year=year, Ranking=np.arange(1, len(table) + 1)).drop(columns=["Rank"])
    for year, table in zip(years, all_COLI_data)
]

# keep the per-year names used by the following cells; the unpacking also
# fails loudly if the number of scraped tables is not exactly five
(COLI_df_2015, COLI_df_2016, COLI_df_2017,
 COLI_df_2018, COLI_df_2019) = COLI_frames

In [None]:
# stack the five yearly dataframes into a single dataframe;
# join="inner" keeps only the columns shared by every year
COLI_yearly_frames = [
    COLI_df_2015,
    COLI_df_2016,
    COLI_df_2017,
    COLI_df_2018,
    COLI_df_2019,
]
COLI_result_df = pd.concat(COLI_yearly_frames, axis=0, join="inner")
COLI_result_df

In [None]:
# swap the scraped column headers for names without spaces
# (required for SQL database); the dataframe is modified in place
COLI_column_names = {
    "Cost of Living Index": "CostOfLiving_index",
    "Rent Index": "Rent_index",
    "Groceries Index": "Groceries_index",
    "Restaurant Price Index": "RestaurantPrice_index",
    "Local Purchasing Power Index": "LocalPurchPower_index",
}
COLI_result_df.rename(columns=COLI_column_names, inplace=True)


In [None]:
# drop redundant columns
# drop() returns a new dataframe rather than modifying the original, so the
# result must be assigned back -- the bare call only displayed a copy and left
# COLI_result_df unchanged. errors="ignore" keeps the cell safe to re-run
# after the column has already been removed.
COLI_result_df = COLI_result_df.drop(
    columns=["Cost of Living Plus Rent Index"], errors="ignore"
)
COLI_result_df

In [None]:
# keep only the relevant columns, in the order they should appear
COLI_columns = [
    "Country",
    "Year",
    "Ranking",
    "CostOfLiving_index",
    "Rent_index",
    "Groceries_index",
    "RestaurantPrice_index",
    "LocalPurchPower_index",
]
COLI_result_df = COLI_result_df[COLI_columns].copy()
COLI_result_df


In [None]:
# count the number of yearly records for each country
# (index = Country, single column "Year" holding the count)
country_df = COLI_result_df.groupby(["Country"])["Year"].count().to_frame()

# keep only the countries that have all 5 years of data; a boolean mask
# replaces the previous iterrows() loop, which dropped rows from the dataframe
# while iterating over it and copied the whole frame once per dropped row
country_df = country_df[country_df["Year"] == 5]

country_df

In [None]:
# sanity check: select any remaining country whose record count is not 5;
# the displayed dataframe should be empty
COLI_incomplete = country_df["Year"] != 5
check = country_df[COLI_incomplete]
check

In [None]:
# collect the names of the countries that have all 5 years of data
# (country_df is indexed by country)
countries = country_df.index.tolist()
countries


In [None]:
# filter dataframe for countries with 5 years of data
COLI_has_all_years = COLI_result_df["Country"].isin(countries)
COLI_df = COLI_result_df[COLI_has_all_years].copy()
COLI_df

In [None]:
# review simple stats as a reasonableness check (the "sniff" test):
# display the summary statistics of the filtered cost-of-living dataframe
COLI_df.describe()

In [None]:
# count the non-null values in every column for each year; the counts should
# be identical across all years
COLI_by_year = COLI_df.groupby(["Year"])
check_df = COLI_by_year.count()
check_df

In [None]:
# download the Numbeo property-investment ranking page for every year and keep
# one table per year, printing its row count so the yearly sizes can be compared
all_property_data = []

for year in years:
    url = f"https://www.numbeo.com/property-investment/rankings_by_country.jsp?title={year}"
    print(url)
    property_data = pd.read_html(url)
    # of the tables parsed from the page, the one at index 2 is kept
    property_table = property_data[2]
    print(len(property_table))
    all_property_data.append(property_table)


In [None]:
# build one prepared dataframe per year from the scraped property tables:
#   - "Year" records which yearly page the rows came from
#   - the scraped "Rank" column is dropped
# One loop over `years` replaces the five copy-pasted statements per step, so
# the year values come from a single place. assign() returns a new dataframe,
# so the scraped tables in all_property_data are left untouched.
property_frames = [
    table.assign(Year=year).drop(columns=["Rank"])
    for year, table in zip(years, all_property_data)
]

# keep the per-year names used by the following cells; the unpacking also
# fails loudly if the number of scraped tables is not exactly five
(property_df_2015, property_df_2016, property_df_2017,
 property_df_2018, property_df_2019) = property_frames


In [None]:
# stack the five yearly property dataframes into a single dataframe so that
# countries can be compared across years; join="inner" keeps only the columns
# shared by every year
property_yearly_frames = [
    property_df_2015,
    property_df_2016,
    property_df_2017,
    property_df_2018,
    property_df_2019,
]
property_result_df = pd.concat(property_yearly_frames, axis=0, join="inner")
property_result_df

In [None]:
# keep only the relevant columns, in the order they should appear
property_columns = [
    "Country",
    "Year",
    "Price To Income Ratio",
    "Mortgage As A Percentage Of Income",
    "Affordability Index",
]
property_result_df = property_result_df[property_columns].copy()
property_result_df

In [None]:
# swap the scraped column headers for names without spaces
# (required for SQL database); the dataframe is modified in place
# NOTE(review): "mortage_income_ratio" looks like a misspelling of "mortgage";
# confirm against the database schema before renaming it
property_column_names = {
    "Price To Income Ratio": "price_income_ratio",
    "Mortgage As A Percentage Of Income": "mortage_income_ratio",
    "Affordability Index": "affordability_index",
}
property_result_df.rename(columns=property_column_names, inplace=True)
property_result_df

In [None]:
# count the number of yearly records for each country
# (index = Country, single column "Year" holding the count)
pcountry_df = property_result_df.groupby(["Country"])["Year"].count().to_frame()

# keep only the countries that have all 5 years of data; a boolean mask
# replaces the previous iterrows() loop, which dropped rows from the dataframe
# while iterating over it and copied the whole frame once per dropped row
pcountry_df = pcountry_df[pcountry_df["Year"] == 5]

pcountry_df

In [None]:
# sanity check: select any remaining country whose record count is not 5;
# the displayed dataframe should be empty
property_incomplete = pcountry_df["Year"] != 5
check = pcountry_df[property_incomplete]
check

In [None]:
# collect the names of the countries that have all 5 years of property data
# (pcountry_df is indexed by country)
final_countries = pcountry_df.index.tolist()
final_countries

In [None]:
# filter property dataframe for countries with 5 years of data
property_has_all_years = property_result_df["Country"].isin(final_countries)
property_df = property_result_df[property_has_all_years].copy()
property_df

In [None]:
# filter COLI dataframe for countries with 5 years of data in both dataframes
# Previously only final_countries (complete property data) was checked, so a
# country with all 5 years of property data but fewer than 5 years of
# cost-of-living data was still kept. Both country lists are applied here.
COLI_complete = COLI_result_df["Country"].isin(countries)
property_complete = COLI_result_df["Country"].isin(final_countries)
COLI_df = COLI_result_df[COLI_complete & property_complete].copy()
COLI_df

In [None]:
# review simple stats as a reasonableness check (the "sniff" test):
# display the summary statistics of COLI_df after the re-filtering in the
# preceding cell
COLI_df.describe()

In [None]:
# review simple stats as a reasonableness check (the "sniff" test):
# display the summary statistics of the filtered property dataframe
property_df.describe()

In [None]:
# create a connection to a Postgresql database
#rds_connection_string = "<insert user name>:<insert password>@localhost:5432/customer_db"
#engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
#check for tables
#engine.table_names()

In [None]:
#use pandas to load csv converted DataFrame into database
#new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

In [None]:
#Confirm data has been added by querying the customer_name table (also checked in pgAdmin)
#pd.read_sql_query('select * from customer_name', con=engine).head()

In [None]:
# create csv files

#COLI_df.to_csv("COLI_raw.csv")
#property_df.to_csv("property_raw.csv")