In [1]:
# import modules
import pandas as pd
import numpy as np


In [2]:
# fetch the cost-of-living ranking page of every year and report how many rows each table has
years = (2015, 2016, 2017, 2018, 2019)

all_COLI_data = []

for year in years:
    url = f'https://www.numbeo.com/cost-of-living/rankings_by_country.jsp?title={year}'
    print(url)
    # read_html returns a list with the tables parsed from the page;
    # the notebook keeps the one stored at position 2
    COLI_data = pd.read_html(url)
    all_COLI_data.append(COLI_data[2])
    print(len(all_COLI_data[-1]))


https://www.numbeo.com/cost-of-living/rankings_by_country.jsp?title=2015
119
https://www.numbeo.com/cost-of-living/rankings_by_country.jsp?title=2016
122
https://www.numbeo.com/cost-of-living/rankings_by_country.jsp?title=2017
121
https://www.numbeo.com/cost-of-living/rankings_by_country.jsp?title=2018
115
https://www.numbeo.com/cost-of-living/rankings_by_country.jsp?title=2019
119


In [3]:
def _prepare_COLI_year(raw_table, year):
    """Return a copy of one scraped yearly table with ``Year`` and ``Ranking`` added.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        One yearly table taken from ``all_COLI_data``. It must contain a
        ``Rank`` column, which is removed from the result.
    year : int
        Value written to every row of the new ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame whose last two columns are ``Year`` and ``Ranking``;
        ``raw_table`` itself is not modified.
    """
    return (
        raw_table
        # pd.DataFrame(existing_frame) does not take a deep copy, so (depending on the
        # pandas version) column assignments could leak into the scraped tables;
        # an explicit copy keeps all_COLI_data untouched and the cell safe to re-run
        .copy()
        .assign(
            Year=int(year),
            # annual ranking = row position + 1
            Ranking=np.arange(1, len(raw_table) + 1),
        )
        # drop rank column
        .drop(columns=["Rank"])
    )


# create a dataframe for each year (year column added, ranking rebuilt, Rank dropped)
COLI_df_2015 = _prepare_COLI_year(all_COLI_data[0], 2015)
COLI_df_2016 = _prepare_COLI_year(all_COLI_data[1], 2016)
COLI_df_2017 = _prepare_COLI_year(all_COLI_data[2], 2017)
COLI_df_2018 = _prepare_COLI_year(all_COLI_data[3], 2018)
COLI_df_2019 = _prepare_COLI_year(all_COLI_data[4], 2019)

In [4]:
# stack the yearly tables on top of each other; join="inner" keeps only the columns
# shared by every year, and each year's own row labels are retained
yearly_COLI_frames = [COLI_df_2015, COLI_df_2016, COLI_df_2017, COLI_df_2018, COLI_df_2019]
COLI_result_df = pd.concat(yearly_COLI_frames, axis=0, join="inner")
COLI_result_df

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index,Year,Ranking
0,Switzerland,126.03,57.63,92.61,128.44,127.64,146.51,2015,1
1,Norway,118.59,48.70,84.43,110.06,141.45,96.30,2015,2
2,Venezuela,111.01,34.13,73.44,128.43,91.09,11.90,2015,3
3,Iceland,102.14,35.86,69.75,103.76,105.21,70.84,2015,4
4,Denmark,100.60,34.71,68.40,88.59,117.53,105.70,2015,5
...,...,...,...,...,...,...,...,...,...
114,Kosovo (Disputed Territory),26.18,8.05,17.48,23.21,19.57,61.82,2019,115
115,Venezuela,25.73,5.89,16.20,27.58,21.98,3.71,2019,116
116,India,24.17,5.84,15.37,24.20,17.15,66.91,2019,117
117,Tunisia,23.69,4.96,14.69,22.27,13.33,37.99,2019,118


In [5]:
# count, for every country, how many rows (one per year) it has in the stacked table
country_df = COLI_result_df.groupby("Country")["Year"].count().to_frame()
country_df


Unnamed: 0_level_0,Year
Country,Unnamed: 1_level_1
Albania,5
Algeria,5
Argentina,5
Armenia,5
Australia,5
...,...
Uzbekistan,3
Venezuela,5
Vietnam,5
Zambia,3


In [6]:
# keep only the countries that have a row in all 5 scraped years;
# a boolean mask replaces dropping rows from the frame while iterating over it
country_df = country_df[country_df["Year"] == 5]

country_df

Unnamed: 0_level_0,Year
Country,Unnamed: 1_level_1
Albania,5
Algeria,5
Argentina,5
Armenia,5
Australia,5
...,...
United States,5
Uruguay,5
Venezuela,5
Vietnam,5


In [7]:
# sanity check: this selection is empty when every remaining country has data for all 5 years
check = country_df[country_df["Year"].ne(5)]
check

Unnamed: 0_level_0,Year
Country,Unnamed: 1_level_1


In [8]:
# index labels of country_df (the country names) as a plain Python list
countries = country_df.index.tolist()
countries


['Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bolivia',
 'Bosnia And Herzegovina',
 'Brazil',
 'Bulgaria',
 'Cambodia',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Latvia',
 'Lebanon',
 'Lithuania',
 'Luxembourg',
 'Malaysia',
 'Malta',
 'Mauritius',
 'Mexico',
 'Moldova',
 'Montenegro',
 'Morocco',
 'Nepal',
 'Netherlands',
 'New Zealand',
 'Nigeria',
 'North Macedonia',
 'Norway',
 'Oman',
 'Pakistan',
 'Palestine',
 'Panama',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',

In [13]:
# keep only the rows whose country appears in the `countries` list
df = COLI_result_df.loc[COLI_result_df["Country"].isin(countries)]


Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index,Year,Ranking
0,Switzerland,126.03,57.63,92.61,128.44,127.64,146.51,2015,1
1,Norway,118.59,48.70,84.43,110.06,141.45,96.30,2015,2
2,Venezuela,111.01,34.13,73.44,128.43,91.09,11.90,2015,3
3,Iceland,102.14,35.86,69.75,103.76,105.21,70.84,2015,4
4,Denmark,100.60,34.71,68.40,88.59,117.53,105.70,2015,5
...,...,...,...,...,...,...,...,...,...
113,Egypt,26.46,5.01,16.16,23.19,22.26,23.67,2019,114
115,Venezuela,25.73,5.89,16.20,27.58,21.98,3.71,2019,116
116,India,24.17,5.84,15.37,24.20,17.15,66.91,2019,117
117,Tunisia,23.69,4.96,14.69,22.27,13.33,37.99,2019,118


In [14]:
# write the filtered table to disk (the row index is written as the first column)
df.to_csv(path_or_buf="df_raw.csv")

In [None]:
# concatenate dataframes for all years to find common countries across all years

#COLI_df = pd.merge(country_df, COLI_result_df, how = "left")  

#COLI_df = pd.concat([COLI_df_2015,COLI_df_2016, COLI_df_2017, COLI_df_2018, COLI_df_2019
#                    ], 
 #                   axis=0, 
  #                  join="inner"
   #                )

In [None]:
# rename columns to remove spacing
# COLI_df is built here from `df` (the rows restricted to `countries`): no executable
# cell assigns COLI_df - its only definitions are commented out - so this cell raised
# NameError on a fresh kernel.
# NOTE(review): confirm that `df` is the intended source frame.
COLI_df = df.rename(
    columns={
        "Cost of Living Index": "CostOfLiving_index",
        "Rent Index": "Rent_index",
        "Groceries Index": "Groceries_index",
        "Restaurant Price Index": "RestaurantPrice_index",
        "Local Purchasing Power Index": "LocalPurchPower_index",
    }
)

In [None]:
# keep only the columns of interest, listed in the order they should appear
COLI_df = COLI_df.loc[
    :,
    [
        "Country",
        "Year",
        "Ranking",
        "CostOfLiving_index",
        "Rent_index",
        "Groceries_index",
        "RestaurantPrice_index",
        "LocalPurchPower_index",
    ],
]

In [None]:
# display the cost-of-living dataframe as the cell output
COLI_df


In [None]:
# fetch the property-investment ranking page of every year and report how many rows each table has
all_property_data = []

for year in years:
    url = f'https://www.numbeo.com/property-investment/rankings_by_country.jsp?title={year}'
    print(url)
    # read_html returns a list with the tables parsed from the page;
    # the notebook keeps the one stored at position 2
    property_data = pd.read_html(url)
    all_property_data.append(property_data[2])
    print(len(all_property_data[-1]))


In [None]:
def _property_table_with_year(raw_table, year):
    """Return a copy of one scraped yearly table with a constant ``Year`` column appended.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        One yearly table taken from ``all_property_data``.
    year : int
        Value written to every row of the new ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw_table`` itself is not modified.
    """
    # pd.DataFrame(existing_frame) does not take a deep copy, so (depending on the
    # pandas version) the new column could leak into the scraped tables;
    # an explicit copy keeps all_property_data untouched and the cell safe to re-run
    return raw_table.copy().assign(Year=int(year))


# create a dataframe for each year, each with its year column
property_df_2015 = _property_table_with_year(all_property_data[0], 2015)
property_df_2016 = _property_table_with_year(all_property_data[1], 2016)
property_df_2017 = _property_table_with_year(all_property_data[2], 2017)
property_df_2018 = _property_table_with_year(all_property_data[3], 2018)
property_df_2019 = _property_table_with_year(all_property_data[4], 2019)


In [None]:
# Keep only the countries present in every yearly table, then place the yearly tables
# side by side (one group of columns per year), which is the layout the positional
# iloc slicing of property_result_df relies on.
#
# concat(axis=1, join="inner") aligns rows on the row index, and the yearly tables
# only carry a positional index, so without the country filter below it merely cut
# every year down to the length of the shortest table instead of matching countries.
property_frames_by_year = [
    property_df_2015,
    property_df_2016,
    property_df_2017,
    property_df_2018,
    property_df_2019,
]

# countries that occur in all of the yearly tables
common_property_countries = set.intersection(
    *(set(frame["Country"]) for frame in property_frames_by_year)
)

property_result_df = pd.concat(
    [
        # each year keeps its own row order; the fresh 0..n-1 index gives every
        # year the same row labels (assumes a country appears once per year)
        frame[frame["Country"].isin(common_property_countries)].reset_index(drop=True)
        for frame in property_frames_by_year
    ],
    axis=1,
    join="inner",
)
property_result_df

In [None]:
# split the side-by-side frame back into one frame per year: the column windows are
# 1:10, 11:20, 21:30, 31:40 and 41:50, i.e. a stride of 10 with the first column of
# every window skipped; the per-year names are rebound to the sliced frames
(
    property_df_2015,
    property_df_2016,
    property_df_2017,
    property_df_2018,
    property_df_2019,
) = (
    property_result_df.iloc[:, first + 1:first + 10]
    for first in range(0, 50, 10)
)


In [None]:
# stack the per-year frames vertically into one long table;
# join="inner" keeps only the columns shared by every year
sliced_property_frames = [
    property_df_2015,
    property_df_2016,
    property_df_2017,
    property_df_2018,
    property_df_2019,
]
property_df = pd.concat(sliced_property_frames, axis=0, join="inner")

In [None]:
# keep only the columns of interest, listed in the order they should appear
property_df = property_df.loc[
    :,
    [
        "Country",
        "Year",
        "Price To Income Ratio",
        "Mortgage As A Percentage Of Income",
        "Affordability Index",
    ],
]
property_df

In [None]:
 # rename columns to remove spacing
# the renamed frame is bound back to the same name
# NOTE(review): "mortage_income_ratio" looks like a misspelling of "mortgage"; it is kept
# as-is because the column name is runtime data - confirm before changing it
property_df = property_df.rename(
    columns={
        "Price To Income Ratio": "price_income_ratio",
        "Mortgage As A Percentage Of Income": "mortage_income_ratio",
        "Affordability Index": "affordability_index",
    }
)

In [None]:
# display the property dataframe as the cell output
property_df

In [None]:
# write the result table to result_raw.csv
# NOTE(review): `result_df` is not assigned in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel - confirm which frame was meant to be exported.
result_df.to_csv("result_raw.csv")
#property_df.to_csv("property_raw.csv")