In [1]:
# import modules
import os
import pandas as pd
import numpy as np

## Cost Of Living Index Data

In [2]:
# create a dataframe for each year
COLI_df_2015 = pd.read_csv('01_extract_coli/2015coli.csv')
COLI_df_2016 = pd.read_csv('01_extract_coli/2016coli.csv')
COLI_df_2017 = pd.read_csv('01_extract_coli/2017coli.csv')
COLI_df_2018 = pd.read_csv('01_extract_coli/2018coli.csv')
COLI_df_2019 = pd.read_csv('01_extract_coli/2019coli.csv')

# add a column for year and assigning year value
COLI_df_2015["year"] = int(2015)
COLI_df_2016["year"] = int(2016)
COLI_df_2017["year"] = int(2017)
COLI_df_2018["year"] = int(2018)
COLI_df_2019["year"] = int(2019) 

# add annual ranking based on row number + 1
COLI_df_2015["ranking"] = np.arange(start = 1, stop = len(COLI_df_2015)+1, step = 1)
COLI_df_2016["ranking"] = np.arange(start = 1, stop = len(COLI_df_2016)+1, step = 1)
COLI_df_2017["ranking"] = np.arange(start = 1, stop = len(COLI_df_2017)+1, step = 1)
COLI_df_2018["ranking"] = np.arange(start = 1, stop = len(COLI_df_2018)+1, step = 1)
COLI_df_2019["ranking"] = np.arange(start = 1, stop = len(COLI_df_2019)+1, step = 1)

# drop rank column
COLI_df_2015 = COLI_df_2015.drop(["Rank"], axis=1)
COLI_df_2016 = COLI_df_2016.drop(["Rank"], axis=1)
COLI_df_2017 = COLI_df_2017.drop(["Rank"], axis=1)
COLI_df_2018 = COLI_df_2018.drop(["Rank"], axis=1)
COLI_df_2019 = COLI_df_2019.drop(["Rank"], axis=1)

In [3]:
# concatenate dataframes for all years into a single dataframe
COLI_result_df = pd.concat([COLI_df_2015, COLI_df_2016, COLI_df_2017, 
                                COLI_df_2018, COLI_df_2019
                               ], 
                               axis=0, 
                               join="inner"
                              )
COLI_result_df=COLI_result_df.reset_index(drop=True)
COLI_result_df

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index,year,ranking
0,Switzerland,126.03,57.63,92.61,128.44,127.64,146.51,2015,1
1,Norway,118.59,48.70,84.43,110.06,141.45,96.30,2015,2
2,Venezuela,111.01,34.13,73.44,128.43,91.09,11.90,2015,3
3,Iceland,102.14,35.86,69.75,103.76,105.21,70.84,2015,4
4,Denmark,100.60,34.71,68.40,88.59,117.53,105.70,2015,5
...,...,...,...,...,...,...,...,...,...
591,Kosovo (Disputed Territory),26.18,8.05,17.48,23.21,19.57,61.82,2019,115
592,Venezuela,25.73,5.89,16.20,27.58,21.98,3.71,2019,116
593,India,24.17,5.84,15.37,24.20,17.15,66.91,2019,117
594,Tunisia,23.69,4.96,14.69,22.27,13.33,37.99,2019,118


In [4]:
# rename columns to remove spacing (required for SQL database)
COLI_result_df.rename(columns={"Country":"country",
                        "Cost of Living Index":"costofliving_index",
                        "Rent Index":"rent_index",
                        "Groceries Index":"groceries_index",
                        "Restaurant Price Index":"restaurantprice_index",
                        "Local Purchasing Power Index":"localpurchpower_index"
                       }, 
               inplace=True
              )

In [5]:
# drop redundant columns
COLI_result_df.drop(["Cost of Living Plus Rent Index"], axis=1)

Unnamed: 0,country,costofliving_index,rent_index,groceries_index,restaurantprice_index,localpurchpower_index,year,ranking
0,Switzerland,126.03,57.63,128.44,127.64,146.51,2015,1
1,Norway,118.59,48.70,110.06,141.45,96.30,2015,2
2,Venezuela,111.01,34.13,128.43,91.09,11.90,2015,3
3,Iceland,102.14,35.86,103.76,105.21,70.84,2015,4
4,Denmark,100.60,34.71,88.59,117.53,105.70,2015,5
...,...,...,...,...,...,...,...,...
591,Kosovo (Disputed Territory),26.18,8.05,23.21,19.57,61.82,2019,115
592,Venezuela,25.73,5.89,27.58,21.98,3.71,2019,116
593,India,24.17,5.84,24.20,17.15,66.91,2019,117
594,Tunisia,23.69,4.96,22.27,13.33,37.99,2019,118


In [6]:
# resize dataframe (keeping relevant columns) and rearrange column order
COLI_result_df = pd.DataFrame(COLI_result_df[["country", "year", "ranking", "costofliving_index", "rent_index", "groceries_index", 
                   "restaurantprice_index","localpurchpower_index"
                  ]
                 ]
                             ) 
COLI_result_df


Unnamed: 0,country,year,ranking,costofliving_index,rent_index,groceries_index,restaurantprice_index,localpurchpower_index
0,Switzerland,2015,1,126.03,57.63,128.44,127.64,146.51
1,Norway,2015,2,118.59,48.70,110.06,141.45,96.30
2,Venezuela,2015,3,111.01,34.13,128.43,91.09,11.90
3,Iceland,2015,4,102.14,35.86,103.76,105.21,70.84
4,Denmark,2015,5,100.60,34.71,88.59,117.53,105.70
...,...,...,...,...,...,...,...,...
591,Kosovo (Disputed Territory),2019,115,26.18,8.05,23.21,19.57,61.82
592,Venezuela,2019,116,25.73,5.89,27.58,21.98,3.71
593,India,2019,117,24.17,5.84,24.20,17.15,66.91
594,Tunisia,2019,118,23.69,4.96,22.27,13.33,37.99


In [7]:
# count the number of years data for each country
country_df = pd.DataFrame(COLI_result_df.groupby(["country"])["year"].count())

# iterate over dataframe to drop countries where there are not 5 years of data
for index, row in country_df.iterrows():
    if row["year"] !=5:
        print(f"{index} dropped")
        country_df.drop(index, inplace = True)

Bahamas dropped
Belize dropped
Bermuda dropped
Botswana dropped
Brunei dropped
Cuba dropped
Ethiopia dropped
Fiji dropped
Kosovo (Disputed Territory) dropped
Libya dropped
Macao dropped
Mongolia dropped
Mozambique dropped
Myanmar dropped
Namibia dropped
Nicaragua dropped
Paraguay dropped
Syria dropped
Uganda dropped
Us Virgin Islands dropped
Uzbekistan dropped
Zambia dropped


In [8]:
# validate that all countries without 5 years of data have been dropped
check = country_df.loc[country_df["year"] != 5]
check

Unnamed: 0_level_0,year
country,Unnamed: 1_level_1


In [9]:
# filter the database for countries with only 5 years of data
countries = list(country_df.index.values) 
countries

['Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bolivia',
 'Bosnia And Herzegovina',
 'Brazil',
 'Bulgaria',
 'Cambodia',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Latvia',
 'Lebanon',
 'Lithuania',
 'Luxembourg',
 'Malaysia',
 'Malta',
 'Mauritius',
 'Mexico',
 'Moldova',
 'Montenegro',
 'Morocco',
 'Nepal',
 'Netherlands',
 'New Zealand',
 'Nigeria',
 'North Macedonia',
 'Norway',
 'Oman',
 'Pakistan',
 'Palestine',
 'Panama',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',

In [10]:
# filter datafrome for countries with 5 years of data
COLI_df = pd.DataFrame(COLI_result_df[COLI_result_df["country"].isin(countries)])
COLI_df

Unnamed: 0,country,year,ranking,costofliving_index,rent_index,groceries_index,restaurantprice_index,localpurchpower_index
0,Switzerland,2015,1,126.03,57.63,128.44,127.64,146.51
1,Norway,2015,2,118.59,48.70,110.06,141.45,96.30
2,Venezuela,2015,3,111.01,34.13,128.43,91.09,11.90
3,Iceland,2015,4,102.14,35.86,103.76,105.21,70.84
4,Denmark,2015,5,100.60,34.71,88.59,117.53,105.70
...,...,...,...,...,...,...,...,...
590,Egypt,2019,114,26.46,5.01,23.19,22.26,23.67
592,Venezuela,2019,116,25.73,5.89,27.58,21.98,3.71
593,India,2019,117,24.17,5.84,24.20,17.15,66.91
594,Tunisia,2019,118,23.69,4.96,22.27,13.33,37.99


In [11]:
# review simple stats as a reasonableness check (the "sniff" test)
COLI_df.describe()

Unnamed: 0,year,ranking,costofliving_index,rent_index,groceries_index,restaurantprice_index,localpurchpower_index
count,540.0,540.0,540.0,540.0,540.0,540.0,540.0
mean,2017.0,59.766667,54.210815,20.37663,49.208926,46.782352,64.945407
std,1.415525,34.580567,20.090345,14.934197,20.344761,24.360431,33.209392
min,2015.0,1.0,20.4,3.0,18.25,13.33,3.35
25%,2016.0,29.0,38.98,10.2825,33.95,29.025,38.73
50%,2017.0,59.0,50.145,15.195,44.315,40.01,54.955
75%,2018.0,90.0,67.0225,27.025,59.43,59.705,92.985
max,2019.0,122.0,131.39,84.62,133.34,141.45,178.74


In [12]:
# confirm same number of records per year for all years
check_df = pd.DataFrame(COLI_df.groupby(["year"]).count())
check_df

Unnamed: 0_level_0,country,ranking,costofliving_index,rent_index,groceries_index,restaurantprice_index,localpurchpower_index
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015,108,108,108,108,108,108,108
2016,108,108,108,108,108,108,108
2017,108,108,108,108,108,108,108
2018,108,108,108,108,108,108,108
2019,108,108,108,108,108,108,108


In [13]:
# import country&continent csv
country=os.path.join('02_transform_country','country.csv')
country_df=pd.read_csv(country)
country_df.head()

Unnamed: 0,countrycode,country,continent
0,AFG,Afghanistan,Asia
1,ALB,Albania,Europe
2,ATA,Antarctica,Antarctica
3,DZA,Algeria,Africa
4,ASM,American Samoa,Oceania


In [14]:
# replace countries with country code
for ind,row in COLI_df.iterrows():
    for ind2,row2 in country_df.iterrows():
        if row['country'].lower() in row2['country'].lower():
            COLI_df.loc[ind,'country']=country_df.loc[ind2,'countrycode']
            break

In [15]:
# find countries without a country code from the main dataframe
miscountry=[]
for i in COLI_df['country']:
    if (len(i)>3) and i not in miscountry:
        miscountry.append(i)
print(miscountry)

['South Korea', 'Palestine', 'North Macedonia']


In [16]:
# invert match to capture countries that are named differently
# ie. previous round looking for happyall country string within continent_df country, 
# now looking for continent_df country string within happyall country
for ind,row in country_df.iterrows():
    for j in miscountry:
        if row['country'].lower() in j.lower():
            COLI_df['country']=COLI_df['country'].replace(j,country_df.loc[ind,'countrycode'])

In [17]:
# find countries without a country code from the main dataframe
miscountry=[]
for i in COLI_df['country']:
    if (len(i)>3) and i not in miscountry:
        miscountry.append(i)
print(miscountry)

['Palestine']


In [18]:
# rename manually but only if country code exist in country list
countryCode=list(country_df['countrycode'])
if 'PSE' in countryCode:
    COLI_df['country'] = COLI_df['country'].replace(['Palestine'],['PSE'])
else:
    COLI_df=COLI_df[COLI_df.country!='Palestine']

In [19]:
# ensure that all countries still have 5 years of data
countrycount=pd.DataFrame(COLI_df.groupby('country')['year'].count())
for ind,row in countrycount.iterrows():
    if row['year']!=5:
        print(f'{ind}-{row["year"]}')

In [20]:
COLI_df=COLI_df.rename(columns={'country':'countrycode'})

In [21]:
# save transformed cost of living index data
COLI_df.to_csv('02_transform_coli/coli_data.csv',index=False)

## Property Index Data

In [22]:
# create a dataframe for each year
property_df_2015 = pd.read_csv('01_extract_coli/2015property.csv')
property_df_2016 = pd.read_csv('01_extract_coli/2015property.csv')
property_df_2017 = pd.read_csv('01_extract_coli/2015property.csv')
property_df_2018 = pd.read_csv('01_extract_coli/2015property.csv')
property_df_2019 = pd.read_csv('01_extract_coli/2015property.csv')

# add a column for year and assigning year value
property_df_2015["Year"] = int(2015)
property_df_2016["Year"] = int(2016)
property_df_2017["Year"] = int(2017)
property_df_2018["Year"] = int(2018)
property_df_2019["Year"] = int(2019)

# drop rank column
property_df_2015 = property_df_2015.drop(["Rank"], axis=1)
property_df_2016 = property_df_2016.drop(["Rank"], axis=1)
property_df_2017 = property_df_2017.drop(["Rank"], axis=1)
property_df_2018 = property_df_2018.drop(["Rank"], axis=1)
property_df_2019 = property_df_2019.drop(["Rank"], axis=1)


In [23]:
# concatenate dataframes for all years to find common countries across all years
property_result_df = pd.concat([property_df_2015,property_df_2016, property_df_2017, 
                                property_df_2018, property_df_2019
                               ], 
                               axis=0, 
                               join="inner"
                              )
property_result_df=property_result_df.reset_index(drop=True)
property_result_df

Unnamed: 0,Country,Price To Income Ratio,Gross Rental Yield City Centre,Gross Rental Yield Outside of Centre,Price To Rent Ratio City Centre,Price To Rent Ratio Outside Of City Centre,Mortgage As A Percentage Of Income,Affordability Index,Year
0,Uganda,31.13,1.86,10.02,53.70,9.98,649.36,0.15,2015
1,Syria,30.49,3.95,3.15,25.33,31.75,283.67,0.35,2015
2,Hong Kong,30.09,2.67,2.77,37.49,36.08,192.99,0.52,2015
3,Myanmar,26.84,8.47,2.95,11.81,33.92,317.96,0.31,2015
4,Ghana,26.62,10.77,13.45,9.28,7.44,670.38,0.15,2015
...,...,...,...,...,...,...,...,...,...
600,Oman,3.23,11.35,12.64,8.81,7.91,30.30,3.30,2019
601,South Africa,3.14,10.20,9.40,9.80,10.63,33.59,2.98,2019
602,Saudi Arabia,3.12,7.20,6.85,13.89,14.59,23.55,4.25,2019
603,United States,2.62,11.16,12.92,8.96,7.74,19.43,5.15,2019


In [24]:
# resize dataframe (keeping relevant columns) and rearrange column order
property_result_df = pd.DataFrame(property_result_df[["Country", "Year", "Price To Income Ratio", "Mortgage As A Percentage Of Income", 
                           "Affordability Index"
                          ]
                         ]
                          )
property_result_df

Unnamed: 0,Country,Year,Price To Income Ratio,Mortgage As A Percentage Of Income,Affordability Index
0,Uganda,2015,31.13,649.36,0.15
1,Syria,2015,30.49,283.67,0.35
2,Hong Kong,2015,30.09,192.99,0.52
3,Myanmar,2015,26.84,317.96,0.31
4,Ghana,2015,26.62,670.38,0.15
...,...,...,...,...,...
600,Oman,2019,3.23,30.30,3.30
601,South Africa,2019,3.14,33.59,2.98
602,Saudi Arabia,2019,3.12,23.55,4.25
603,United States,2019,2.62,19.43,5.15


In [25]:
# rename columns to remove spacing (required for SQL database)
property_result_df.rename(columns={"Country":"country",
                            "Year":"year",
                            "Price To Income Ratio":"price_income_ratio",
                            "Mortgage As A Percentage Of Income":"mortage_income_ratio",
                            "Affordability Index":"affordability_index"
                           }, 
                   inplace=True
                  )
property_result_df

Unnamed: 0,country,year,price_income_ratio,mortage_income_ratio,affordability_index
0,Uganda,2015,31.13,649.36,0.15
1,Syria,2015,30.49,283.67,0.35
2,Hong Kong,2015,30.09,192.99,0.52
3,Myanmar,2015,26.84,317.96,0.31
4,Ghana,2015,26.62,670.38,0.15
...,...,...,...,...,...
600,Oman,2019,3.23,30.30,3.30
601,South Africa,2019,3.14,33.59,2.98
602,Saudi Arabia,2019,3.12,23.55,4.25
603,United States,2019,2.62,19.43,5.15


In [26]:
# count the number of years data for each country
pcountry_df = pd.DataFrame(property_result_df.groupby(["country"])["year"].count())

# iterate over dataframe to drop countries where there are not 5 years of data
for index, row in pcountry_df.iterrows():
    if row["year"] !=5:
        pcountry_df.drop(index, inplace = True)
        
pcountry_df

Unnamed: 0_level_0,year
country,Unnamed: 1_level_1
Albania,5
Algeria,5
Argentina,5
Armenia,5
Australia,5
...,...
Uruguay,5
Uzbekistan,5
Venezuela,5
Vietnam,5


In [27]:
# validate that all countries without 5 years of data have been dropped
check = pcountry_df.loc[pcountry_df["year"] != 5]
check

Unnamed: 0_level_0,year
country,Unnamed: 1_level_1


In [28]:
# filter the database for countries with only 5 years of data
final_countries = list(pcountry_df.index.values) 
final_countries

['Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Belize',
 'Bolivia',
 'Bosnia And Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Cambodia',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kosovo (Disputed Territory)',
 'Kuwait',
 'Latvia',
 'Lebanon',
 'Libya',
 'Lithuania',
 'Luxembourg',
 'Malaysia',
 'Malta',
 'Mauritius',
 'Mexico',
 'Moldova',
 'Montenegro',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nepal',
 'Netherlands',
 'New Zealand',
 'Nicara

In [29]:
# filter property datafrome for countries with 5 years of data
property_df = pd.DataFrame(property_result_df[property_result_df["country"].isin(final_countries)])
property_df

Unnamed: 0,country,year,price_income_ratio,mortage_income_ratio,affordability_index
0,Uganda,2015,31.13,649.36,0.15
1,Syria,2015,30.49,283.67,0.35
2,Hong Kong,2015,30.09,192.99,0.52
3,Myanmar,2015,26.84,317.96,0.31
4,Ghana,2015,26.62,670.38,0.15
...,...,...,...,...,...
600,Oman,2019,3.23,30.30,3.30
601,South Africa,2019,3.14,33.59,2.98
602,Saudi Arabia,2019,3.12,23.55,4.25
603,United States,2019,2.62,19.43,5.15


In [30]:
# review simple stats as a reasonableness check (the "sniff" test)
property_df.describe()

Unnamed: 0,year,price_income_ratio,mortage_income_ratio,affordability_index
count,605.0,605.0,605.0,605.0
mean,2017.0,12.152479,141.502314,1.205702
std,1.415384,6.424043,117.369255,0.875464
min,2015.0,1.84,19.43,0.15
25%,2016.0,7.28,61.85,0.52
50%,2017.0,10.79,94.54,1.06
75%,2018.0,16.07,192.99,1.62
max,2019.0,31.13,670.38,5.15


In [31]:
# confirm same number of records per year for all years
check_df = pd.DataFrame(property_df.groupby(["year"]).count())
check_df

Unnamed: 0_level_0,country,price_income_ratio,mortage_income_ratio,affordability_index
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015,121,121,121,121
2016,121,121,121,121
2017,121,121,121,121
2018,121,121,121,121
2019,121,121,121,121


In [32]:
# replace countries with country code
for ind,row in property_df.iterrows():
    for ind2,row2 in country_df.iterrows():
        if row['country'].lower() in row2['country'].lower():
            property_df.loc[ind,'country']=country_df.loc[ind2,'countrycode']
            break

In [33]:
# find countries without a country code from the main dataframe
miscountry=[]
for i in property_df['country']:
    if (len(i)>3) and i not in miscountry:
        miscountry.append(i)
print(miscountry)

['Kosovo (Disputed Territory)', 'North Macedonia', 'South Korea', 'Palestine']


In [34]:
# invert match to capture countries that are named differently
# ie. previous round looking for happyall country string within continent_df country, 
# now looking for continent_df country string within happyall country
for ind,row in country_df.iterrows():
    for j in miscountry:
        if row['country'].lower() in j.lower():
            property_df['country']=property_df['country'].replace(j,country_df.loc[ind,'countrycode'])

In [35]:
# find countries without a country code from the main dataframe
miscountry=[]
for i in property_df['country']:
    if (len(i)>3) and i not in miscountry:
        miscountry.append(i)
print(miscountry)

['Kosovo (Disputed Territory)', 'Palestine']


In [36]:
# rename manually but only if country code exist in country list
countryCode=list(country_df['countrycode'])
insertcode=['PSE','XKX']
for i in range(len(miscountry)):
    if insertcode[i] in countryCode:
        property_df['country'] = property_df['country'].replace([miscountry[i]],[insertcode[i]])
    else:
        property_df=property_df[property_df.country!=miscountry[i]]

In [37]:
# ensure that all countries still have 5 years of data
countrycount=pd.DataFrame(property_df.groupby('country')['year'].count())
for ind,row in countrycount.iterrows():
    if row['year']!=5:
        print(f'{ind}-{row["year"]}')

In [38]:
property_df=property_df.rename(columns={'country':'countrycode'})

In [39]:
# save transformed property index data
property_df.to_csv('02_transform_coli/property_data.csv',index=False)