In [1]:
#import dependencies
import pandas as pd
import numpy as np

In [2]:
#Import CSV files
cost_csvpath = "../Resources/cost_of_living.csv"
cities_csvpath="../Resources/CitiesWGeolocation.csv"

cost_raw_df = pd.read_csv(cost_csvpath,index_col=0)
cities_df = pd.read_csv(cities_csvpath,index_col=0)


In [3]:
# Strip out any extraneous white space from the columns that will be filtered later
cities_df["Country"] = cities_df["Country"].str.strip()
cities_df["City"] = cities_df["City"].str.strip()
cost_raw_df["country"] = cost_raw_df["country"].str.strip()
cost_raw_df["city"] = cost_raw_df["city"].str.strip()

In [4]:
# Documentation from Kaggle Dataset explaining the names of the columns. Filtered to only show columns we want to keep
# city	Name of the city
# country	Name of the country
# x2	Meal for 2 People, Mid-range Restaurant, Three-course (USD)
# x28	One-way Ticket (Local Transport) (USD)
# x30	Taxi Start (Normal Tariff) (USD)
# x31	Taxi 1km (Normal Tariff) (USD)

In [5]:
# Normalize the dataset
# filter out the columns we care about and rename them to match the descriptions
cost_df = cost_raw_df[["city","country","x2","x28","x30","x31"]]
cost_df = cost_df.rename(columns={"x2":"meal","x28":"ticket","x30":"taxi_start","x31":"taxi_1km"})

# Standardize capitalization of all text columns
cost_df["city"] = cost_df["city"].str.title()
cost_df["country"] = cost_df["country"].str.title()

# Replace NaN values, sort by country, print a list of unique countries
# cost_df = cost_df.replace(np.nan,0)
cost_df = cost_df.sort_values(by="country")
cost_df["country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Anguilla', 'Antigua And Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bosnia And Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Cook Islands', 'Costa Rica',
       'Croatia', 'Cuba', 'Cyprus', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia',
       'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana',
       'French Polynesia', 'Gabon', 'Georgia', 'Germany', 'Ghana',
       'Gibraltar', 'Greece', 'Greenland', 'Guadeloupe', 'Guatemala',
     

In [6]:
cost_df.loc[(cost_df["country"]=="Jamaica")]

Unnamed: 0,city,country,meal,ticket,taxi_start,taxi_1km
1870,Portmore,Jamaica,45.39,0.91,0.65,2.92
3292,Saint Ann'S Bay,Jamaica,25.94,1.17,0.94,0.81
3412,Black River,Jamaica,19.45,,1.95,2.92
2547,Half Way Tree,Jamaica,48.63,0.71,0.65,3.24
483,Kingston,Jamaica,45.39,0.88,1.95,2.92
2940,Mandeville,Jamaica,25.94,1.3,0.81,0.81
2194,Spanish Town,Jamaica,45.39,,0.65,2.92
2161,May Pen,Jamaica,49.8,0.88,0.65,


In [7]:
# Normalize the dataset

# Standardize the spelling / capitalization
cities_df = cities_df.rename(columns={"Country":"country","City":"city"})
# Remove unnecessary characters
cities_df["country"] = cities_df["country"].str.replace('\u200b',"")
cities_df["country"] = cities_df["country"].str.replace('Kingdom of the Netherlands','Netherlands')

# Standardize capitalization of all text columns
cities_df["country"] = cities_df["country"].str.title()
cities_df["city"] = cities_df["city"].str.title()

# Reset index and replace NaN values
cities_df.reset_index(inplace=True)
cities_df.replace(np.nan,0)

# Sort by country and print a list of unique countries
cities_df = cities_df.sort_values(by="country")
cities_df["country"].unique()

array(['Antigua And Barbuda', 'Argentina', 'Australia', 'Austria',
       'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Bolivia', 'Brazil',
       'Bulgaria', 'Cambodia', 'Canada', 'Cayman Islands', 'Chile',
       'China', 'Colombia', 'Cook Islands', 'Costa Rica', 'Croatia',
       'Czech Republic', 'Denmark', 'Ecuador', 'Estonia', 'Fiji',
       'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland',
       'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Jamaica',
       'Japan', 'Kuwait', 'Laos', 'Latvia', 'Lebanon', 'Lithuania',
       'Malaysia', 'Malta', 'Mauritius', 'Mexico', 'Morocco', 'Nepal',
       'Netherlands', 'New Zealand', 'Norway', 'Oman', 'Peru',
       'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia',
       'Saint Lucia', 'Senegal', 'Serbia', 'Seychelles', 'Singapore',
       'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka',
       'Sweden', 'Switzerland', 'Taiwan', 'Tanzania', 'Thailand',
       'Turkey', 'Turks And Caic

In [8]:
# Initialize four new columns on cities_df with 0 to avoid NaN values
cities_df["meal"] = 0.0
cities_df["ticket"] = 0.0
cities_df["taxi"] = 0.0
# cities_df["taxi_1km"] = 0.0

In [9]:
# Merge the cost data if the city and country match and there is only one result
for i in range(len(cities_df)):
    if len(cost_df.loc[(cost_df['city'] == cities_df['city'][i]) & (cost_df['country'] == cities_df['country'][i])]) == 1:
        result = cost_df.loc[(cost_df['city'] == cities_df['city'][i]) & (cost_df['country'] == cities_df['country'][i])]
        cities_df.at[i,"meal"] = result['meal']
        cities_df.at[i,"ticket"] = result["ticket"]
        cities_df.at[i,"taxi"] = result["taxi_start"] + result["taxi_1km"]
#         cities_df.at[i,"taxi_start"] = result["taxi_start"]
#         cities_df.at[i,"taxi_1km"] = result["taxi_1km"]
        # marker to show which result was applied to the data
        cities_df.at[i,"data_quality"] = 1
        
    # Merge the cost data on country only if there are no results
    elif len(cost_df.loc[(cost_df['city'] == cities_df['city'][i]) & (cost_df['country'] == cities_df['country'][i])]) == 0:
        if len(cost_df.loc[(cost_df['country'] == cities_df['country'][i])]) >=1:
            result = cost_df.loc[(cost_df['country'] == cities_df['country'][i])]
            # Average all results for the country
            cities_df.at[i,"meal"] = result["meal"].mean()
            cities_df.at[i,"ticket"] = result["ticket"].mean()
            cities_df.at[i,"taxi"] = result["taxi_start"].mean() + result["taxi_1km"].mean()
#             cities_df.at[i,"taxi_start"] = result["taxi_start"].mean()
#             cities_df.at[i,"taxi_1km"] = result["taxi_1km"].mean()
            # marker to show which result was applied to the data
            cities_df.at[i,"data_quality"] = 2

In [10]:
# reset the index and remove the two columns that are created as a result
cities_df.reset_index(inplace=True)
cities_df = cities_df.drop(columns=["index","level_0"])
cities_df

Unnamed: 0,city,State/Province,country,Latitude,Longitude,meal,ticket,taxi,data_quality
0,Antigua And Barbuda​,,Antigua And Barbuda,17.223472,-61.955461,92.510000,1.020000,11.300000,2.0
1,Buenos Aires,,Argentina,-34.607568,-58.437089,25.000000,0.150000,1.520000,1.0
2,Margaret River,Western Australia,Australia,-33.953178,115.076937,63.710000,,3.890000,1.0
3,Byron Bay,New South Wales,Australia,-28.648333,153.617778,62.074727,2.804054,4.382322,2.0
4,Gold Coast,Queensland,Australia,-28.002373,153.414599,60.360000,3.350000,4.700000,1.0
...,...,...,...,...,...,...,...,...,...
215,Aspen,Colorado,United States,39.191113,-106.823561,61.948778,2.300036,5.319461,2.0
216,Montevideo,,Uruguay,-34.905892,-56.191309,53.470000,1.150000,3.180000,1.0
217,Hanoi,,Vietnam,21.029450,105.854444,20.030000,0.280000,1.110000,1.0
218,Ho Chi Minh City,,Vietnam,10.776477,106.701938,24.230000,0.280000,1.140000,1.0


In [11]:
# View the number of null values and confirm they aren't in the dataset
cities_df[cities_df["data_quality"].isna()]

Unnamed: 0,city,State/Province,country,Latitude,Longitude,meal,ticket,taxi,data_quality
26,Bahamas​,,Bahamas,24.773655,-78.000055,0.0,0.0,0.0,
46,Cayman Islands​,,Cayman Islands,19.703182,-79.917463,0.0,0.0,0.0,
59,Prague,,Czech Republic,50.087465,14.421254,0.0,0.0,0.0,


In [12]:
# Create a list of countries near the Bahamas, Cayman Islands, and Turks&Caicos that may be in the dataset
caribbean_countries = ['Cuba', 'Haiti','Jamaica','Dominican Republic','Puerto Rico']
# Iterate through and create an average of each category across the countries
meal_sum = 0
ticket_sum = 0
taxi_sum = 0
counter = 0
for i in range(len(caribbean_countries)):
    if len(cost_df.loc[(cost_df['country'] == cities_df['country'][i])]) >=1:
        result = cost_df.loc[(cost_df['country'] == cities_df['country'][i])]
        meal_sum += result['meal'].mean()
        ticket_sum += result['ticket'].mean()
        taxi_sum += (result['taxi_start'] + result['taxi_1km']).mean()
        counter += 1
meal_avg = meal_sum / counter
ticket_avg = ticket_sum / counter
taxi_avg = taxi_sum / counter

print(meal_avg, ticket_avg, taxi_avg)

61.3116935064935 1.9609324324324326 5.2662588235294105


In [13]:
# Add values to the islands that have null values
cities_df.loc[cities_df['country'] == 'Bahamas' ,'meal'] = meal_avg
cities_df.loc[cities_df['country'] == 'Cayman Islands' ,'meal'] = meal_avg
cities_df.loc[cities_df['country'] == 'Turks And Caicos Islands' ,'meal'] = meal_avg

cities_df.loc[cities_df['country'] == 'Bahamas','ticket'] = ticket_avg
cities_df.loc[cities_df['country'] == 'Cayman Islands','ticket'] = ticket_avg
cities_df.loc[cities_df['country'] == 'Turks And Caicos Islands','ticket'] = ticket_avg

cities_df.loc[cities_df['country'] == 'Bahamas','taxi'] = taxi_avg
cities_df.loc[cities_df['country'] == 'Cayman Islands','taxi'] = taxi_avg
cities_df.loc[cities_df['country'] == 'Turks And Caicos Islands','taxi'] = taxi_avg

#update data quality field to keep values
cities_df.loc[cities_df['country'] == 'Bahamas','data_quality'] = 2
cities_df.loc[cities_df['country'] == 'Cayman Islands','data_quality'] = 2
cities_df.loc[cities_df['country'] == 'Turks And Caicos Islands','data_quality'] = 2


cities_df.loc[cities_df['country'] == 'Turks And Caicos Islands']

Unnamed: 0,city,State/Province,country,Latitude,Longitude,meal,ticket,taxi,data_quality
168,Turks And Caicos Islands​,,Turks And Caicos Islands,21.721746,-71.552781,61.311694,1.960932,5.266259,2.0


In [14]:
# Drop null values from the final dataset
final_merged_df = cities_df.dropna(subset=["data_quality"])
final_merged_df.head()

Unnamed: 0,city,State/Province,country,Latitude,Longitude,meal,ticket,taxi,data_quality
0,Antigua And Barbuda​,,Antigua And Barbuda,17.223472,-61.955461,92.51,1.02,11.3,2.0
1,Buenos Aires,,Argentina,-34.607568,-58.437089,25.0,0.15,1.52,1.0
2,Margaret River,Western Australia,Australia,-33.953178,115.076937,63.71,,3.89,1.0
3,Byron Bay,New South Wales,Australia,-28.648333,153.617778,62.074727,2.804054,4.382322,2.0
4,Gold Coast,Queensland,Australia,-28.002373,153.414599,60.36,3.35,4.7,1.0


In [15]:
# Drop unneeded column
final_merged_df = final_merged_df.drop(columns="data_quality")

In [16]:
# create quantile ratings for each variable
final_merged_df["meal_ranking"] = pd.qcut(final_merged_df["meal"],5,labels=[1,2,3,4,5])
final_merged_df["ticket_ranking"] = pd.qcut(final_merged_df["ticket"],5,labels=[1,2,3,4,5])
final_merged_df["taxi_ranking"] = pd.qcut(final_merged_df["taxi"],5,labels=[1,2,3,4,5])
final_merged_df["total_rank"] = final_merged_df[["meal_ranking","ticket_ranking","taxi_ranking"]].mean(axis=1)
final_merged_df["total_rank"] = final_merged_df["total_rank"].round(1)
final_merged_df

Unnamed: 0,city,State/Province,country,Latitude,Longitude,meal,ticket,taxi,meal_ranking,ticket_ranking,taxi_ranking,total_rank
0,Antigua And Barbuda​,,Antigua And Barbuda,17.223472,-61.955461,92.510000,1.020000,11.300000,5,2,5,4.0
1,Buenos Aires,,Argentina,-34.607568,-58.437089,25.000000,0.150000,1.520000,1,1,1,1.0
2,Margaret River,Western Australia,Australia,-33.953178,115.076937,63.710000,,3.890000,4,,2,3.0
3,Byron Bay,New South Wales,Australia,-28.648333,153.617778,62.074727,2.804054,4.382322,3,4,3,3.3
4,Gold Coast,Queensland,Australia,-28.002373,153.414599,60.360000,3.350000,4.700000,3,5,3,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...
215,Aspen,Colorado,United States,39.191113,-106.823561,61.948778,2.300036,5.319461,3,3,4,3.3
216,Montevideo,,Uruguay,-34.905892,-56.191309,53.470000,1.150000,3.180000,2,2,2,2.0
217,Hanoi,,Vietnam,21.029450,105.854444,20.030000,0.280000,1.110000,1,1,1,1.0
218,Ho Chi Minh City,,Vietnam,10.776477,106.701938,24.230000,0.280000,1.140000,1,1,1,1.0


In [17]:
final_merged_df[final_merged_df["total_rank"].isna()]

Unnamed: 0,city,State/Province,country,Latitude,Longitude,meal,ticket,taxi,meal_ranking,ticket_ranking,taxi_ranking,total_rank


In [18]:
final_merged_df.to_csv("../Resources/cost_analysis.csv")