In [1]:
#import dependencies
import pandas as pd
import numpy as np

In [2]:
#Import CSV files
cost_csvpath = "../Resources/cost_of_living.csv"
cities_csvpath="../Resources/CitiesWGeolocation.csv"

cost_raw_df = pd.read_csv(cost_csvpath,index_col=0)
cities_df = pd.read_csv(cities_csvpath,index_col=0)


In [3]:
# Strip out any extraneous white space from the columns that will be filtered later
cities_df["Country"] = cities_df["Country"].str.strip()
cities_df["City"] = cities_df["City"].str.strip()
cost_raw_df["country"] = cost_raw_df["country"].str.strip()
cost_raw_df["city"] = cost_raw_df["city"].str.strip()

In [4]:
# Documentation from Kaggle Dataset explaining the names of the columns. Filtered to only show columns we want to keep
# city	Name of the city
# country	Name of the country
# x2	Meal for 2 People, Mid-range Restaurant, Three-course (USD)
# x28	One-way Ticket (Local Transport) (USD)
# x30	Taxi Start (Normal Tariff) (USD)
# x31	Taxi 1km (Normal Tariff) (USD)

In [10]:
# Normalize the dataset
# filter out the columns we care about and rename them to match the descriptions
cost_df = cost_raw_df[["city","country","x2","x28","x30","x31"]]
cost_df = cost_df.rename(columns={"x2":"meal","x28":"ticket","x30":"taxi_start","x31":"taxi_1km"})

# Standardize capitalization of all text columns
cost_df["city"] = cost_df["city"].str.title()
cost_df["country"] = cost_df["country"].str.title()

# Replace NaN values, sort by country, print a list of unique countries
cost_df = cost_df.replace(np.nan,0)
cost_df = cost_df.sort_values(by="country")
cost_df["country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Anguilla', 'Antigua And Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bosnia And Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Cook Islands', 'Costa Rica',
       'Croatia', 'Cuba', 'Cyprus', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia',
       'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana',
       'French Polynesia', 'Gabon', 'Georgia', 'Germany', 'Ghana',
       'Gibraltar', 'Greece', 'Greenland', 'Guadeloupe', 'Guatemala',
     

In [11]:
# Normalize the dataset

# Standardize the spelling / capitalization
cities_df = cities_df.rename(columns={"Country":"country","City":"city"})
# Remove unnecessary characters
cities_df["country"] = cities_df["country"].str.replace('\u200b',"")
cities_df["country"] = cities_df["country"].str.replace('Kingdom of the Netherlands','Netherlands')

# Standardize capitalization of all text columns
cities_df["country"] = cities_df["country"].str.title()
cities_df["city"] = cities_df["city"].str.title()

# Reset index and replace NaN values
cities_df.reset_index(inplace=True)
cities_df.replace(np.nan,0)

# Sort by country and print a list of unique countries
cities_df = cities_df.sort_values(by="country")
cities_df["country"].unique()

array(['Antigua And Barbuda', 'Argentina', 'Australia', 'Austria',
       'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Bolivia', 'Brazil',
       'Bulgaria', 'Cambodia', 'Canada', 'Cayman Islands', 'Chile',
       'China', 'Colombia', 'Cook Islands', 'Costa Rica', 'Croatia',
       'Czech Republic', 'Denmark', 'Ecuador', 'Estonia', 'Fiji',
       'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland',
       'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Jamaica',
       'Japan', 'Kuwait', 'Laos', 'Latvia', 'Lebanon', 'Lithuania',
       'Malaysia', 'Malta', 'Mauritius', 'Mexico', 'Morocco', 'Nepal',
       'Netherlands', 'New Zealand', 'Norway', 'Oman', 'Peru',
       'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia',
       'Saint Lucia', 'Senegal', 'Serbia', 'Seychelles', 'Singapore',
       'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka',
       'Sweden', 'Switzerland', 'Taiwan', 'Tanzania', 'Thailand',
       'Turkey', 'Turks And Caic

In [12]:
# Initialize four new columns on cities_df with 0 to avoid NaN values
cities_df["meal"] = 0
cities_df["ticket"] = 0
cities_df["taxi_start"] = 0
cities_df["taxi_1km"] = 0

In [13]:
for i in range(len(cities_df)):
    if len(cost_df.loc[(cost_df['city'] == cities_df['city'][i]) & (cost_df['country'] == cities_df['country'][i])]) == 1:
        result = cost_df.loc[(cost_df['city'] == cities_df['city'][i]) & (cost_df['country'] == cities_df['country'][i])]
        cities_df.at[i,"meal"] = result['meal']
        cities_df.at[i,"ticket"] = result["ticket"]
        cities_df.at[i,"taxi_start"] = result["taxi_start"]
        cities_df.at[i,"meal"] = result["taxi_1km"]
        cities_df.at[i,"data_quality"] = 1
    elif len(cost_df.loc[(cost_df['city'] == cities_df['city'][i]) & (cost_df['country'] == cities_df['country'][i])]) == 0:
        if len(cost_df.loc[(cost_df['country'] == cities_df['country'][i])]) >=1:
            result = cost_df.loc[(cost_df['country'] == cities_df['country'][i])]
            cities_df.at[i,"meal"] = result["meal"].mean()
            cities_df.at[i,"meal"] = result["ticket"].mean()
            cities_df.at[i,"meal"] = result["taxi_start"].mean()
            cities_df.at[i,"meal"] = result["taxi_1km"].mean()
            cities_df.at[i,"data_quality"] = 2

In [14]:
cities_df[cities_df["data_quality"].isna()]

Unnamed: 0,index,city,State/Province,country,Latitude,Longitude,meal,ticket,taxi_start,taxi_1km,data_quality
80,91,Bahamas​,,Bahamas,24.773655,-78.000055,0,0,0,0,
29,31,Cayman Islands​,,Cayman Islands,19.703182,-79.917463,0,0,0,0,
109,123,Prague,,Czech Republic,50.087465,14.421254,0,0,0,0,
