In [3]:
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import requests
import re
import airportsdata
import googlemaps
import json
import os

In [4]:
# Reload the saved dataset so work on the has_crossfit column can resume
# without re-running the scraping cells.
mainFrame = pd.read_csv("dataset.csv", index_col=0)

# City slugs: lower-cased names with every whitespace character replaced by "-".
cities = [re.sub(r'\s', '-', name.lower()) for name in mainFrame["city"]]

# Country values, lower-cased.
countries = [name.lower() for name in mainFrame["country"]]

# Slug -> country lookup; if a slug occurs twice, the later row wins.
city_info = dict(zip(cities, countries))

In [None]:
# Find all airports with direct flights to Casablanca CMN
print("Making soup")
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here rather than as an
# AttributeError further down when the expected markup is absent.
r = requests.get("https://www.flightconnections.com/flights-to-casablanca-cmn", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

print("Extracting airports")
directFlights = soup.find('div', attrs={'id': 'popular-destinations'})
if directFlights is None:
    raise RuntimeError("No element with id 'popular-destinations' found on the flights page")
listFlights = directFlights.find_all('a', attrs={'class': 'popular-destination'})

# Collect the text of each destination's airport-name element. Links that
# lack that element are skipped instead of failing on None.text.
airports = []
for result in listFlights:
    airport = result.find('div', attrs={'class': 'popular-destination-airport-name'})
    if airport is not None:
        airports.append(airport.text)

# Getting List of International Schools
print("Getting Schools List")
# Fail fast on a stalled connection or an HTTP error instead of parsing an
# error page and crashing later on a missing element.
r = requests.get("https://www.international-schools-database.com/in?filter=on&ages=12-18&city=&language=English", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

allSchools = soup.find('div', attrs={'class': 'categories'})
if allSchools is None:
    raise RuntimeError("No 'categories' element found on the international schools page")
schools = allSchools.find_all('a', attrs={'class': 'categories-link'})

# Text of every 'categories-link' anchor; matched against city names later.
listSchools = [school.text for school in schools]

# Extract IATA Airport Codes from the list of cities
print("Getting IATA Codes")
pattern = r'\(([A-Z]{3})\)'
airportsData = airportsdata.load('IATA')
cities = []
countries = []

# Lower-case the school entries once rather than once per airport.
schools_lower = [school.lower() for school in listSchools]
# Every city already evaluated (matched or not), so each is checked only once.
checked_cities = set()

for airport in airports:
    # Extract IATA Code from the results; skip labels without a "(XXX)" code.
    match = re.search(pattern, airport)
    if match is None:
        continue
    iata_code = match.group(1)

    # Get City and Country from IATA Code. A code missing from the airport
    # data is skipped, so it can never reuse the previous airport's values.
    try:
        city = airportsData[iata_code]["city"]
        country = airportsData[iata_code]["country"]
    except KeyError:
        continue

    # An empty city name would be a substring of every school entry, so it is
    # never treated as a match.
    if city == "" or city in checked_cities:
        continue
    checked_cities.add(city)

    # A city is added exactly once, however many school entries mention it.
    city_lower = city.lower()
    if any(city_lower in school for school in schools_lower):
        cities.append(city)
        countries.append(country)
        print(city + ", " + country + " matches.")

# Build the frame in one step, and only after the loop has completed.
mainFrame = pd.DataFrame({"city": cities, "country": countries})

In [None]:
# Run every city_info entry through walkscore.com to obtain a walkability
# score, then add the scores to the dataset.
# Transit scores cannot be extracted because the returned HTML does not contain
# them, so this dataset is limited to walkability scores.
walkabilityScores = []
for city, country in city_info.items():
    query = f"{city}-{country}"
    walkScore = None  # stays None when no score can be obtained for this city

    try:
        r = requests.get(f"https://www.walkscore.com/score/{query}", timeout=30)
    except requests.RequestException as error:
        # One failed request must not abort the loop and discard earlier scores.
        print(f"Request for {query} failed: {error}")
    else:
        soup = BeautifulSoup(r.content, 'html.parser')
        # The score is read from the alt text of the badge image. Each lookup
        # is guarded because a page may lack the badge, the image or the number.
        walkScoreImg = soup.find("div", attrs={"class": "block-header-badge score-info-link"})
        img = walkScoreImg.find("img") if walkScoreImg is not None else None
        imgAlt = img.get('alt') if img is not None else None
        match = re.search(r'\d+', imgAlt) if imgAlt else None
        if match is not None:
            walkScore = int(match.group())
        else:
            print(f"No walk score found for {query}")

    # Always append so the list keeps one entry per city_info item.
    walkabilityScores.append(walkScore)
    pprint(query)

# Positional assignment: assumes city_info has one entry per mainFrame row, in
# row order (pandas raises ValueError on a length mismatch).
mainFrame["walkability_score"] = walkabilityScores

In [None]:
# Use the Google Places API provided by Google Maps to look for places with CrossFit
# In order to properly add the has_crossfit column to the DataFrame, we need to do some
# string magic to be able to use the .loc() function later on.

def capitalize_last_word(input_string):
    """Return input_string with its final word capitalized.

    The string is split on whitespace and re-joined with single spaces, so
    leading, trailing and repeated whitespace is collapsed. Only the last
    word is changed (first letter upper-cased, the rest lower-cased); an
    empty or whitespace-only string yields ''.
    """
    tokens = input_string.split()

    # Nothing to capitalize when there are no words at all
    if not tokens:
        return ''

    head, tail = tokens[:-1], tokens[-1]
    return ' '.join(head + [tail.capitalize()])

file_path = 'gmaps_key.txt'
with open(file_path, 'r') as file:
    # strip() removes a trailing newline/whitespace that would otherwise be
    # sent to the API as part of the key.
    key = file.read().strip()
gmaps = googlemaps.Client(key = key)

# The raw API responses are cached below; make sure the target folder exists.
os.makedirs('./crossfit', exist_ok=True)

# city_info keys are lower-cased city names with whitespace replaced by "-".
# Building the same slug from the DataFrame's own city column lets rows be
# matched directly. Rebuilding the title from the slug does not round-trip
# for every name (e.g. "Ho Chi Minh City" would come back as "Ho chi minh City").
city_slugs = mainFrame['city'].str.lower().str.replace(r'\s', '-', regex=True)

for city, country in city_info.items():
    city_geocode = gmaps.geocode(f"{city}, {country}")
    if not city_geocode:
        # No geocoding result means there is no location to search around.
        pprint(f"Could not geocode {city}, {country}; skipping")
        continue
    places = gmaps.places(query = "CrossFit", location = city_geocode[0]["geometry"]["location"], radius = 5000)

    # Human-readable name, used only for the log messages below.
    city_title = city.capitalize()
    city_title = re.sub(r'-', ' ', city_title)
    city_title = capitalize_last_word(city_title)

    # If at least one result is returned, it is safe to assume that the city
    # has CrossFit, so has_crossfit for that city is set to True.
    has_crossfit = len(places["results"]) > 0
    if has_crossfit:
        pprint(f"{city_title}, {country} has CrossFit")
    else:
        pprint(f"{city_title}, {country} does not have CrossFit")
    mainFrame.loc[city_slugs == city, 'has_crossfit'] = has_crossfit

    # As the Google Maps API is rate limited, we output the files to use later
    with open(f'./crossfit/{city}-{country}.json', 'w') as fp:
        json.dump(places, fp)

In [None]:
# This cell was used to sequentially read the output .json files for further processing
# I am fully aware that this can all be done when we first call the gmaps.places()
# query, but I am too lazy to keep track of rate limits and felt that this would be
# a better option.

folder_path = "./crossfit/"

# Same helper as in the previous cell, repeated here
def capitalize_last_word(input_string):
    """Capitalize the last whitespace-separated word of input_string.

    Words are re-joined with single spaces; a string without any words
    yields ''.
    """
    pieces = input_string.split()
    if pieces:
        # Swap the final word for its capitalized form
        pieces.append(pieces.pop().capitalize())
    return ' '.join(pieces)

# Slugs of the DataFrame's city names (lower case, whitespace -> "-"). The
# cached files are named "<city slug>-<country>.json", so rows are matched on
# the slug. Rebuilding a display name from the file name does not round-trip
# for every city (e.g. "Ho Chi Minh City").
city_slugs = mainFrame['city'].str.lower().str.replace(r'\s', '-', regex=True)

# A list can only be stored in a single cell of an object-dtype column, so
# make sure such a column exists before the loop assigns into it.
if 'crossfit_gyms' not in mainFrame.columns:
    mainFrame['crossfit_gyms'] = pd.Series(index=mainFrame.index, dtype=object)
else:
    mainFrame['crossfit_gyms'] = mainFrame['crossfit_gyms'].astype(object)

# Check that the folder exists before listing it
if os.path.isdir(folder_path):
    # Loop through every file in the folder (sorted for a stable output order)
    for file_name in sorted(os.listdir(folder_path)):
        # Only .json files are processed
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)

        # Drop the extension and the trailing "-<country>" part. rpartition
        # does not depend on the country part being exactly two characters.
        city_slug = os.path.splitext(file_name)[0].rpartition('-')[0]

        # Human-readable name, used only for the log messages below
        name_from_file = capitalize_last_word(re.sub(r'-', ' ', city_slug.capitalize()))

        # Check that the city extracted from the file name is in the DataFrame
        matching_index = mainFrame.index[city_slugs == city_slug]
        if len(matching_index) > 0:
            # Use the index value of the first matching row
            index_value = matching_index[0]
            print(f"{name_from_file} in DataFrame at Index {index_value}")

            # Open the .json file and collect the names of the CrossFit gyms
            with open(file_path, 'r') as file:
                result = json.load(file)
            crossfit_gyms = [gym["name"] for gym in result["results"]]
            mainFrame.at[index_value, 'crossfit_gyms'] = crossfit_gyms
        else:
            print(f"{name_from_file} not in DataFrame")
else:
    print(f"{folder_path} does not exist; no gym lists were loaded")

# Output the DataFrame to the dataset.csv file
mainFrame.to_csv("dataset.csv")