In [1]:
import csv
from pathlib import Path
from dotenv import load_dotenv
import os
import pymongo
import uuid
import random
import json


In [2]:
db_client = pymongo.MongoClient('mongodb://localhost:27017')
collections = db_client['db_ai_travel_planner']
tb_city_org = collections['tb_city_org']

In [3]:
IMAGE_URI = 'https://ak-d.tripcdn.com/images/'  #used to replace url when exporting data

In [4]:
def load_csv(filepath):
    data = []
    with open(filepath, 'r', newline='', encoding='utf-8') as csvfile:  # Handle encoding!
        reader = csv.reader(csvfile)  # Or csv.DictReader for dictionaries
        header = next(reader)  # Read the header row (if it exists)
        for row in reader:
            data.append(row)  # Or data.append(dict(zip(header, row))) for DictReader
    return header, data  # Return header and data

In [5]:
continent_map = {}  #key: country, value: continent
continents = {} #key: continent, value: 1
#read continent info
header, data = load_csv(Path("./countries.csv"))
for row in data:
    continent = row[12].replace('Americas', 'America').replace('Oceania', 'Australia').lower()
    if continent != '' and continent != 'Polar':
        continent_map[row[1]] = continent
#print(continent_map)

In [6]:
header, data = load_csv(Path("./city_country_shortened.csv"))
for row in data:
    if row[1] in continent_map:
        #print(row[0] + ',' + row[1] + ',' + continent_map[row[1]])
        a = 1
    else:
        print('Not found continent for this country ------ ' + row[1]) #replace country name in file countries.csv -> run code again to update the map


In [7]:
continents = ['asia', 'america', 'africa', 'australia', 'europe']

In [8]:
#select most commented cities for HOMEPAGE
MIN_REVIEW_COUNT = 100

def generate_cities_for_homepage():
    homepageCityUUIDs = []  #list of city UUIDs that has image Urls
    home_cities = {}
    #top banner (randomly pick a city in europe) ~ 100 cities
    query = {'continent':'europe', 'error': None, 'review': {'$gt': MIN_REVIEW_COUNT}}
    top_review_cities = tb_city_org.find(query).sort({'review': -1}).limit(100)
    total_docs = tb_city_org.count_documents(query)
    rand_top_index = random.randint(0, total_docs)
    top_banner_city = top_review_cities[rand_top_index]
    homepageCityUUIDs.append(top_banner_city['uuid'])
    home_cities['top_banner'] = {'n': top_banner_city['name'], 'c': top_banner_city['country']}
    #print('Top banner city: ' + top_banner_city['name'] + ' country: ' + top_banner_city['country'])
    #random pick (randomly pick a city in america or asia) ~ 100 cities
    query = {'$or': [{'continent':'america'}, {'continent':'asia'}], 'error': None, 'review': {'$gt': MIN_REVIEW_COUNT}}
    random_pick_cities = tb_city_org.find(query).sort({'review': -1}).limit(100)
    total_docs = tb_city_org.count_documents(query)
    rand_top_index = random.randint(0, total_docs)
    rand_pick_city = random_pick_cities[rand_top_index]
    homepageCityUUIDs.append(rand_pick_city['uuid'])
    home_cities['random_pick'] = {'n': rand_pick_city['name'], 'c': rand_pick_city['country']}
    #print('Random pick city: ' + rand_pick_city['name'] + ' country: ' + rand_pick_city['country'])
    #each continent get randomly 20 cities (get 20 cities, sort by review count)
    for continent in continents:
        cities_in_continent = []
        query = {'continent':continent, 'error': None, 'review': {'$gt': MIN_REVIEW_COUNT}}
        pipeline = [
            {"$match": query},
            {"$sample": {"size": 20}}  #X random documents to show in homepage
        ]
        cities = list(tb_city_org.aggregate(pipeline))
        for city in cities:
            homepageCityUUIDs.append(city['uuid'])
            cities_in_continent.append({'n': city['name'], 'c': city['country']})
        home_cities[continent] = cities_in_continent
    #
    return home_cities, homepageCityUUIDs


In [11]:
continent_keys = {  #map for reducing exported file
    'asia': 1, 
    'america': 2, 
    'africa': 3, 
    'australia': 4, 
    'europe': 5
}

def map_continent_key(str_continent):
    return continent_keys[str_continent]

In [27]:
#
def generate_cities_for_homepage_not_random():
    homepageCityUUIDs = []  #list of city UUIDs that has image Urls
    home_cities = {}
    #top banner (randomly pick a city in europe) ~ 100 cities
    query = {'name': 'London', 'country':'United Kingdom', 'continent':'europe'}
    top_review_cities = tb_city_org.find_one(query)
    homepageCityUUIDs.append(top_review_cities['uuid'])
    home_cities['top_banner'] = {'n': top_review_cities['name'], 'c': top_review_cities['country']}
    #random pick (randomly pick a city in america or asia) ~ 100 cities
    query = {'name': 'Phuket', 'country':'Thailand', 'continent':'asia'}
    random_pick_cities = tb_city_org.find_one(query)
    homepageCityUUIDs.append(random_pick_cities['uuid'])
    home_cities['random_pick'] = {'n': random_pick_cities['name'], 'c': random_pick_cities['country']}
    #each continent get 20 cities
    home_cities['asia'] = [
        {'n': 'Tokyo', 'c': 'Japan'},
        {'n': 'Delhi', 'c': 'India'},
        {'n': 'Shanghai', 'c': 'China'},
        {'n': 'Beijing', 'c': 'China'},
        {'n': 'Mumbai', 'c': 'India'},
        {'n': 'Karachi', 'c': 'Pakistan'},
        {'n': 'Dhaka', 'c': 'Bangladesh'},
        {'n': 'Guangzhou', 'c': 'China'},
        {'n': 'Shenzhen', 'c': 'China'},
        {'n': 'Jakarta', 'c': 'Indonesia'},
        {'n': 'Manila', 'c': 'Philippines'},
        {'n': 'Lahore', 'c': 'Pakistan'},
        {'n': 'Bangkok', 'c': 'Thailand'},
        {'n': 'Ho Chi Minh City', 'c': 'Vietnam'},
        {'n': 'Bangkok', 'c': 'Thailand'},
        {'n': 'Chongqing', 'c': 'China'},
        {'n': 'Tianjin', 'c': 'China'},
        {'n': 'Nanjing', 'c': 'China'},
        {'n': 'Wuhan', 'c': 'China'},
        {'n': 'Ahmedabad', 'c': 'India'}
    ]
    home_cities['europe'] = [
        {'n': 'Paris', 'c': 'France'},
        {'n': 'Madrid', 'c': 'Spain'},
        {'n': 'Berlin', 'c': 'Germany'},
        {'n': 'Athens', 'c': 'Greece'},
        {'n': 'Budapest', 'c': 'Hungary'},
        {'n': 'Rome', 'c': 'Italy'},
        {'n': 'Vienna', 'c': 'Austria'},
        {'n': 'Stockholm', 'c': 'Sweden'},
        {'n': 'Warsaw', 'c': 'Poland'},
        {'n': 'Amsterdam', 'c': 'Netherlands'},
        {'n': 'Copenhagen', 'c': 'Denmark'},
        {'n': 'Helsinki', 'c': 'Finland'},
        {'n': 'Dublin', 'c': 'Ireland'},
        {'n': 'Brussels', 'c': 'Belgium'},
        {'n': 'Oslo', 'c': 'Norway'},
        {'n': 'Riga', 'c': 'Latvia'},
        {'n': 'Zagreb', 'c': 'Croatia'},
        {'n': 'Lisbon', 'c': 'Portugal'},
        {'n': 'Luxembourg', 'c': 'Luxembourg'},
        {'n': 'San Marino', 'c': 'San Marino'}
    ]
    home_cities['america'] = [
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''}
    ]
    home_cities['australia'] = [
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''}
    ]
    home_cities['africa'] = [
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''},
        {'n': '', 'c': ''}
    ]
    #find UUIDs of cities
    for continent in continents:
        cities_in_continent = home_cities[continent]
        for city in cities_in_continent:
            query = {'continent':continent, 'name': city['n'], 'country': city['c']}
            city_detail = tb_city_org.find_one(query)
            if city_detail is not None and 'uuid' in city_detail:
                homepageCityUUIDs.append(city_detail['uuid'])
            else:
                print('Missing city: ' + city['n'])
        print(continent + ' : ' + str(len(homepageCityUUIDs)))
    #
    return home_cities, homepageCityUUIDs

In [28]:
#1. generate random cities to show in homepage -> this selected randomly which getting unpopular destinations
#home_cities, homepageCityUUIDs = generate_cities_for_homepage()
home_cities, homepageCityUUIDs = generate_cities_for_homepage_not_random()

print(home_cities)
print(str(len(homepageCityUUIDs)))

asia : 22
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
america : 22
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
africa : 22
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
Missing city: 
australia : 22
europe : 42
{'top_banner': {'n': 'London', 'c': 'U

In [23]:
##export ALL cities that had data in db -> the file size should limit to 2 MB (reduce key name)
#export all cities into json file
all_cities = [] #data to export
city_continent_map = {} #key: city, value: continent (just for counting how many cities in each continent)
for continent in continents:
    num_city_in_continent = 0
    db_city = tb_city.find({'continent':continent, 'error': None})
    for document in db_city:
        #count how many cities in a continent
        if document['name'] not in city_continent_map:
            num_city_in_continent += 1
        #save this city into the list
        if (document['name'] == home_cities['top_banner']['n'] and document['country'] == home_cities['top_banner']['c']) or (document['name'] == home_cities['random_pick']['n'] and document['country'] == home_cities['random_pick']['c']):
            newImgUrls = []
            for imgUrl in document['imgUrls']:
                newImgUrls.append(imgUrl.replace(IMAGE_URI, ''))
            #get full info
            all_cities.append({
                "n" : document['name'],
                "c" : document['country'],
                "o": map_continent_key(document['continent']),
                "t" : document['city_id'],
                "w" : document['wonder_id'],
                "r" : document['review'],
                "i" : document['img'].replace(IMAGE_URI, ''),  #remove domain to reduce file size
                'l': newImgUrls,
            })
        elif document['uuid'] in homepageCityUUIDs: #this belongs to homepage, need image
            #get info with 1 image, DO NOT save other images
            all_cities.append({
                "n" : document['name'],
                "c" : document['country'],
                "o": map_continent_key(document['continent']),
                "t" : document['city_id'],
                "w" : document['wonder_id'],
                "r" : document['review'],
                "i" : document['img'].replace(IMAGE_URI, '')
            })
        else:
            #get simple data for searching only (no other relevant info)
            minInfo = {
                "n" : document['name'],
                "c" : document['country'],
                "t" : document['city_id'],
                "w" : document['wonder_id']
            }
            if document['review'] != None and document['review'] > 10000:
                #those cities will be shown up in continent list, need image there
                minInfo['r'] = document['review']
                minInfo['i'] = document['img'].replace(IMAGE_URI, '')
                minInfo['o'] = map_continent_key(document['continent'])
            all_cities.append(minInfo)
    print(continent + ' : ' + str(num_city_in_continent))
#finish
print('All cities count: ' + str(len(all_cities)))
#export to json file
try:
    with open('city_db.json', 'w', encoding='utf-8') as f:  # Use utf-8 encoding
            json.dump(all_cities, f, separators=(',', ':'))  #no break line or extra spaces
except (TypeError, OSError, json.JSONDecodeError) as e:  # Catch potential errors
    print(f"Error exporting to JSON: {e}")

asia : 4115
america : 7780
africa : 1356
australia : 345
europe : 8217
All cities count: 21813
