In [1]:
import pandas as pd

In [2]:
with open('geo_data/cols.txt') as fp:
    headers = [line.split(':')[0].strip() for line in fp]
cols = ['name', 'country code', 'timezone', 'population']
df = pd.read_csv(filepath_or_buffer='geo_data/allCountries.txt',
                 sep='\t', names=headers, usecols=cols)


In [3]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12368093 entries, 0 to 12368092
Data columns (total 4 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   name          object
 1   country code  object
 2   population    int64 
 3   timezone      object
dtypes: int64(1), object(3)
memory usage: 2.5 GB


In [4]:
pop_lim = 1000
best_matches = df.sort_values(by='population', ascending=False).drop_duplicates(subset='name') \
.rename(columns={'country code':'country_code'}) \
.query('population > @pop_lim and not timezone.isna() and not country_code.isna()') \
.query('name.str.fullmatch(r"[ -~]+")') \
.reset_index(drop=True)

In [5]:
country = pd.read_csv('geo_data/country_info.txt', sep='\t', usecols=['ISO', 'Country', 'Continent'])

In [6]:
locations_with_country = best_matches.merge(right=country, left_on='country_code', right_on='ISO')

In [7]:
continent = pd.read_csv('geo_data/continent_codes.csv')

In [8]:
locations_with_continents = locations_with_country.merge(right=continent, left_on='Continent', right_on='Code') \
.drop(columns=['country_code', 'ISO', 'Continent', 'Code']) \
.rename(columns={'Name':'continent', 'Country': 'country'})

In [9]:
from datetime import datetime
from zoneinfo import ZoneInfo
tz_map = pd.DataFrame(dict(timezone=list(set(locations_with_continents.timezone))))
tz_map['tz_name'] = tz_map.timezone.apply(lambda tz: datetime.now(tz=ZoneInfo(tz)).strftime('%Z')) # zone name
tz_map['utc_offset'] = tz_map.timezone.apply(lambda tz: datetime.now(tz=ZoneInfo(tz)).strftime('%z')) # zone offset

In [10]:
final_df = locations_with_continents.merge(right=tz_map, on='timezone')
new_index = ['name', 'population', 'country', 'continent', 'timezone', 'tz_name', 'utc_offset']
type_map = {item: 'category' for item in ('country', 'continent', 'timezone', 'tz_name')}
type_map = type_map | {'name': 'string', 'utc_offset': 'category'}
final_df = final_df.reindex(columns=new_index).astype(type_map)
final_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 123638 entries, 0 to 123637
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   name        123638 non-null  string  
 1   population  123638 non-null  int64   
 2   country     123638 non-null  category
 3   continent   123638 non-null  category
 4   timezone    123638 non-null  category
 5   tz_name     123638 non-null  category
 6   utc_offset  123638 non-null  category
dtypes: category(5), int64(1), string(1)
memory usage: 10.9 MB


In [13]:
final_df.to_parquet(path='geo_data/complete_data.parquet')