**Load Data**

In [1]:
# load the country codes
from utils.geocode_utils import get_country_code_lookup

codes = get_country_code_lookup()

In [2]:
# load reviews
import kagglehub
import pandas as pd
import numpy as np
import swifter
from timeit import timeit
import os

path = kagglehub.dataset_download("christopheiv/winemagdata130k")
fname = 'winemag-data-130k-v2.csv'
reviews = pd.read_csv(os.path.join(path, fname))
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [3]:
# extract the locations to augment
location_cols = ['winery', 'region_1', 'region_2', 'province', 'country']
wineries = np.unique(reviews.winery.dropna())
locations = reviews.query('winery in @wineries')[location_cols].copy().drop_duplicates()
print(f'{locations.shape[0] - len(wineries):,d}', 'duplicate winery names in different locations' )
locations

13,659 duplicate winery names in different locations


Unnamed: 0,winery,region_1,region_2,province,country
0,Nicosia,Etna,,Sicily & Sardinia,Italy
1,Quinta dos Avidagos,,,Douro,Portugal
2,Rainstorm,Willamette Valley,Willamette Valley,Oregon,US
3,St. Julian,Lake Michigan Shore,,Michigan,US
4,Sweet Cheeks,Willamette Valley,Willamette Valley,Oregon,US
...,...,...,...,...,...
129940,Standish,Mendocino,,California,US
129941,Apriori,Mendocino County,,California,US
129945,Birichino,Santa Ynez Valley,Central Coast,California,US
129947,Feudo Principi di Butera,Terre Siciliane,,Sicily & Sardinia,Italy


**Method to throttle**

In [4]:
def lookup(name):
  return codes[name] if name in codes else None

**Make it fast**

In [5]:
from timeit import default_timer as timer
start = timer()
reviews['code'] = reviews['country'].swifter.apply(lookup)
elapsed = timer() - start
print(f'{reviews.shape[0]:,d}', 'rows augmented in', f'{elapsed:.3f}', 'seconds.', f'{elapsed/reviews.shape[0]*1_000:.5f}', 'ms per row.')

Pandas Apply:   0%|          | 0/129971 [00:00<?, ?it/s]

129,971 rows augmented in 0.193 seconds. 0.00148 ms per row.


### Throttling Logic

In [6]:
rate = 10                       # 10 calls per second

expected = 1000 / (60*rate)     # time to process 1000 rows, in minutes

print(f'processing time for {1000:,d} records is expected to take {expected:.2f} minutes')

processing time for 1,000 records is expected to take 1.67 minutes


In [7]:
from ratelimit import limits, sleep_and_retry

# throttled variant of fast operation
@sleep_and_retry
@limits(calls=10, period=1)  # Adjust rate limits as needed
def throttled(name):
  return lookup(name)

In [8]:
n = 1000
samples = reviews['country'].sample(1000)
start = timer()
reviews['code'] = samples.swifter.apply(throttled)
elapsed = timer() - start
print(f'{samples.shape[0]:,d}', 'rows augmented in', f'{elapsed/60:.2f}', 'minutes.', f'{elapsed/samples.shape[0]*1_000:.5f}', 'ms per row.')

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

1,000 rows augmented in 1.87 minutes. 112.31991 ms per row.


In [14]:
# optimal use of throttling
round(locations.shape[0] / (60*rate), 1)

50.7

In [15]:
# realized throttling
round((elapsed / n * locations.shape[0]) / 60, 1)

56.9

## Tests

In [16]:
import unittest

class CountryCodeLookupTest(unittest.TestCase):

  def test_lookup_valid_country(self):
    self.assertEqual(lookup('France'), 'FR')
    self.assertEqual(lookup('United States Of America'), 'US')

  def test_lookup_unknown_country(self):
    self.assertIsNone(lookup('United States'))

  def test_overrides(self):
    self.assertEqual(lookup('US'), 'US')



In [17]:
class CountryCodeAugmentationTest(unittest.TestCase):

  def test_codes_exist(self):
    self.assertTrue('code' in reviews.columns)
    self.assertEqual(reviews['country'].dropna().count(), reviews['code'].dropna().count())
    self.assertTrue(np.all(np.vectorize(lookup)(reviews['country'].dropna()) == reviews['code'].dropna().to_numpy()))


In [18]:
if __name__ == '__main__':
    unittest.main(argv=[''], verbosity=2, exit=False)

# https://hamatti.org/posts/unit-test-your-python-code-in-jupyter-notebooks/

test_codes_exist (__main__.CountryCodeAugmentationTest.test_codes_exist) ... FAIL
test_lookup_unknown_country (__main__.CountryCodeLookupTest.test_lookup_unknown_country) ... ok
test_lookup_valid_country (__main__.CountryCodeLookupTest.test_lookup_valid_country) ... ok
test_overrides (__main__.CountryCodeLookupTest.test_overrides) ... ok

FAIL: test_codes_exist (__main__.CountryCodeAugmentationTest.test_codes_exist)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/var/folders/y8/_pw0hcc137n3wqx4y4cz33100000gq/T/ipykernel_27311/372395469.py", line 5, in test_codes_exist
    self.assertEqual(reviews['country'].dropna().count(), reviews['code'].dropna().count())
AssertionError: np.int64(129908) != np.int64(998)

----------------------------------------------------------------------
Ran 4 tests in 0.022s

FAILED (failures=1)
