**Load Data**

In [1]:
# load reviews
import kagglehub
import pandas as pd
import numpy as np
import os
import swifter

# fetch the dataset directory, then read the reviews CSV that lives inside it
fname = "winemag-data-130k-v2.csv"
path = kagglehub.dataset_download("christopheiv/winemagdata130k")
csv_path = os.path.join(path, fname)
reviews = pd.read_csv(csv_path)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [2]:
# extract the locations to augment
location_cols = ["winery", "region_1", "region_2", "province", "country"]
wineries = np.unique(reviews.winery.dropna())

# keep one row per distinct combination of the location columns
locations = reviews.loc[reviews["winery"].isin(wineries), location_cols].drop_duplicates()

# rows beyond one-per-winery: the same winery name listed with several locations
extra_rows = locations.shape[0] - len(wineries)
print(
    f"{extra_rows:,d}",
    "duplicate winery names in different locations",
)
locations

13,659 duplicate winery names in different locations


Unnamed: 0,winery,region_1,region_2,province,country
0,Nicosia,Etna,,Sicily & Sardinia,Italy
1,Quinta dos Avidagos,,,Douro,Portugal
2,Rainstorm,Willamette Valley,Willamette Valley,Oregon,US
3,St. Julian,Lake Michigan Shore,,Michigan,US
4,Sweet Cheeks,Willamette Valley,Willamette Valley,Oregon,US
...,...,...,...,...,...
129940,Standish,Mendocino,,California,US
129941,Apriori,Mendocino County,,California,US
129945,Birichino,Santa Ynez Valley,Central Coast,California,US
129947,Feudo Principi di Butera,Terre Siciliane,,Sicily & Sardinia,Italy


**Method to throttle: `get_country_code`**

In [3]:
# import the country-code lookup helper
from utils.geocode_utils import get_country_code

**Make it blazingly fast**

In [4]:
from timeit import default_timer as timer

# time the lookup over the whole country column, with no rate limiting applied
start = timer()
reviews["code"] = reviews["country"].swifter.apply(get_country_code)
elapsed = timer() - start

row_count = reviews.shape[0]
ms_per_row = elapsed / row_count * 1_000
print(
    f"{row_count:,d}",
    "rows augmented in",
    f"{elapsed:.3f}",
    "seconds.",
    f"{ms_per_row:.5f}",
    "ms per row.",
)

Pandas Apply:   0%|          | 0/129971 [00:00<?, ?it/s]

129,971 rows augmented in 0.252 seconds. 0.00194 ms per row.


### Throttling Logic

In [5]:
rate = 10  # allowed calls per second
n_records = 1_000  # batch size the estimate is quoted for

# minutes needed to process `n_records` rows at `rate` calls per second
expected = n_records / (60 * rate)

print(
    f"processing time for {n_records:,d} records is expected to take {expected:.2f} minutes"
)

processing time for 1,000 records is expected to take 1.67 minutes


In [6]:
from ratelimit import limits, sleep_and_retry


# throttled variant of fast operation
@sleep_and_retry
@limits(calls=rate, period=1)  # at most `rate` calls per 1-second window
def throttled(name):
    """Look up the country code for ``name`` under the notebook's rate limit.

    ``limits`` caps this wrapper at ``rate`` calls per one-second period, the
    same ``rate`` the efficiency estimates in the following cells are based on.
    ``sleep_and_retry`` sleeps until the window resets and retries rather than
    surfacing the rate-limit exception to the caller.

    Returns whatever ``get_country_code(name)`` returns.
    """
    return get_country_code(name)

In [7]:
n = 1000  # sample size; the efficiency cell below divides the elapsed time by this
# fixed seed so the sampled rows (and the preview table further down) are reproducible
samples = reviews.sample(n, random_state=42)

# time the same lookup as before, this time through the rate-limited wrapper
start = timer()
samples["code"] = samples["country"].swifter.apply(throttled)
elapsed = timer() - start
print(
    f"{samples.shape[0]:,d}",
    "rows augmented in",
    f"{elapsed/60:.2f}",
    "minutes.",
    f"{elapsed/samples.shape[0]*1_000:.5f}",
    "ms per row.",
)

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

1,000 rows augmented in 1.87 minutes. 112.24976 ms per row.


In [8]:
# optimal while still complying with limit:
# minutes to cover every location row when running exactly at the allowed rate
calls_per_minute = 60 * rate
optimal = round(locations.shape[0] / calls_per_minute, 1)
optimal

50.7

In [9]:
# realized throttling:
# scale the measured per-row time of the sample up to every location row (minutes)
seconds_per_row = elapsed / n
realized = round((seconds_per_row * locations.shape[0]) / 60, 1)
print(realized, "\n")

# share of the rate-limit ceiling that the throttled run actually achieved
efficiency = optimal / realized

assert efficiency < 1.0

print(f"{efficiency:.1%}", "efficiency relative to rate limit")

56.9 

89.1% efficiency relative to rate limit


In [10]:
# preview the location columns next to the code each sampled row received
samples[[*location_cols, "code"]].head()

Unnamed: 0,winery,region_1,region_2,province,country,code
61366,Girasole,Mendocino,,California,US,US
97961,Francis Ford Coppola,Sonoma Coast,Sonoma,California,US,US
38724,Domaine Ostertag,Alsace,,Alsace,France,FR
62414,Hedgeline,Washington,Washington Other,Washington,US,US
93542,Offley,,,Port,Portugal,PT


### Example of getting rate limit exceptions

``` python
import requests
from ratelimit import limits, RateLimitException
from backoff import on_exception, expo
from datetime import datetime, timedelta

@on_exception(expo, RateLimitException, max_tries=8)
# `limits` takes its period as a number of seconds, so convert the timedelta
@limits(calls=1000, period=timedelta(days=1).total_seconds())
def make_api_call():
    """Fetch the example endpoint and return its decoded JSON body.

    Raises ``RateLimitException`` (via ``limits``) once the daily quota is
    used up; ``on_exception`` retries that with exponential backoff for up to
    8 tries before letting it propagate. Raises ``Exception`` when the
    response status is not 200.
    """
    # explicit timeout so a stalled connection cannot hang the loop forever
    response = requests.get("https://api.example.com/data", timeout=10)
    if response.status_code != 200:
        raise Exception("API call failed")
    return response.json()

# Example usage
for i in range(100):
    try:
        data = make_api_call()
        # Process the data
    except RateLimitException:
        # only reached after the backoff decorator has exhausted its tries
        print("Rate limit exceeded. Retrying...")
```

## Tests

In [11]:
import unittest


class CountryCodeLookupTest(unittest.TestCase):
    """Spot checks of ``get_country_code`` on individual country names."""

    def test_lookup_valid_country(self):
        # full country names resolve to their code
        known = (("France", "FR"), ("United States Of America", "US"))
        for country, code in known:
            self.assertEqual(get_country_code(country), code)

    def test_lookup_unknown_country(self):
        # this spelling is expected to produce no match
        result = get_country_code("United States")
        self.assertIsNone(result)

    def test_overrides(self):
        # the "US" label, as it appears in the reviews data, maps to itself
        result = get_country_code("US")
        self.assertEqual(result, "US")

In [12]:
class CountryCodeAugmentationTest(unittest.TestCase):
    """Checks that the throttled run filled ``samples["code"]`` correctly."""

    def test_codes_exist(self):
        # assertIn reports the actual columns if the check fails
        self.assertIn("code", samples.columns)

        # as many codes as there are rows with a country
        has_country = samples["country"].notna()
        self.assertEqual(has_country.sum(), samples["code"].notna().sum())

        # compare row by row using one shared mask, so a missing code can never
        # shift the two sides out of alignment (and an empty sample is fine)
        expected_codes = samples.loc[has_country, "country"].map(get_country_code)
        actual_codes = samples.loc[has_country, "code"]
        self.assertTrue((expected_codes.to_numpy() == actual_codes.to_numpy()).all())

In [13]:
# run every TestCase defined above from inside the notebook kernel
if __name__ == "__main__":
    # argv=[""] keeps unittest from parsing the kernel's own command line;
    # exit=False stops it from calling sys.exit() once the run finishes
    unittest.main(exit=False, verbosity=2, argv=[""])

# References:
# https://hamatti.org/posts/unit-test-your-python-code-in-jupyter-notebooks/
# https://medium.com/@rajeshpillai/api-rate-limiting-2542c2a90b38

test_codes_exist (__main__.CountryCodeAugmentationTest.test_codes_exist) ... ok
test_lookup_unknown_country (__main__.CountryCodeLookupTest.test_lookup_unknown_country) ... ok
test_lookup_valid_country (__main__.CountryCodeLookupTest.test_lookup_valid_country) ... ok
test_overrides (__main__.CountryCodeLookupTest.test_overrides) ... ok

----------------------------------------------------------------------
Ran 4 tests in 0.007s

OK
