In [15]:
# load reviews
import os
import kagglehub
import pandas as pd
import numpy as np
import swifter

In [16]:
# load the reviews
path = kagglehub.dataset_download("christopheiv/winemagdata130k")
fname = "winemag-data-130k-v2.csv"
reviews = pd.read_csv(os.path.join(path, fname), index_col=0)

# load the country codes
from utils.geocode_utils import get_country_code
reviews["code"] = reviews["country"].swifter.apply(get_country_code)
reviews[['winery', 'province', 'country', 'code']].head()

Pandas Apply:   0%|          | 0/129971 [00:00<?, ?it/s]

Unnamed: 0,winery,province,country,code
0,Nicosia,Sicily & Sardinia,Italy,IT
1,Quinta dos Avidagos,Douro,Portugal,PT
2,Rainstorm,Oregon,US,US
3,St. Julian,Michigan,US,US
4,Sweet Cheeks,Oregon,US,US


In [17]:
# extract the winery locations to geolocate
location_cols = ['winery', 'region_1', 'region_2', 'province', 'country', 'code']
wineries = np.unique(reviews.winery.dropna())
locations = reviews.query("winery in @wineries")[location_cols].copy().drop_duplicates()
print(
    f"{locations.shape[0] - len(wineries):,d}",
    "duplicate winery names in different locations",
)
locations.info()

13,659 duplicate winery names in different locations
<class 'pandas.core.frame.DataFrame'>
Index: 30416 entries, 0 to 129952
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   winery    30416 non-null  object
 1   region_1  26697 non-null  object
 2   region_2  10023 non-null  object
 3   province  30389 non-null  object
 4   country   30389 non-null  object
 5   code      30389 non-null  object
dtypes: object(6)
memory usage: 1.6+ MB


In [18]:
winery_locations_by_country = locations.groupby(['winery', 'code']).count()[['country']].rename(columns={'country': 'count'}).reset_index()
winery_locations_by_country

Unnamed: 0,winery,code,count
0,1+1=3,ES,2
1,10 Knots,US,2
2,100 Percent Wine,US,1
3,1000 Stories,US,2
4,1070 Green,US,1
...,...,...,...
16929,Órale,US,1
16930,Öko,AR,1
16931,Ökonomierat Rebholz,DE,1
16932,àMaurice,US,3


In [19]:
wineries_with_only_1_location_per_country = winery_locations_by_country.query('count == 1').drop(columns=['count']).set_index('winery')
print(f'{wineries_with_only_1_location_per_country.shape[0]/locations.shape[0]:.1%}', 'of the wineries have only 1 operation per country')
wineries_with_only_1_location_per_country

35.2% of the wineries have only 1 operation per country


Unnamed: 0_level_0,code
winery,Unnamed: 1_level_1
100 Percent Wine,US
1070 Green,US
12C Wines,US
13 Celsius,NZ
1752 Signature Wines,FR
...,...
Ñandú,AR
Órale,US
Öko,AR
Ökonomierat Rebholz,DE


In [20]:
wineries_with_multiple_locations_per_country = winery_locations_by_country.query('count > 1').sort_values('count', ascending=False)
wineries_with_multiple_locations_per_country

Unnamed: 0,winery,code,count
10328,Louis Latour,FR,43
10327,Louis Jadot,FR,35
9071,Jean-Luc and Paul Aegerter,FR,35
9180,Joseph Drouhin,FR,34
267,Albert Bichot,FR,33
...,...,...,...
75,A. de Luze et Fils,FR,2
16876,Zorzal,AR,2
16880,Zotovich Cellars,US,2
16885,Zull,AT,2


In [21]:
# some location details are not clean
locations[locations['winery'] == 'Wines & Winemakers'].drop_duplicates()

Unnamed: 0,winery,region_1,region_2,province,country,code
737,Wines & Winemakers,,,Península de Setúbal,Portugal,PT
1177,Wines & Winemakers,,,Douro,Portugal,PT
1617,Wines & Winemakers,,,Tejo,Portugal,PT
2273,Wines & Winemakers,,,Vinho Verde,Portugal,PT
2899,Wines & Winemakers,,,Alentejano,Portugal,PT
5431,Wines & Winemakers,,,Bairrada,Portugal,PT
10145,Wines & Winemakers,,,Palmela,Portugal,PT
13429,Wines & Winemakers,,,Dão,Portugal,PT
50314,Wines & Winemakers,,,Port,Portugal,PT
61778,Wines & Winemakers,,,Setubal,Portugal,PT


In [22]:
# some wineries operate in multiple countries, each with multiple locations
winery_locations_by_country.set_index('winery').loc['Baron Philippe de Rothschild']

Unnamed: 0_level_0,code,count
winery,Unnamed: 1_level_1,Unnamed: 2_level_1
Baron Philippe de Rothschild,CL,3
Baron Philippe de Rothschild,FR,7


In [23]:
# location details for a winery with operations in multiple countries and locations
reviews[location_cols].set_index('winery').loc['Baron Philippe de Rothschild'].drop_duplicates()

Unnamed: 0_level_0,region_1,region_2,province,country,code
winery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baron Philippe de Rothschild,Bordeaux,,Bordeaux,France,FR
Baron Philippe de Rothschild,Graves,,Bordeaux,France,FR
Baron Philippe de Rothschild,Bordeaux Blanc,,Bordeaux,France,FR
Baron Philippe de Rothschild,Saint-Émilion,,Bordeaux,France,FR
Baron Philippe de Rothschild,Sauternes,,Bordeaux,France,FR
Baron Philippe de Rothschild,Bordeaux Rosé,,Bordeaux,France,FR
Baron Philippe de Rothschild,,,Maipo Valley,Chile,CL
Baron Philippe de Rothschild,,,Central Valley,Chile,CL
Baron Philippe de Rothschild,Médoc,,Bordeaux,France,FR
Baron Philippe de Rothschild,,,Chile,Chile,CL


In [36]:
wineries_with_only_1_location_per_country

Unnamed: 0_level_0,code
winery,Unnamed: 1_level_1
100 Percent Wine,US
1070 Green,US
12C Wines,US
13 Celsius,NZ
1752 Signature Wines,FR
...,...
Ñandú,AR
Órale,US
Öko,AR
Ökonomierat Rebholz,DE


**Use direct http requests to geocode the wineries**

In [46]:
import requests

def geocode_by_winery_and_country(name, code):
  url = lambda endpoint: f'https://api.mapbox.com/{endpoint}'
  endpoint = 'search/searchbox/v1/forward'
  params = dict(q=name, country=code, poi_category='winery', language='en', access_token='LETMEIN')
  r = requests.get(url(endpoint), params=params)
  print(r.status_code, r.headers)
  return r.json()

In [48]:
geocode_by_winery_and_country('100 Percent Wine', 'US')

401 {'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '44', 'Connection': 'keep-alive', 'Date': 'Mon, 23 Dec 2024 07:30:23 GMT', 'X-Powered-By': 'Express', 'Access-Control-Allow-Origin': '*', 'X-Content-Type-Options': 'nosniff', 'ETag': 'W/"2c-mbs4WeZIt3tmYvk6HtTC1rbvKjQ"', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 b0797f10be715dcb685d992d17347df4.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'SFO53-P7', 'Alt-Svc': 'h3=":443"; ma=86400', 'X-Amz-Cf-Id': 'p43qtnG-4dE9gEf6ZZrnKkhmdkZl6q9rhsBlRxgoQkyrZSaqLIojiw=='}


dict

## Tests

In [25]:
import unittest


class GeocodeWineryTestCase(unittest.TestCase):

    def test_invalid_request(self):
        print('not implemented')
        
    def test_not_authenticated(self):
        print('not implemented')
        
    def test_lookup_valid_winery(self):
        print("not implemented")

    def test_lookup_unknown_winery(self):
        print("not implemented")

    def test_multiple_matches(self):
        print("not implemented")

    def test_throttling_error(self):
        print("not implemented")


if __name__ == "__main__":
    unittest.main(argv=[""], verbosity=2, exit=False)

test_lookup_unknown_winery (__main__.GeocodeWineryTestCase.test_lookup_unknown_winery) ... ok
test_lookup_valid_winery (__main__.GeocodeWineryTestCase.test_lookup_valid_winery) ... ok
test_multiple_matches (__main__.GeocodeWineryTestCase.test_multiple_matches) ... ok
test_throttling_error (__main__.GeocodeWineryTestCase.test_throttling_error) ... ok

----------------------------------------------------------------------
Ran 4 tests in 0.004s

OK


not implemented
not implemented
not implemented
not implemented
