In [1]:
# load reviews
import os
import kagglehub
import pandas as pd
import numpy as np
import swifter

from utils.geocode_utils import get_country_code, memory

In [2]:
# load the reviews
reviews = load_reviews()

# load the country codes
from utils.geocode_utils import get_country_code

reviews["code"] = reviews["country"].swifter.apply(get_country_code)
reviews[["winery", "province", "country", "code"]].head()

Pandas Apply:   0%|          | 0/129971 [00:00<?, ?it/s]

Unnamed: 0,winery,province,country,code
0,Nicosia,Sicily & Sardinia,Italy,IT
1,Quinta dos Avidagos,Douro,Portugal,PT
2,Rainstorm,Oregon,US,US
3,St. Julian,Michigan,US,US
4,Sweet Cheeks,Oregon,US,US


In [3]:
# extract the winery locations to geolocate
location_cols = ["winery", "region_1", "region_2", "province", "country", "code"]
wineries = np.unique(reviews.winery.dropna())
locations = reviews.query("winery in @wineries")[location_cols].copy().drop_duplicates()
print(
    f"{locations.shape[0] - len(wineries):,d}",
    "duplicate winery names in different locations",
)
locations.info()

13,659 duplicate winery names in different locations
<class 'pandas.core.frame.DataFrame'>
Index: 30416 entries, 0 to 129952
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   winery    30416 non-null  object
 1   region_1  26697 non-null  object
 2   region_2  10023 non-null  object
 3   province  30389 non-null  object
 4   country   30389 non-null  object
 5   code      30389 non-null  object
dtypes: object(6)
memory usage: 1.6+ MB


In [4]:
winery_locations_by_country = (
    locations.groupby(["winery", "code"])
    .count()[["country"]]
    .rename(columns={"country": "count"})
    .reset_index()
)
winery_locations_by_country

Unnamed: 0,winery,region_1,region_2,province,country,code
0,Nicosia,Etna,,Sicily & Sardinia,Italy,IT
1,Quinta dos Avidagos,,,Douro,Portugal,PT
2,Rainstorm,Willamette Valley,Willamette Valley,Oregon,US,US
3,St. Julian,Lake Michigan Shore,,Michigan,US,US
4,Sweet Cheeks,Willamette Valley,Willamette Valley,Oregon,US,US
...,...,...,...,...,...,...
129940,Standish,Mendocino,,California,US,US
129941,Apriori,Mendocino County,,California,US,US
129945,Birichino,Santa Ynez Valley,Central Coast,California,US,US
129947,Feudo Principi di Butera,Terre Siciliane,,Sicily & Sardinia,Italy,IT


In [5]:
# preview the changes
clean_duplicate_location_field_value(keep='country', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='region_1', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='country', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='country', clean='province', preview=True)

0 locations['region_2'] values are the same as locations['country']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_2'] values are the same as locations['province']
1119 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': ['California Other' 'New York Other' 'Oregon Other' 'Washington Other']

882 locations['region_2'] values are the same as locations['region_1']
983 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': ['California Other' 'New York Other' 'Oregon Other' 'Washington Other']

87 locations['region_1'] values are the same as locations['country']
0 locations['region_1'] values contain 'Other'
unique locations['region_1'] values that contain 'Other': []

1988 locations['region_1'] values are the same as locations['province']
6 locations['region_1'] values contain 'Other'
unique locations['region_

In [62]:
# cleanup
clean_duplicate_location_field_value(keep='country', clean='region_2')
clean_duplicate_location_field_value(keep='province', clean='region_2')
clean_duplicate_location_field_value(keep='region_1', clean='region_2')
clean_duplicate_location_field_value(keep='country', clean='region_1')
clean_duplicate_location_field_value(keep='province', clean='region_1')
clean_duplicate_location_field_value(keep='country', clean='province')

In [63]:
# verify the changes
clean_duplicate_location_field_value(keep='country', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='region_1', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='country', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='country', clean='province', preview=True)

0 locations['region_2'] values are the same as locations['country']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_2'] values are the same as locations['province']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_2'] values are the same as locations['region_1']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_1'] values are the same as locations['country']
0 locations['region_1'] values contain 'Other'
unique locations['region_1'] values that contain 'Other': []

0 locations['region_1'] values are the same as locations['province']
0 locations['region_1'] values contain 'Other'
unique locations['region_1'] values that contain 'Other': []

0 locations['province'] values are the same as locations['country']
0 locations['province'] values contain 'Othe

**Locations with the word `Vin` in region_1**

Some indicate city or wine growing regions, like `Vin Santo di Montepulciano` or `Vin de Pays des Côtes de Gascogne`, but others are not regions like `Vin Mousseux` or `Vin Santo del Chianti Classico`. Need to remove the meaningless values, and reduce the remaining ones to their regions.

In [64]:
region_1_with_Vin = np.unique(locations[(locations['region_1'].notna())&(locations['region_1'].str.contains('Vin '))]['region_1'])
region_1_with_Vin

array(['Vin Doux Naturel Rasteau', 'Vin Mousseux', 'Vin Pétillant',
       'Vin Santo del Chianti', 'Vin Santo del Chianti Classico',
       'Vin Santo del Chianti Rufina', 'Vin Santo di Carmignano',
       'Vin Santo di Montepulciano', 'Vin de France', 'Vin de Liqueur',
       'Vin de Pays Cité de Carcassonne', 'Vin de Pays Var',
       "Vin de Pays d'Oc", 'Vin de Pays de France',
       "Vin de Pays de L'Aude", "Vin de Pays de L'Herault",
       'Vin de Pays de Montferrand', 'Vin de Pays de Vaucluse',
       "Vin de Pays de l'Atlantique", "Vin de Pays de l'Ile de Beauté",
       'Vin de Pays de la Haute Vallée du Gassac',
       'Vin de Pays de la Méditerranée', 'Vin de Pays des Alpilles',
       'Vin de Pays des Coteaux de Bessilles', 'Vin de Pays des Cévennes',
       'Vin de Pays des Côtes Catalanes',
       'Vin de Pays des Côtes de Gascogne', 'Vin de Pays des Maures',
       'Vin de Pays des Portes de Méditerranée',
       'Vin de Pays du Comté Tolosan', 'Vin de Pays du Gard',
 

Remove the meaningless regions

In [65]:
wineries_with_only_1_location_per_country = (
    winery_locations_by_country.query("count == 1")
    .drop(columns=["count"])
    .set_index("winery")
)
print(
    f"{wineries_with_only_1_location_per_country.shape[0]/locations.shape[0]:.1%}",
    "of the wineries have only 1 operation per country",
)
wineries_with_only_1_location_per_country

Unnamed: 0,winery,region_1,region_2,province,country,code
82,Lionel Osmin & Cie,Vin de France,,,France,FR
731,Kiwi Cuvée,Vin de France,,,France,FR
780,Castello d'Albola,Vin Santo del Chianti Classico,,Tuscany,Italy,IT
911,Frédéric Brouca,Vin de France,,,France,FR
1121,Domaine Rotier,Vin de Liqueur,,,France,FR
...,...,...,...,...,...,...
122397,Grandissime,Vin de France,,,France,FR
126713,Domaine du Grand Cros,Vin Mousseux,,,France,FR
127051,Fat Bastard,Vin de France,,,France,FR
128902,Château de Brigue,Vin Mousseux,,,France,FR


In [6]:
locations['region_1'] = locations['region_1'].mask(locations['region_1'].isin(regions_to_remove), None)
locations[locations['region_1'].isin(regions_to_remove)]

Unnamed: 0,winery,region_1,region_2,province,country,code


Reduce the region_1 field to the region name only

In [67]:
# italian wines
locations['region_1'] = locations['region_1'].mask((locations['region_1'].notna())&(locations['region_1'].str.startswith('Vin Santo di ')), locations['region_1'].str.replace('Vin Santo di ', ''))
locations[(locations['region_1'].notna())&locations['region_1'].str.startswith('Vin Santo di ')]

Unnamed: 0,winery,region_1,region_2,province,country,code


In [68]:
# french wines
locations['region_1'] = locations['region_1'].mask((locations['region_1'].notna())&(locations['region_1'].str.startswith('Vin de ')), locations['region_1'].str.replace('Vin de ', ''))
locations[(locations['region_1'].notna())&locations['region_1'].str.startswith('Vin de ')]

Unnamed: 0,winery,region_1,region_2,province,country,code


In [69]:
# verify the cleanup
locations[(locations['region_1'].notna())&(locations['region_1'].str.contains('Vin '))]

Unnamed: 0,winery,region_1,region_2,province,country,code


In [70]:
np.unique(locations['region_1'].dropna())

array(['Abruzzo', 'Adelaida District', 'Adelaide', ...,
       'Yorkville Highlands', 'Yountville', 'Zonda Valley'],
      shape=(1170,), dtype=object)

**Locations**

In [71]:
print('after cleanup, there are', locations.duplicated().sum(), 'duplicate locations')
locations = locations.drop_duplicates()

after cleanup, there are 87 duplicate locations


In [72]:
print('there are', locations.duplicated().sum(), 'duplicates left')
locations.describe()

there are 0 duplicates left


Unnamed: 0,winery,region_1,region_2,province,country,code
count,30329,24401,8022,29548,30302,30302
unique,16757,1170,13,392,43,43
top,Louis Latour,Napa Valley,Sonoma,California,US,US
freq,43,903,1934,8185,10868,10868


## Search Query Augmentation
Use the least location terms where possible.

In [73]:
# progress indicator
progress = lambda: print(
  ' '.join(
    [f'{locations['q'].notna().sum()/locations.shape[0]:.1%}', 'locations have geocode query expressions']
  )
)

### Create query string

**Wineries that have no `region_1`, `region_2`, or `province` fields**

In [74]:
mask = (
  (locations['winery'].notna())
    &((locations['region_1'].isna()))
    &((locations['region_2'].isna()))
    &((locations['province'].isna()))
)

locations = locations.assign(q=locations['winery'].where(mask, None))

In [75]:
# verify that these wineries were set
assert locations.loc[mask,'q'].isna().sum() == 0
locations[mask].head()

Unnamed: 0,winery,region_1,region_2,province,country,code,q
9071,Jean-Luc and Paul Aegerter,FR,Lionel Osmin & Cie,,,,
10327,Louis Jadot,FR,35,,,,
731,Kiwi Cuvée,,,,France,FR,Kiwi Cuvée
911,Frédéric Brouca,,,,France,FR,Frédéric Brouca
913,Gotsa Family Wines,,,,,,Gotsa Family Wines


In [76]:
wineries_with_multiple_locations_per_country = winery_locations_by_country.query(
    "count > 1"
).sort_values("count", ascending=False)
wineries_with_multiple_locations_per_country

Unnamed: 0,winery,region_1,region_2,province,country,code,q
7691,Frankland Estate,AU,2,,,,
7696,Franz Haas,IT,2,,,,
7704,François Bertheau,FR,2,,,,
7708,François Lurton,ES,2,,,,
8918,J Wilkes,US,2,,,,


In [7]:
mask = (
  locations.q.isna()
    & locations.winery.notna()
    & (locations.region_1.isna())
    & (locations.region_2.isna())
    & (locations.province.notna())
)

assert locations.region_1[mask].notna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
locations[mask].head()

Unnamed: 0,winery,region_1,region_2,province,country,code,q
1,Quinta dos Avidagos,,,Douro,Portugal,PT,
7,Trimbach,,,Alsace,France,FR,
8,Heinz Eifel,,,Rheinhessen,Germany,DE,
9,Jean-Baptiste Adam,,,Alsace,France,FR,
11,Leon Beyer,,,Alsace,France,FR,


In [79]:
# some location details are not clean
locations[locations["winery"] == "Wines & Winemakers"].drop_duplicates()

19.5% locations have geocode query expressions


Unnamed: 0,winery,region_1,region_2,province,country,code,q
1,Quinta dos Avidagos,,,Douro,Portugal,PT,"Quinta dos Avidagos, Douro"
7,Trimbach,,,Alsace,France,FR,"Trimbach, Alsace"
8,Heinz Eifel,,,Rheinhessen,Germany,DE,"Heinz Eifel, Rheinhessen"
9,Jean-Baptiste Adam,,,Alsace,France,FR,"Jean-Baptiste Adam, Alsace"
11,Leon Beyer,,,Alsace,France,FR,"Leon Beyer, Alsace"


In [8]:
# some wineries operate in multiple countries, each with multiple locations
winery_locations_by_country.set_index("winery").loc["Baron Philippe de Rothschild"]

Unnamed: 0,winery,region_1,region_2,province,country,code,q
16,Felix Lavaque,Cafayate,,,Argentina,AR,
183,Alamos,Salta,,,Argentina,AR,
245,Finca Las Moras,San Juan,,,Argentina,AR,
292,Domaine Daniel Dugois,Arbois,,,France,FR,
728,Terrazas de Los Andes,Salta,,,Argentina,AR,


In [9]:
locations.loc[mask, 'q'] = locations.winery[mask] + ', ' + locations.region_1[mask]
progress()
locations[mask].head()

20.8% locations have geocode query expressions


Unnamed: 0,winery,region_1,region_2,province,country,code,q
16,Felix Lavaque,Cafayate,,,Argentina,AR,"Felix Lavaque, Cafayate"
183,Alamos,Salta,,,Argentina,AR,"Alamos, Salta"
245,Finca Las Moras,San Juan,,,Argentina,AR,"Finca Las Moras, San Juan"
292,Domaine Daniel Dugois,Arbois,,,France,FR,"Domaine Daniel Dugois, Arbois"
728,Terrazas de Los Andes,Salta,,,Argentina,AR,"Terrazas de Los Andes, Salta"


**Remaining wineries with `region_2` and no `province` or `region_1`**

In [83]:
mask = (
  locations.q.isna()
    & locations.winery.notna()
    & (locations.region_1.isna())
    & (locations.region_2.notna())
    & (locations.province.isna())
)

assert locations.province[mask].notna().sum() == 0 and locations.region_1[mask].notna().sum() == 0
locations[mask].head()

Unnamed: 0,winery,region_1,region_2,province,country,code,q


In [84]:
locations.loc[mask, 'q'] = locations.winery[mask] + ', ' + locations.region_2[mask]
progress()
locations[mask].head()

20.8% locations have geocode query expressions


Unnamed: 0,winery,region_1,region_2,province,country,code,q


**Remaining wineries with `province` and `region_1` and no `region_2`**

In [87]:
mask = (
  locations.q.isna()
    & locations.winery.notna()
    & (locations.region_1.notna())
    & (locations.region_2.isna())
    & (locations.province.notna())
)

assert locations.region_2[mask].notna().sum() == 0
locations[mask].head()

Unnamed: 0,winery,region_1,region_2,province,country,code,q
0,Nicosia,Etna,,Sicily & Sardinia,Italy,IT,
2,Rainstorm,Willamette Valley,,Oregon,US,US,
3,St. Julian,Lake Michigan Shore,,Michigan,US,US,
4,Sweet Cheeks,Willamette Valley,,Oregon,US,US,
5,Tandem,Navarra,,Northern Spain,Spain,ES,


In [88]:
# location details for a winery with operations in multiple countries and locations
reviews[location_cols].set_index("winery").loc[
    "Baron Philippe de Rothschild"
].drop_duplicates()

73.6% locations have geocode query expressions


Unnamed: 0,winery,region_1,region_2,province,country,code,q
0,Nicosia,Etna,,Sicily & Sardinia,Italy,IT,"Nicosia, Etna, Sicily & Sardinia"
2,Rainstorm,Willamette Valley,,Oregon,US,US,"Rainstorm, Willamette Valley, Oregon"
3,St. Julian,Lake Michigan Shore,,Michigan,US,US,"St. Julian, Lake Michigan Shore, Michigan"
4,Sweet Cheeks,Willamette Valley,,Oregon,US,US,"Sweet Cheeks, Willamette Valley, Oregon"
5,Tandem,Navarra,,Northern Spain,Spain,ES,"Tandem, Navarra, Northern Spain"


In [10]:
mask = (
  locations.q.isna()
    & locations.winery.notna()
    & (locations.region_1.isna())
    & (locations.region_2.notna())
    & (locations.province.notna())
)

assert locations.region_1[mask].notna().sum() == 0
locations[mask].head()

Unnamed: 0,winery,region_1,region_2,province,country,code,q


In [90]:
locations.loc[mask, 'q'] = locations.winery[mask] + ', ' + locations.region_2[mask] + ', ' + locations.province[mask]
progress()
locations[mask].head()

73.6% locations have geocode query expressions


Unnamed: 0,winery,region_1,region_2,province,country,code,q


**Remaining wineries with all 3 fields populated**

In [94]:
mask = (
  locations.q.isna()
    & locations.winery.notna()
    & (locations.region_1.notna())
    & (locations.region_2.notna())
    & (locations.province.notna())
)

assert locations[locations.q.isna()].shape[0] == locations[mask].shape[0]
locations[mask]

Unnamed: 0,winery,region_1,region_2,province,country,code,q
10,Kirkland Signature,Napa Valley,Napa,California,US,US,
12,Louis M. Martini,Alexander Valley,Sonoma,California,US,US,
23,Bianchi,Paso Robles,Central Coast,California,US,US,
25,Castello di Amorosa,Sonoma Coast,Sonoma,California,US,US,
29,Clarksburg Wine Company,Clarksburg,Central Valley,California,US,US,
...,...,...,...,...,...,...,...
129741,Cardinal Rule,Russian River Valley,Sonoma,California,US,US,
129763,People's Wine Revolution,Dry Creek Valley,Sonoma,California,US,US,
129833,Coniglio,Diamond Mountain District,Napa,California,US,US,
129945,Birichino,Santa Ynez Valley,Central Coast,California,US,US,


In [95]:
locations.loc[mask, 'q'] = locations.winery[mask] + ', ' + locations.region_1[mask]  + ', ' + locations.region_2[mask] + ', ' + locations.province[mask]
progress()
locations[mask].head()

100.0% locations have geocode query expressions


Unnamed: 0,winery,region_1,region_2,province,country,code,q
10,Kirkland Signature,Napa Valley,Napa,California,US,US,"Kirkland Signature, Napa Valley, Napa, California"
12,Louis M. Martini,Alexander Valley,Sonoma,California,US,US,"Louis M. Martini, Alexander Valley, Sonoma, Ca..."
23,Bianchi,Paso Robles,Central Coast,California,US,US,"Bianchi, Paso Robles, Central Coast, California"
25,Castello di Amorosa,Sonoma Coast,Sonoma,California,US,US,"Castello di Amorosa, Sonoma Coast, Sonoma, Cal..."
29,Clarksburg Wine Company,Clarksburg,Central Valley,California,US,US,"Clarksburg Wine Company, Clarksburg, Central V..."


**Some of these wineries may have had a province specified**

In [11]:
import requests


def geocode_by_winery_and_country(name, code):
    url = lambda endpoint: f"https://api.mapbox.com/{endpoint}"
    endpoint = "search/searchbox/v1/forward"
    params = dict(
        q=name,
        country=code,
        poi_category="winery",
        language="en",
        access_token="LETMEIN",
    )
    r = requests.get(url(endpoint), params=params)
    print(r.status_code, r.headers)
    return r.json()

In [12]:
geocode_by_winery_and_country("100 Percent Wine", "US")

401 {'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '44', 'Connection': 'keep-alive', 'Date': 'Mon, 23 Dec 2024 07:38:17 GMT', 'ETag': 'W/"2c-mbs4WeZIt3tmYvk6HtTC1rbvKjQ"', 'X-Powered-By': 'Express', 'Access-Control-Allow-Origin': '*', 'X-Content-Type-Options': 'nosniff', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 b0797f10be715dcb685d992d17347df4.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'SFO53-P7', 'Alt-Svc': 'h3=":443"; ma=86400', 'X-Amz-Cf-Id': 'ciD2vpJTipirkpr4cBoOhqdEnbkKmiXxU5U96bKTiAXPn9r0Abi4hg=='}


Unnamed: 0,winery,region_1,region_2,province,country,code,q
166,Domaine Berthoumieu,Madiran,,Southwest France,France,FR,
308,Domaine Sigalas,,,Santorini,Greece,GR,
371,Elena Walch,Alto Adige,,Northeastern Italy,Italy,IT,
393,San Pedro,,,Lontué Valley,Chile,CL,
992,Domaine des Terrisses,Gaillac,,Southwest France,France,FR,
...,...,...,...,...,...,...,...
111876,Alto 3,Catamarca,,,Argentina,AR,
116736,Vitkin,,,Judean Hills,Israel,IL,
117368,Trio,,,Maipo Valley,Chile,CL,
121862,Tussock Jumper,Vino de la Tierra de Castilla,,Central Spain,Spain,ES,


- groupby country. of the list, those with
  - 1 means only 1 province, otherwise none. treat as only one operation in get_country_code
  - those with 2 means 2 provinces were defined, plus optionally some rows with none.

**Wineries with only 1 operation per country**

In [13]:
import unittest


class GeocodeWineryTestCase(unittest.TestCase):
    def test_invalid_request(self):
        print("not implemented")

    def test_not_authenticated(self):
        print("not implemented")

    def test_lookup_valid_winery(self):
        print("not implemented")

    def test_lookup_unknown_winery(self):
        print("not implemented")

    def test_multiple_matches(self):
        print("not implemented")

    def test_throttling_error(self):
        print("not implemented")


if __name__ == "__main__":
    unittest.main(argv=[""], verbosity=2, exit=False)

test_invalid_request (__main__.GeocodeWineryTestCase.test_invalid_request) ... ok
test_lookup_unknown_winery (__main__.GeocodeWineryTestCase.test_lookup_unknown_winery) ... ok
test_lookup_valid_winery (__main__.GeocodeWineryTestCase.test_lookup_valid_winery) ... ok
test_multiple_matches (__main__.GeocodeWineryTestCase.test_multiple_matches) ... ok
test_not_authenticated (__main__.GeocodeWineryTestCase.test_not_authenticated) ... ok
test_throttling_error (__main__.GeocodeWineryTestCase.test_throttling_error) ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.011s

OK


not implemented
not implemented
not implemented
not implemented
not implemented
not implemented
