In [51]:
# load reviews
import os
import kagglehub
import pandas as pd
import numpy as np
import swifter

from utils.geocode_utils import get_country_code, memory

## Load Wine Review Data

In [52]:
# load the reviews
@memory.cache
def load_reviews():
  # fetch reviews from kaggle
  path = kagglehub.dataset_download("christopheiv/winemagdata130k")
  fname = "winemag-data-130k-v2.csv"
  return pd.read_csv(os.path.join(path, fname), index_col=0)

In [53]:
# load the reviews
reviews = load_reviews()

reviews.loc[reviews.country.notna(),"code"] = reviews[reviews.country.notna()].country.swifter.apply(get_country_code)
reviews[['winery', 'country', 'province', 'region_1', 'region_2', 'code']].head()

Pandas Apply:   0%|          | 0/129908 [00:00<?, ?it/s]

Unnamed: 0,winery,country,province,region_1,region_2,code
0,Nicosia,Italy,Sicily & Sardinia,Etna,,it
1,Quinta dos Avidagos,Portugal,Douro,,,pt
2,Rainstorm,US,Oregon,Willamette Valley,Willamette Valley,us
3,St. Julian,US,Michigan,Lake Michigan Shore,,us
4,Sweet Cheeks,US,Oregon,Willamette Valley,Willamette Valley,us


## Extract Winery Locations

In [54]:
# extract the winery locations to geolocate
location_cols = ['winery', 'country', 'province', 'region_1', 'region_2', 'code']
wineries = np.unique(reviews.winery.dropna())
locations = reviews.query("winery in @wineries").copy()[location_cols].drop_duplicates()
print(
    f"{locations.shape[0] - len(wineries):,d}",
    "duplicate winery names in different locations",
)
locations.info()
locations.head()

13,659 duplicate winery names in different locations
<class 'pandas.core.frame.DataFrame'>
Index: 30416 entries, 0 to 129952
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   winery    30416 non-null  object
 1   country   30389 non-null  object
 2   province  30389 non-null  object
 3   region_1  26697 non-null  object
 4   region_2  10023 non-null  object
 5   code      30389 non-null  object
dtypes: object(6)
memory usage: 1.6+ MB


Unnamed: 0,winery,country,province,region_1,region_2,code
0,Nicosia,Italy,Sicily & Sardinia,Etna,,it
1,Quinta dos Avidagos,Portugal,Douro,,,pt
2,Rainstorm,US,Oregon,Willamette Valley,Willamette Valley,us
3,St. Julian,US,Michigan,Lake Michigan Shore,,us
4,Sweet Cheeks,US,Oregon,Willamette Valley,Willamette Valley,us


## Cleanup Winery Location Data

**Locations where location fields duplicate the same information OR contain the word `Other`**

In [55]:
# cleanup helper
def clean_duplicate_location_field_value(keep, clean, df=locations, preview=False):
  same_value = df[clean] == df[keep]
  value_with_other = (df[clean].notna()) & ((df[clean] == 'Other')|(df[clean] == df[keep] + ' Other'))
  if preview:
    print(np.sum(same_value), f"locations['{clean}'] values are the same as locations['{keep}']")
    print(np.sum(value_with_other), f"locations['{clean}'] values contain 'Other'")
    print(f"unique locations['{clean}'] values that contain 'Other':", np.unique(locations[value_with_other][clean]))
    print()
  else:
    df[clean] = df[clean].mask(same_value | value_with_other, None)


In [56]:
# preview the changes
clean_duplicate_location_field_value(keep='country', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='region_1', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='country', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='country', clean='province', preview=True)

0 locations['region_2'] values are the same as locations['country']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_2'] values are the same as locations['province']
1119 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': ['California Other' 'New York Other' 'Oregon Other' 'Washington Other']

882 locations['region_2'] values are the same as locations['region_1']
983 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': ['California Other' 'New York Other' 'Oregon Other' 'Washington Other']

87 locations['region_1'] values are the same as locations['country']
0 locations['region_1'] values contain 'Other'
unique locations['region_1'] values that contain 'Other': []

1988 locations['region_1'] values are the same as locations['province']
6 locations['region_1'] values contain 'Other'
unique locations['region_

In [57]:
# cleanup
clean_duplicate_location_field_value(keep='country', clean='region_2')
clean_duplicate_location_field_value(keep='province', clean='region_2')
clean_duplicate_location_field_value(keep='region_1', clean='region_2')
clean_duplicate_location_field_value(keep='country', clean='region_1')
clean_duplicate_location_field_value(keep='province', clean='region_1')
clean_duplicate_location_field_value(keep='country', clean='province')

In [58]:
# verify the changes
clean_duplicate_location_field_value(keep='country', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='region_1', clean='region_2', preview=True)
clean_duplicate_location_field_value(keep='country', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='province', clean='region_1', preview=True)
clean_duplicate_location_field_value(keep='country', clean='province', preview=True)

0 locations['region_2'] values are the same as locations['country']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_2'] values are the same as locations['province']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_2'] values are the same as locations['region_1']
0 locations['region_2'] values contain 'Other'
unique locations['region_2'] values that contain 'Other': []

0 locations['region_1'] values are the same as locations['country']
0 locations['region_1'] values contain 'Other'
unique locations['region_1'] values that contain 'Other': []

0 locations['region_1'] values are the same as locations['province']
0 locations['region_1'] values contain 'Other'
unique locations['region_1'] values that contain 'Other': []

0 locations['province'] values are the same as locations['country']
0 locations['province'] values contain 'Othe

**Locations with the word `Vin` in `region_1`**

Some indicate city or wine growing regions, like `Vin Santo di Montepulciano` or `Vin de Pays des Côtes de Gascogne`, but others are not regions like `Vin Mousseux` or `Vin Santo del Chianti Classico`. Need to remove the meaningless values, and reduce the remaining ones to their regions.

In [59]:
region_1_with_Vin = np.unique(locations[(locations['region_1'].notna())&(locations['region_1'].str.contains('Vin '))]['region_1'])
region_1_with_Vin

array(['Vin Doux Naturel Rasteau', 'Vin Mousseux', 'Vin Pétillant',
       'Vin Santo del Chianti', 'Vin Santo del Chianti Classico',
       'Vin Santo del Chianti Rufina', 'Vin Santo di Carmignano',
       'Vin Santo di Montepulciano', 'Vin de France', 'Vin de Liqueur',
       'Vin de Pays Cité de Carcassonne', 'Vin de Pays Var',
       "Vin de Pays d'Oc", 'Vin de Pays de France',
       "Vin de Pays de L'Aude", "Vin de Pays de L'Herault",
       'Vin de Pays de Montferrand', 'Vin de Pays de Vaucluse',
       "Vin de Pays de l'Atlantique", "Vin de Pays de l'Ile de Beauté",
       'Vin de Pays de la Haute Vallée du Gassac',
       'Vin de Pays de la Méditerranée', 'Vin de Pays des Alpilles',
       'Vin de Pays des Coteaux de Bessilles', 'Vin de Pays des Cévennes',
       'Vin de Pays des Côtes Catalanes',
       'Vin de Pays des Côtes de Gascogne', 'Vin de Pays des Maures',
       'Vin de Pays des Portes de Méditerranée',
       'Vin de Pays du Comté Tolosan', 'Vin de Pays du Gard',
 

Remove the meaningless regions

In [60]:
regions_to_remove = [
  'Vin Doux Naturel Rasteau', 'Vin Mousseux', 'Vin Pétillant',
  'Vin Santo del Chianti', 'Vin Santo del Chianti Classico',
  'Vin Santo del Chianti Rufina', 'Vin de France', 'Vin de Liqueur',
  'Vin de Table Francais'
]
locations[locations['region_1'].isin(regions_to_remove)]

Unnamed: 0,winery,country,province,region_1,region_2,code
82,Lionel Osmin & Cie,France,,Vin de France,,fr
731,Kiwi Cuvée,France,,Vin de France,,fr
780,Castello d'Albola,Italy,Tuscany,Vin Santo del Chianti Classico,,it
911,Frédéric Brouca,France,,Vin de France,,fr
1121,Domaine Rotier,France,,Vin de Liqueur,,fr
...,...,...,...,...,...,...
122397,Grandissime,France,,Vin de France,,fr
126713,Domaine du Grand Cros,France,,Vin Mousseux,,fr
127051,Fat Bastard,France,,Vin de France,,fr
128902,Château de Brigue,France,,Vin Mousseux,,fr


In [61]:
locations['region_1'] = locations['region_1'].mask(locations['region_1'].isin(regions_to_remove), None)
locations[locations['region_1'].isin(regions_to_remove)]

Unnamed: 0,winery,country,province,region_1,region_2,code


Reduce the `region_1` field to the region name only

In [62]:
# italian wines
locations['region_1'] = locations['region_1'].mask((locations['region_1'].notna())&(locations['region_1'].str.startswith('Vin Santo di ')), locations['region_1'].str.replace('Vin Santo di ', ''))
locations[(locations['region_1'].notna())&locations['region_1'].str.startswith('Vin Santo di ')]

Unnamed: 0,winery,country,province,region_1,region_2,code


In [63]:
# french wines
locations['region_1'] = locations['region_1'].mask((locations['region_1'].notna())&(locations['region_1'].str.startswith('Vin de ')), locations['region_1'].str.replace('Vin de ', ''))
locations[(locations['region_1'].notna())&locations['region_1'].str.startswith('Vin de ')]

Unnamed: 0,winery,country,province,region_1,region_2,code


In [64]:
# verify the cleanup
locations[(locations['region_1'].notna())&(locations['region_1'].str.contains('Vin '))]

Unnamed: 0,winery,country,province,region_1,region_2,code


In [65]:
np.unique(locations['region_1'].dropna())

array(['Abruzzo', 'Adelaida District', 'Adelaide', ...,
       'Yorkville Highlands', 'Yountville', 'Zonda Valley'], dtype=object)

In [66]:
locations.head()

Unnamed: 0,winery,country,province,region_1,region_2,code
0,Nicosia,Italy,Sicily & Sardinia,Etna,,it
1,Quinta dos Avidagos,Portugal,Douro,,,pt
2,Rainstorm,US,Oregon,Willamette Valley,,us
3,St. Julian,US,Michigan,Lake Michigan Shore,,us
4,Sweet Cheeks,US,Oregon,Willamette Valley,,us


**Remove provinces that are not actual province names**

In [67]:
# province name includes country name, like Northern Spain

mask = locations.swifter.apply(
  lambda row: row.country in row.province 
              if row.province is not None and row.country is not None and type(row.province) == str and type(row.country) 
              else False,
  axis=1
)
locations[mask]

Pandas Apply:   0%|          | 0/30416 [00:00<?, ?it/s]

Unnamed: 0,winery,country,province,region_1,region_2,code
5,Tandem,Spain,Northern Spain,Navarra,,es
18,Pradorey,Spain,Northern Spain,Ribera del Duero,,es
38,Feudi di San Marzano,Italy,Southern Italy,Puglia,,it
61,Podere dal Nespoli,Italy,Central Italy,Romagna,,it
72,Grifalco,Italy,Southern Italy,Aglianico del Vulture,,it
...,...,...,...,...,...,...
129350,Vigneti Villabella,Italy,Northeastern Italy,Delle Venezie,,it
129507,Endrizzi,Italy,Northeastern Italy,Trento,,it
129760,Luigi Maffini,Italy,Southern Italy,Paestum,,it
129850,Macchialupa,Italy,Southern Italy,Campania,,it


In [68]:
locations.loc[mask, 'province'] = None
locations[mask].head()

Unnamed: 0,winery,country,province,region_1,region_2,code
5,Tandem,Spain,,Navarra,,es
18,Pradorey,Spain,,Ribera del Duero,,es
38,Feudi di San Marzano,Italy,,Puglia,,it
61,Podere dal Nespoli,Italy,,Romagna,,it
72,Grifalco,Italy,,Aglianico del Vulture,,it


**Remove provinces with `&` in name**

In [69]:
mask = (locations.province.notna()) & (locations.province.str.contains('&'))
print(np.unique(locations[mask].province))
pd.concat(
  [
    locations[locations.province == p].sample(3)
    for p in np.unique(locations[mask].province)
  ]
)

['Casablanca & Leyda Valleys' 'Sicily & Sardinia']


Unnamed: 0,winery,country,province,region_1,region_2,code
116329,Montes,Chile,Casablanca & Leyda Valleys,,,cl
109876,Kingston Family,Chile,Casablanca & Leyda Valleys,,,cl
82894,Carmen,Chile,Casablanca & Leyda Valleys,,,cl
88482,Case del Feudo,Italy,Sicily & Sardinia,Sicilia,,it
16250,Murgo,Italy,Sicily & Sardinia,Etna,,it
61215,Marabino,Italy,Sicily & Sardinia,Noto,,it


In [70]:
locations.loc[mask, 'province'] = None
locations[mask].sample(5)

Unnamed: 0,winery,country,province,region_1,region_2,code
81614,Avide,Italy,,Cerasuolo di Vittoria Classico,,it
29903,COS,Italy,,Cerasuolo di Vittoria Classico,,it
93466,Valle dell'Acate,Italy,,Vittoria Frappato,,it
73062,Sallier de la Tour,Italy,,Sicilia,,it
31584,Argiolas,Italy,,Cannonau di Sardegna,,it


**Check for Duplicate Locations**

In [71]:
print('after cleanup, there are', locations.duplicated().sum(), 'duplicate locations')
locations = locations.drop_duplicates()

after cleanup, there are 98 duplicate locations


In [72]:
print('there are', locations.duplicated().sum(), 'duplicates left')
locations.describe()

there are 0 duplicates left


Unnamed: 0,winery,country,province,region_1,region_2,code
count,30318,30291,25842,24401,8022,30291
unique,16757,43,379,1170,13,43
top,Louis Latour,US,California,Napa Valley,Sonoma,us
freq,43,10868,8185,903,1934,10868


## Augment Winery Location Data with Geocode Search Query
**Strategy**: Use the least location terms where possible.

#### Helper functions to create query expressions from select location fields

In [73]:
# progress indicator
progress = lambda: print(
  ' '.join(
    [f'{locations['q'].notna().sum()/locations.shape[0]:.1%}', 'locations have geocode query expressions']
  )
)

In [74]:
# helper function that builds mask based on which columns to select
def mask_builder(cols: list[str]):
  mask = locations.q.isna()
  mask &= locations.winery.notna()
  mask &= locations.region_1.notna() if 'region_1' in cols else locations.region_1.isna()
  mask &= locations.region_2.notna() if 'region_2' in cols else locations.region_2.isna()
  mask &= locations.province.notna() if 'province' in cols else locations.province.isna()
  return mask

In [75]:
# helper function that creates geocode query expression from selected location fields
def query_builder(cols: list[str], df: pd.DataFrame=locations):
  mask = mask_builder(cols)
  return df.loc[mask].swifter.apply(lambda row: ', '.join(row[c] for c in (['winery'] + cols)), axis=1)

In [76]:
# helper function that sets geocode query expressions from selected location fields
def create_query_expression(cols, df=locations):
  df.loc[mask, 'q'] = query_builder(cols, df)
  progress()

### Create query string

**Wineries that have no `region_1`, `region_2`, or `province` fields**

In [77]:
mask = (
  (locations['winery'].notna())
    &((locations['region_1'].isna()))
    &((locations['region_2'].isna()))
    &((locations['province'].isna()))
)

# create the query column and set the query to the winery name
locations = locations.assign(q=locations['winery'].where(mask, None))

In [78]:
# verify that these wineries were set
assert locations.loc[mask,'q'].isna().sum() == 0
locations[mask].head()

Unnamed: 0,winery,country,province,region_1,region_2,code,q
77,Yalumba,Australia,,,,au,Yalumba
82,Lionel Osmin & Cie,France,,,,fr,Lionel Osmin & Cie
232,Angove's,Australia,,,,au,Angove's
400,Cantine Maschio,Italy,,,,it,Cantine Maschio
731,Kiwi Cuvée,France,,,,fr,Kiwi Cuvée


In [79]:
# and that others are not
assert locations.loc[~mask,'q'].notna().sum() == 0
locations[~mask].head()

Unnamed: 0,winery,country,province,region_1,region_2,code,q
0,Nicosia,Italy,,Etna,,it,
1,Quinta dos Avidagos,Portugal,Douro,,,pt,
2,Rainstorm,US,Oregon,Willamette Valley,,us,
3,St. Julian,US,Michigan,Lake Michigan Shore,,us,
4,Sweet Cheeks,US,Oregon,Willamette Valley,,us,


In [80]:
# check on progress
progress()

1.6% locations have geocode query expressions


**Wineries with only 1 location per country**

In [81]:
# wineries with no location field, other than country
mask_no_location = locations.winery.notna()
mask_no_location &= locations.country.notna()
mask_no_location &= locations.region_1.isna()
mask_no_location &= locations.region_2.isna()
mask_no_location &= locations.province.isna()
mask_no_location
locations.loc[mask_no_location]

# wineries with at least one location field in addition to country
mask_with_location = locations.winery.notna()
mask_with_location &= locations.country.notna()
mask_with_location &= (locations.region_1.notna()) | (locations.region_2.notna()) | (locations.province.notna())
locations.loc[mask_with_location]

# verify the masks are complementary and complete
assert pd.merge(locations.loc[mask_no_location], locations.loc[mask_with_location], how='inner').shape[0] == 0
assert locations.loc[locations.country.notna()].shape[0] == (locations.loc[mask_no_location].shape[0] + locations.loc[mask_with_location].shape[0])

# wineries that only have one location per country don't need the location field
tmp = locations[mask_with_location].groupby(['winery','code']).count()[['country']].rename(columns={'country': 'count'}).reset_index().query('count == 1')
single_location_wineries = np.unique(tmp.winery)
print(f'{len(single_location_wineries):,d}', 'wineries with one location per country begining with', list(single_location_wineries[:5]) + ['...'] )

# TODO set these to only be the winery name
locations.q = locations.q.mask(mask_with_location & (locations.winery.isin(single_location_wineries)), locations.winery)
progress()
locations[mask_with_location & (locations.winery.isin(single_location_wineries))]

10,549 wineries with one location per country begining with ['100 Percent Wine', '1070 Green', '12C Wines', '13 Celsius', '1752 Signature Wines', '...']
37.6% locations have geocode query expressions


Unnamed: 0,winery,country,province,region_1,region_2,code,q
1,Quinta dos Avidagos,Portugal,Douro,,,pt,Quinta dos Avidagos
5,Tandem,Spain,,Navarra,,es,Tandem
7,Trimbach,France,Alsace,,,fr,Trimbach
10,Kirkland Signature,US,California,Napa Valley,Napa,us,Kirkland Signature
13,Masseria Setteporte,Italy,,Etna,,it,Masseria Setteporte
...,...,...,...,...,...,...,...
129761,Mas de Pampelonne,France,Provence,Côtes de Provence,,fr,Mas de Pampelonne
129856,Bodegas Eidosela,Spain,Galicia,Rías Baixas,,es,Bodegas Eidosela
129890,Penedo Borges,Argentina,Mendoza Province,Luján de Cuyo,,ar,Penedo Borges
129917,J. & F. Lurton,Argentina,Mendoza Province,Mendoza,,ar,J. & F. Lurton


**Remaining wineries with `province` and no `region_1` or `region_2`**

In [82]:
cols = ['province']

# verify mask
mask = mask_builder(cols)
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].notna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].notna().sum() == 0
locations[mask].head()

Unnamed: 0,winery,country,province,region_1,region_2,code,q
8,Heinz Eifel,Germany,Rheinhessen,,,de,
9,Jean-Baptiste Adam,France,Alsace,,,fr,
11,Leon Beyer,France,Alsace,,,fr,
21,Acrobat,US,Oregon,,,us,
36,Estampa,Chile,Colchagua Valley,,,cl,


In [83]:
create_query_expression(cols, locations)

# verify query expressions
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].notna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].isna().sum() == 0
locations[mask].head()

Pandas Apply:   0%|          | 0/2855 [00:00<?, ?it/s]

47.1% locations have geocode query expressions


Unnamed: 0,winery,country,province,region_1,region_2,code,q
8,Heinz Eifel,Germany,Rheinhessen,,,de,"Heinz Eifel, Rheinhessen"
9,Jean-Baptiste Adam,France,Alsace,,,fr,"Jean-Baptiste Adam, Alsace"
11,Leon Beyer,France,Alsace,,,fr,"Leon Beyer, Alsace"
21,Acrobat,US,Oregon,,,us,"Acrobat, Oregon"
36,Estampa,Chile,Colchagua Valley,,,cl,"Estampa, Colchagua Valley"


**Remaining wineries with `region_1` and no `province` or `region_2`**

In [84]:
cols = ['region_1']

# verify mask
mask = mask_builder(cols)
assert locations.province[mask].notna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].notna().sum() == 0

# set the query expression
create_query_expression(cols, locations)

# verify query expressions
assert locations.province[mask].notna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].isna().sum() == 0
locations[mask].head()


Pandas Apply:   0%|          | 0/2178 [00:00<?, ?it/s]

54.2% locations have geocode query expressions


Unnamed: 0,winery,country,province,region_1,region_2,code,q
0,Nicosia,Italy,,Etna,,it,"Nicosia, Etna"
6,Terre di Giurfo,Italy,,Vittoria,,it,"Terre di Giurfo, Vittoria"
18,Pradorey,Spain,,Ribera del Duero,,es,"Pradorey, Ribera del Duero"
22,Baglio di Pianetto,Italy,,Sicilia,,it,"Baglio di Pianetto, Sicilia"
24,Canicattì,Italy,,Sicilia,,it,"Canicattì, Sicilia"


**Remaining wineries with `region_2` and no `province` or `region_1`**

In [85]:
cols = ['region_2']

# verify mask
mask = mask_builder(cols)
assert locations.province[mask].notna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].notna().sum() == 0

# set the query expression
create_query_expression(cols, locations)

# verify query expressions
assert locations.province[mask].notna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].isna().sum() == 0
locations[mask].head()

54.2% locations have geocode query expressions


Unnamed: 0,winery,country,province,region_1,region_2,code,q


**Remaining wineries with `province` and `region_1` and no `region_2`**

In [86]:
cols = ['region_1', 'province']

# verify mask
mask = mask_builder(cols)
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].isna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].notna().sum() == 0

# set the query expression
create_query_expression(cols, locations)

# verify query expressions
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].isna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].isna().sum() == 0
locations[mask].head()

Pandas Apply:   0%|          | 0/7886 [00:00<?, ?it/s]

80.2% locations have geocode query expressions


Unnamed: 0,winery,country,province,region_1,region_2,code,q
2,Rainstorm,US,Oregon,Willamette Valley,,us,"Rainstorm, Willamette Valley, Oregon"
3,St. Julian,US,Michigan,Lake Michigan Shore,,us,"St. Julian, Lake Michigan Shore, Michigan"
4,Sweet Cheeks,US,Oregon,Willamette Valley,,us,"Sweet Cheeks, Willamette Valley, Oregon"
14,Mirassou,US,California,Central Coast,,us,"Mirassou, Central Coast, California"
41,Hawkins Cellars,US,Oregon,Willamette Valley,,us,"Hawkins Cellars, Willamette Valley, Oregon"


**Remaining wineries with `province` and `region_2` and no `region_1`**

In [87]:
cols = ['region_2', 'province']

# verify mask
mask = mask_builder(cols)
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].isna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].notna().sum() == 0

# set the query expression
create_query_expression(cols, locations)

# verify query expressions
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].isna().sum() == 0 and locations.region_2[mask].notna().sum() == 0
assert locations.q[mask].isna().sum() == 0
locations[mask].head()

80.2% locations have geocode query expressions


Unnamed: 0,winery,country,province,region_1,region_2,code,q


**Remaining wineries with all 3 fields `province`, `region_1` and `region_2`, populated**

In [88]:
cols = ['region_1', 'region_2', 'province']

# verify mask
mask = mask_builder(cols)
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].isna().sum() == 0 and locations.region_2[mask].isna().sum() == 0
assert locations.q[mask].notna().sum() == 0

# set the query expression
create_query_expression(cols, locations)

# verify query expressions
assert locations.province[mask].isna().sum() == 0 and locations.region_1[mask].isna().sum() == 0 and locations.region_2[mask].isna().sum() == 0
assert locations.q[mask].isna().sum() == 0
locations[mask].head()

Pandas Apply:   0%|          | 0/5988 [00:00<?, ?it/s]

100.0% locations have geocode query expressions


Unnamed: 0,winery,country,province,region_1,region_2,code,q
12,Louis M. Martini,US,California,Alexander Valley,Sonoma,us,"Louis M. Martini, Alexander Valley, Sonoma, Ca..."
23,Bianchi,US,California,Paso Robles,Central Coast,us,"Bianchi, Paso Robles, Central Coast, California"
25,Castello di Amorosa,US,California,Sonoma Coast,Sonoma,us,"Castello di Amorosa, Sonoma Coast, Sonoma, Cal..."
33,Envolve,US,California,Dry Creek Valley,Sonoma,us,"Envolve, Dry Creek Valley, Sonoma, California"
34,Envolve,US,California,Sonoma Valley,Sonoma,us,"Envolve, Sonoma Valley, Sonoma, California"


In [89]:
# verify all locations have a query string
assert locations[locations['q'].isna()].shape[0] == 0


**Remove any duplicated locations after augmentation**

In [90]:
print( 'there are', locations.duplicated().sum(), 'duplicates after location augmentation')
locations.drop_duplicates(inplace=True)
print( 'there are', locations.duplicated().sum(), 'duplicates left after de-duping location augmented locations')

there are 0 duplicates after location augmentation
there are 0 duplicates left after de-duping location augmented locations


TODO:
  - setup a readthru cache, LFU if possice. look at [shelved_cache](https://github.com/mariushelf/shelved_cache) with [cachetools](https://pypi.org/project/cachetools/)
  - decorate getter from mapbox
  - run multiple passes until all winery locations are found

**Use direct http requests to geocode the wineries**

In [91]:
import requests

def geocode_by_winery_and_country(name, code):
  url = lambda endpoint: f'https://api.mapbox.com/{endpoint}'
  endpoint = 'search/searchbox/v1/forward'
  params = dict(q=name, country=code, poi_category='winery', language='en', access_token='LETMEIN')
  r = requests.get(url(endpoint), params=params)
  print(r.status_code, r.headers)
  return r.json()

In [92]:
# geocode_by_winery_and_country('100 Percent Wine', 'US')

## Tests

In [93]:
import unittest


class GeocodeWineryTestCase(unittest.TestCase):

    def test_invalid_request(self):
        print('not implemented')
        
    def test_not_authenticated(self):
        print('not implemented')
        
    def test_lookup_valid_winery(self):
        print("not implemented")

    def test_lookup_unknown_winery(self):
        print("not implemented")

    def test_multiple_matches(self):
        print("not implemented")

    def test_throttling_error(self):
        print("not implemented")


if __name__ == "__main__":
    unittest.main(argv=[""], verbosity=2, exit=False)

test_invalid_request (__main__.GeocodeWineryTestCase.test_invalid_request) ... ok
test_lookup_unknown_winery (__main__.GeocodeWineryTestCase.test_lookup_unknown_winery) ... ok
test_lookup_valid_winery (__main__.GeocodeWineryTestCase.test_lookup_valid_winery) ... 

not implemented
not implemented
not implemented


ok
test_multiple_matches (__main__.GeocodeWineryTestCase.test_multiple_matches) ... ok
test_not_authenticated (__main__.GeocodeWineryTestCase.test_not_authenticated) ... ok
test_throttling_error (__main__.GeocodeWineryTestCase.test_throttling_error) ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.091s

OK


not implemented
not implemented
not implemented
