In [1]:
# libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import json
from collections import namedtuple

### Load Data

In [2]:
# helper to extract region and country information from raw dataset
def extract_region(k: str) -> tuple[str]:
  parts = k.split(', ')
  if len(parts) == 2:
    return parts
  elif len(parts) == 3:
    return {parts[0], parts[2]}
  elif len(parts) < 2:
    if parts[0] == 'Unknown':
      return [None] * 2
    print(len(parts), parts)    
    return [parts[0], None]
  else:
    print(len(parts), parts)
    return [parts[0], parts[1]]

In [3]:
# data model
# Field order matters: records are built positionally from this list.
vineyard_cols = [
  'name',
  'lat', 'lon',
  'region', 'country',
  'color',
  'url',
]

# One record per vineyard; every field that is not supplied falls back to None.
Vineyard = namedtuple(
  'Vineyard',
  vineyard_cols,
  defaults=(None,) * len(vineyard_cols),
)

In [4]:
# Read the raw vineyard export. The `with` block closes the file on exit, so
# no explicit close() is needed. The encoding is pinned because winery names
# contain non-ASCII characters and the platform default encoding may not be
# UTF-8 (assumes the file is UTF-8, as is standard for JSON).
with open(os.path.join('../winerymap', 'vineyards.json'), 'r', encoding='utf-8') as file:
  data = json.load(file)

# Flatten to one row per vineyard. Each top-level key is split into
# region/country by extract_region; each vineyard entry is read positionally
# as [lat, lon, name, url]; the color comes from the enclosing entry.
wineries = pd.DataFrame(
  [
    Vineyard(vineyard[2], vineyard[0], vineyard[1], *extract_region(key), entry['color'], vineyard[3])
    for key, entry in data.items()
    for vineyard in entry['vineyards']
  ]
)
wineries.info()
wineries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34178 entries, 0 to 34177
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     34178 non-null  object 
 1   lat      34178 non-null  float64
 2   lon      34178 non-null  float64
 3   region   31247 non-null  object 
 4   country  31247 non-null  object 
 5   color    34178 non-null  object 
 6   url      27740 non-null  object 
dtypes: float64(2), object(5)
memory usage: 1.8+ MB


Unnamed: 0,name,lat,lon,region,country,color,url
0,Bodega Los Toneles,-32.89640,-68.81729,Guaymallén,Argentina,#3bbeaa,https://linktr.ee/Bodegalostoneles
1,Bodegas y Viñedos Amadeo Marañon,-32.89653,-68.81472,Guaymallén,Argentina,#3bbeaa,http://bodegasmaranon.com.ar/
2,Bodega Barberis,-32.86422,-68.75041,Guaymallén,Argentina,#3bbeaa,http://www.bodegabarberis.com/
3,Mipser Bodega,-32.86522,-68.79126,Guaymallén,Argentina,#3bbeaa,
4,BODEGA GASPARONI CARUSO,-32.85921,-68.77780,Guaymallén,Argentina,#3bbeaa,http://www.gasparonicaruso.com.ar/
...,...,...,...,...,...,...,...
34173,Remparts du Château,44.79917,1.61772,Rocamadour,France,#94a6fd,http://www.lerelaisdupelerin.fr/Les-Remparts-d...
34174,OPG Herak,45.51270,16.16231,Pokuplje,Croatia,#2f5672,https://opg-herak.hr/
34175,OPG Dvorneković,45.43486,16.26497,Pokuplje,Croatia,#2f5672,
34176,VINARIJA IURIS,45.72909,16.00865,Pokuplje,Croatia,#2f5672,http://www.iuriswinery.com/


### Data Exploration

In [5]:
# group by helper
def by_(cols, df=None):
  """Count rows per group of ``cols``, largest group first.

  Args:
    cols: column name, or list of column names, to group by.
    df: frame to aggregate. When omitted, the module-level ``wineries`` frame
      is used; it is looked up at call time so that re-running the load cell
      is picked up without redefining this helper.

  Returns:
    A DataFrame indexed by the group key(s) with a single ``count`` column
    holding the number of non-null ``lat`` values in each group, sorted in
    descending order. Rows whose group key is missing are not counted,
    because pandas' groupby skips them by default.
  """
  if df is None:
    df = wineries
  return (
    df.groupby(cols)
    .count()[['lat']]
    .rename(columns={'lat': 'count'})
    .sort_values('count', ascending=False)
  )

In [6]:
# tally rows per country (largest first) and report how many countries appear
by_country = by_('country')
print('there are wineries from', f'{len(by_country):,d}', 'countries')
by_country.head(n=5)

there are wineries from 63 countries


Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
France,7573
Italy,6260
United States,3800
Spain,1805
Australia,1517


In [7]:
# same tally one level finer: a region is keyed by its (country, region) pair
by_region = by_(['country', 'region'])
print('there are wineries from', f'{len(by_region):,d}', 'regions')
by_region.head(n=5)

there are wineries from 2,077 regions


Unnamed: 0_level_0,Unnamed: 1_level_0,count
country,region,Unnamed: 2_level_1
Spain,Cava,235
France,Champagne,150
Peru,Lima,143
France,Charentais,139
Greece,Pelopennese,122


In [8]:
# tally rows per (name, region, country); a count above 1 means the same
# winery name appears several times inside one region
by_winery = by_(['name', 'region', 'country'])
print('there are', f'{len(by_winery):,d}', 'wineries, some with multiple locations within a region')
by_winery.head(n=5)

there are 31,174 wineries, some with multiple locations within a region


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
name,region,country,Unnamed: 3_level_1
Semeynaya Vinodel'nya Antonenko,Anapa,Russian Federation,4
Rancho Capistrano Winery,South Coast,United States,3
Crama Tohani,Dealurile Munteniei,Romania,2
Abbazia San Giorgio,Pantelleria,Italy,2
Wilreza Winery,Rietrivier FS,South Africa,2


### Winery Location Lookup Logic

In [9]:
# rows per winery name; a count above 1 means the name alone is ambiguous
by_name = by_('name')
# The query string uses double quotes: reusing the f-string's own quote
# character inside a replacement field only parses on Python 3.12+.
print(f'{by_name.query("count > 1").shape[0]:,d}', 'of', f'{by_name.shape[0]:,d}', 'wineries are not unique by name')

242 of 33,905 wineries are not unique by name


In [10]:
# names that occur on exactly one row: the name alone pins down the site
single_site_wineries = np.unique(by_name.loc[by_name['count'] == 1].index)
(len(single_site_wineries), single_site_wineries)

(33663,
 array(['"Amiata...i Vini del Vulcano " Winery',
        '"Azienda Agricola Budignac" di Tonut Daniele',
        '"Cantina Massara" Burlotto Gian Carlo s.agr.s.', ..., '우아미 와이너리',
        '𝐂𝐚𝐧𝐭𝐢𝐧𝐚 𝐅𝐫𝐚𝐧𝐜𝐞𝐬𝐜𝐨 𝐌𝐚𝐥𝐞𝐧𝐚 - Wine shop, degustazioni, visite guidate',
        '🍇DOMAINE PILENIUM🌿🍷'], dtype=object))

In [11]:
# Among names that are NOT globally unique, count rows per (name, country);
# a count of 1 means name + country is enough to pin down the site.
# NOTE: this rebinds `by_country`, which held the per-country tally before.
by_country = by_(
  ['name', 'country'],
  df=wineries.loc[~wineries['name'].isin(single_site_wineries)],
)
single_site_by_country_wineries = np.unique(by_country.loc[by_country['count'] == 1].index)
(len(single_site_by_country_wineries), single_site_by_country_wineries)

(70,
 array([('4R Ranch Vineyards and Winery', 'United States'),
        ('Antler Ridge Winery', 'United States'),
        ('Bella Terra Vineyards', 'Canada'),
        ('Black Birch Vineyard', 'New Zealand'), ('Bodega', 'Argentina'),
        ('Bodega', 'Peru'), ('Bodega', 'Spain'),
        ('Bodega Juanita', 'Peru'), ('Bodega Juanita', 'Venezuela'),
        ('Bodega Los Toneles', 'Argentina'),
        ('Bodega Los Toneles', 'Spain'), ('Bodega Lovera', 'Peru'),
        ('Bodega Lovera', 'Venezuela'),
        ('Clover Hill Vineyards & Winery', 'United States'),
        ('Constellation Brands', 'Canada'),
        ('Constellation Brands', 'United States'),
        ('Crama Oprișor', 'Romania'), ('Cramele Recas', 'Romania'),
        ('Cramele Recaș', 'Romania'), ('Domaine Bousquet', 'Argentina'),
        ('Domaine Bousquet', 'France'), ('Domaine de Bellevue', 'France'),
        ('Domaine de Bellevue', 'Switzerland'),
        ('Domaine de la Croix', 'France'),
        ('Domaine de la Croix', 

In [12]:
# Names still ambiguous within a country: count rows per
# (name, country, region) after dropping everything the two coarser keys
# already resolve. A count of 1 means the full triple pins down the site.
# NOTE: this rebinds `by_region`, which held the per-region tally before.
by_region = by_(
  ['name', 'country', 'region'],
  df=wineries.loc[
    ~(
      wineries.name.isin(single_site_wineries)
      | wineries.set_index(['name', 'country']).index.isin(single_site_by_country_wineries)
    )
  ],
)
single_site_by_region_wineries = np.unique(by_region.loc[by_region['count'] == 1].index)
(len(single_site_by_region_wineries), single_site_by_region_wineries)

(238,
 array([('Aurora Vineyards', 'United States', 'Columbia Valley'),
        ('Aurora Vineyards', 'United States', 'Willamette Valley'),
        ('Azienda Agricola Castelvecchio', 'Italy', 'Carso'),
        ('Azienda Agricola Castelvecchio', 'Italy', 'San Torpè'),
        ('Azienda Agricola Castelvecchio', 'Italy', 'Vin Santo del Chianti'),
        ('Azienda Agricola Valentini', 'Italy', 'Colline Pescaresi'),
        ('Azienda Agricola Valentini', 'Italy', 'Monteregio di Massa Marittima'),
        ('Becker Vineyards', 'United States', 'Fredericksburg in the Texas Hill Country'),
        ('Becker Vineyards', 'United States', 'Texas Hill Country'),
        ('Bernardus Winery', 'United States', 'Carmel Valley'),
        ('Bernardus Winery', 'United States', 'Central Coast'),
        ('Bodega Carmencita', 'Peru', 'Arequipa'),
        ('Bodega Carmencita', 'Peru', 'Ica'),
        ('Bodega Chacra', 'Argentina', 'Avellaneda'),
        ('Bodega Chacra', 'Argentina', 'General Roca'),
       

In [13]:
# Whatever is left once every key that resolves to a single row has been
# removed: (name, country, region) triples with several rows.
# A former filter that tested bare names against the (name, country) tuples
# was dropped: a string never equals a tuple, so it excluded nothing.
# The name is bound once, directly to the array of keys, rather than first to
# the intermediate count frame.
multisite_wineries = np.unique(
  by_(
    ['name', 'country', 'region'],
    df=wineries[
      (~wineries.name.isin(single_site_wineries))
      &(~wineries.set_index(['name', 'country']).index.isin(single_site_by_country_wineries))
      &(~wineries.set_index(['name', 'country', 'region']).index.isin(single_site_by_region_wineries))
    ]
  ).index
)
multisite_wineries

array([('Abbazia San Giorgio', 'Italy', 'Pantelleria'),
       ('Adega da Pinguela', 'Spain', 'Valdeorras'),
       ('Agrícola de Barberà S C L L', 'Spain', 'Cava'),
       ('Bodega Carmencita', 'Peru', 'Lima'),
       ('Bodega Marlene', 'Peru', 'Lima'),
       ('Bodegas González Cabezas', 'Spain', 'Manchuela'),
       ('Bodegas y Viñedos Nicolas Catena S.A.', 'Argentina', 'Famatina'),
       ('Borbély Családi Pincészet', 'Hungary', 'Badacsony'),
       ('Brenca Cantina', 'Italy', 'Colli di Salerno'),
       ('CVNE', 'Spain', 'Rioja Alta'),
       ('Cantina Stefanoni', 'Italy', 'Est! Est!! Est!!! di Montefiascone'),
       ('Champagne Bollinger', 'France', 'Champagne'),
       ('Château Luchey-Halde', 'France', 'Pessac-Léognan'),
       ('Château des Vergers', 'France', 'Régnié'),
       ('Clos des Lambrays', 'France', 'Clos de la Roche'),
       ('Contentious Character', 'Australia', 'Canberra District'),
       ('Cooperativa Vinicola Chianti Montalbano S.C.R.L.', 'Italy', 'Valdinievo

In [14]:
# fields that describe a winery on the lookup input side
# (presumably the columns of the review dataset - confirm against the caller)
review_location_cols = ['winery', 'country', 'province', 'region_1', 'region_2']
# columns copied out of the winery table into a successful lookup result
geocode_cols = ['lat', 'lon', 'region', 'country', 'url']

# lookup outcomes: the input echoed back when the name is unknown, or the
# located winery; every field defaults to None
UNKNOWN_WINERY = namedtuple('UNKNOWN_WINERY', review_location_cols, defaults=[None]*len(review_location_cols))
RESOLVED_WINERY = namedtuple('RESOLVED_WINERY', geocode_cols, defaults=[None]*len(geocode_cols))


def to_resolved_winery(result):
  """Wrap the first row of ``result`` in a ``RESOLVED_WINERY``.

  Args:
    result: DataFrame whose columns are in ``geocode_cols`` order; the row is
      unpacked positionally, so the order matters.

  Returns:
    The first row as a ``RESOLVED_WINERY``, or ``None`` when ``result`` has no
    rows (a bare ``next()`` would leak ``StopIteration`` to the caller).
  """
  first_row = next(result.itertuples(index=False), None)
  if first_row is None:
    return None
  return RESOLVED_WINERY(*first_row)

def geocode(row):
  """Look up the location of the winery described by ``row``.

  ``row`` must expose the attributes ``winery``, ``country``, ``province``,
  ``region_1`` and ``region_2``. The narrowest key known to identify a single
  place is used, tried in this order:

  1. name alone (``single_site_wineries``)
  2. name + country (``single_site_by_country_wineries``)
  3. name + country + province, matched against the ``region`` column
     (``single_site_by_region_wineries``)
  4. the same triple with several sites (``multisite_wineries``): lat/lon are
     averaged per (region, country, url) group and the first group is used.

  Returns:
    ``UNKNOWN_WINERY`` echoing the input when the name does not occur in
    ``wineries``; a ``RESOLVED_WINERY`` on a successful lookup; ``None``
    (after printing the row) when the name is known but none of the lookup
    tables contains a key for it.
  """
  name_matches = wineries.name == row.winery

  # fail fast for unrecognized winery names
  if not name_matches.any():
    return UNKNOWN_WINERY(row.winery, row.country, row.province, row.region_1, row.region_2)

  # Only used for the progress output below. Filtering itself is done with
  # boolean masks, because a query string built by interpolation breaks on
  # winery names that contain a double quote.
  query_terms = [
    f'name == "{row.winery}"',
    f'country == "{row.country}"',
    f'region == "{row.province}"',
  ]

  def select(depth):
    """Rows matching the first ``depth`` keys of (name, country, region)."""
    mask = name_matches
    if depth > 1:
      mask = mask & (wineries.country == row.country)
    if depth > 2:
      mask = mask & (wineries.region == row.province)
    return wineries.loc[mask, geocode_cols]

  country_key = (row.winery, row.country)
  region_key = (row.winery, row.country, row.province)

  # Stays None when no lookup table knows this winery, so the fallback at the
  # bottom is reached instead of tripping over an unbound local.
  depth = None
  if row.winery in single_site_wineries:
    depth = 1
  elif country_key in map(tuple, single_site_by_country_wineries):
    depth = 2
  elif region_key in map(tuple, single_site_by_region_wineries):
    depth = 3
  elif region_key in map(tuple, multisite_wineries):
    # TODO take centroid of lat-lon points
    print('multisite winery', query_terms[:3])
    # dropna=False keeps sites whose url is missing; otherwise a winery with
    # no url at all would yield no group and nothing to return
    sites = (
      select(3)
      .groupby(['region', 'country', 'url'], dropna=False)
      .mean()
      .reset_index()[geocode_cols]
    )
    return to_resolved_winery(sites)

  if depth is not None:
    print(query_terms[:depth])
    return to_resolved_winery(select(depth))

  print('***', row)
  return None

### Verify Lookup Logic

In [15]:
# stand-in for an input row in the checks below: same fields as
# `review_location_cols`, each defaulting to None when omitted
MOCK_ROW = namedtuple(
  'MOCK_ROW',
  review_location_cols,
  defaults=(None,) * len(review_location_cols),
)

**Single Location Winery**

In [16]:
# only the name is supplied; the lookup has to resolve it without country or region
hafner = geocode(MOCK_ROW('Hafner Vineyard'))
assert hafner.region == 'Alexander Valley'
assert 'hafner' in hafner.url
hafner

['name == "Hafner Vineyard"']


RESOLVED_WINERY(lat=38.68112, lon=-122.80371, region='Alexander Valley', country='United States', url='https://www.hafnervineyard.com/')

**Single Location By Country Winery**

In [17]:
# name + country lookup: no province is supplied
domaine_de_la_croix = geocode(
  MOCK_ROW(winery='Domaine de la Croix', country='France')
)
assert domaine_de_la_croix.region == 'Maures'
assert 'domainedelacroix' in domaine_de_la_croix.url
domaine_de_la_croix

['name == "Domaine de la Croix"', 'country == "France"']


RESOLVED_WINERY(lat=43.19824, lon=6.56929, region='Maures', country='France', url='http://www.domainedelacroix.com/')

**Single Location By Region Winery**

In [18]:
# name + country + province lookup; the province is matched against `region`
san_trope = geocode(
  MOCK_ROW(
    winery='Azienda Agricola Castelvecchio',
    country='Italy',
    province='San Torpè',
  )
)
assert san_trope.region == 'San Torpè'
assert 'castelvecchio' in san_trope.url
san_trope

['name == "Azienda Agricola Castelvecchio"', 'country == "Italy"', 'region == "San Torpè"']


RESOLVED_WINERY(lat=43.52329, lon=10.66766, region='San Torpè', country='Italy', url='http://www.agricastelvecchio.it/')

**Multisite Winery**

In [19]:
# several sites share this (name, country, region); expect one averaged location
capistrano = geocode(
  MOCK_ROW(
    winery='Rancho Capistrano Winery',
    country='United States',
    province='South Coast',
  )
)
assert capistrano.region == 'South Coast'
assert 'ranchoca' in capistrano.url
capistrano

multisite winery ['name == "Rancho Capistrano Winery"', 'country == "United States"', 'region == "South Coast"']


RESOLVED_WINERY(lat=33.49787, lon=-117.63661, region='South Coast', country='United States', url='https://www.ranchocapwinery.com/')

**Unknown Winery**