In [27]:
# libraries
import pandas as pd
import geopandas as gpd
import os
import json
from collections import namedtuple
import numpy as np
import swifter

from geocode_winery import geocode, __lookup, __calculate

import matplotlib.pyplot as plt
import seaborn as sns

### Download Data

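Source: Kaggle [Wine Reviews](https://www.kaggle.com/datasets/zynicide/wine-reviews). Note that the download cell below fetches the `christopheiv/winemagdata130k` dataset handle rather than `zynicide/wine-reviews`.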

In [28]:
import kagglehub

# Kaggle handle of the dataset to fetch
DATASET_HANDLE = "christopheiv/winemagdata130k"

# Download latest version; `path` is the local directory that holds the dataset files
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /Users/patrick/.cache/kagglehub/datasets/christopheiv/winemagdata130k/versions/1


### Load Data

**Load Wine Reviews Data**

In [29]:
# Read the reviews CSV from the downloaded dataset directory;
# the first CSV column is used as the row index.
fname = "winemag-data-130k-v2.csv"
csv_path = os.path.join(path, fname)
reviews = pd.read_csv(csv_path, index_col=0)
reviews.info()
reviews

<class 'pandas.core.frame.DataFrame'>
Index: 129971 entries, 0 to 129970
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   country                129908 non-null  object 
 1   description            129971 non-null  object 
 2   designation            92506 non-null   object 
 3   points                 129971 non-null  int64  
 4   price                  120975 non-null  float64
 5   province               129908 non-null  object 
 6   region_1               108724 non-null  object 
 7   region_2               50511 non-null   object 
 8   taster_name            103727 non-null  object 
 9   taster_twitter_handle  98758 non-null   object 
 10  title                  129971 non-null  object 
 11  variety                129970 non-null  object 
 12  winery                 129971 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 13.9+ MB


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


### Data Exploration

In [30]:
# group by helper
def by_(cols, df=None):
    """Count reviews per group, largest groups first.

    Parameters
    ----------
    cols : str or list of str
        Column name(s) to group by.
    df : pandas.DataFrame, optional
        Frame to summarise. When omitted, the notebook-level ``reviews`` frame
        is used; it is looked up at call time so that re-loading ``reviews``
        is picked up without re-defining this helper.

    Returns
    -------
    pandas.DataFrame
        One row per group with a single ``count`` column holding the number of
        non-null ``title`` values, sorted in descending order of ``count``.
    """
    if df is None:
        df = reviews
    # Select 'title' before aggregating so only that one column is counted.
    counts = df.groupby(cols)[['title']].count()
    return counts.rename(columns={'title': 'count'}).sort_values('count', ascending=False)

In [31]:
# count of reviews by country
# by_ yields one row per country group, so the row count of the result is the
# number of distinct countries that appear in the reviews.
by_country = by_('country')
print('there are reviews of wines from', f'{by_country.shape[0]:,d}', 'countries')
by_country.head()

there are reviews of wines from 43 countries


Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
US,54504
France,22093
Italy,19540
Spain,6645
Portugal,5691


In [32]:
# count of reviews by province
# Grouping on (country, province) pairs, so the printed figure is the number of
# distinct country/province combinations rather than distinct province names.
by_province = by_(['country', 'province'])
print('there are reviews of wines from', f'{by_province.shape[0]:,d}', 'provinces')
by_province.head()

there are reviews of wines from 425 provinces


Unnamed: 0_level_0,Unnamed: 1_level_0,count
country,province,Unnamed: 2_level_1
US,California,36247
US,Washington,8639
France,Bordeaux,5941
Italy,Tuscany,5897
US,Oregon,5373


In [33]:
# count of reviews by winery name
# Grouping on (country, province, winery) triples: the same winery name under a
# different country or province is counted as a separate winery here.
by_winery = by_(['country', 'province', 'winery'])
print('there are reviews of wines from', f'{by_winery.shape[0]:,d}', 'wineries')
by_winery.head()

there are reviews of wines from 19,028 wineries


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
country,province,winery,Unnamed: 3_level_1
US,California,Testarossa,218
US,California,Williams Selyem,211
France,Burgundy,Louis Latour,198
US,Washington,Chateau Ste. Michelle,194
France,Beaujolais,Georges Duboeuf,165


In [34]:
# Pull one review by position (row 112531) to use as a single-row input for geocode in the next cell
s = reviews.iloc[112531]
s

country                                                             France
description              Produced in western Touraine, close to Chinon ...
designation                                                  Cuvée Réserve
points                                                                  88
price                                                                 19.0
province                                                      Loire Valley
region_1                                                          Touraine
region_2                                                               NaN
taster_name                                                     Roger Voss
taster_twitter_handle                                           @vossroger
title                    Château du Petit Thouars 2011 Cuvée Réserve Ca...
variety                                                     Cabernet Franc
winery                                            Château du Petit Thouars
Name: 112531, dtype: object

In [35]:
# Geocode the single sample review `s` selected in the previous cell
geocode(s)

RESOLVED_WINERY(lat=47.18218, lon=0.10892, region='Chinon', country='France', url='http://www.chateaudptwines.com/')

In [36]:
# Geocode every review (one geocode call per row).
geocoded = reviews.swifter.apply(geocode, axis=1)

# Drop reviews for which geocode returned None *together with their index labels*,
# then reindex back to the full review index. Unresolved reviews become all-NaN
# rows and every location row stays aligned with the review it was derived from.
# (Filtering the values alone and then attaching the full index can only work
# when nothing is filtered out; otherwise the lengths no longer match.)
resolved = geocoded.dropna()
locations = pd.DataFrame(resolved.tolist(), index=resolved.index).reindex(reviews.index)

# Cache statistics of the helpers imported from geocode_winery
print(__lookup.cache_info())
print(__calculate.cache_info())
locations.info()
locations

Pandas Apply:   0%|          | 0/129971 [00:00<?, ?it/s]

CacheInfo(hits=28691, misses=1463, maxsize=None, currsize=1463)
CacheInfo(hits=0, misses=0, maxsize=None, currsize=0)
<class 'pandas.core.frame.DataFrame'>
Index: 129971 entries, 0 to 129970
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   lat      14716 non-null  float64
 1   lon      14716 non-null  float64
 2   region   14495 non-null  object 
 3   country  14495 non-null  object 
 4   url      14099 non-null  object 
dtypes: float64(2), object(3)
memory usage: 10.0+ MB


Unnamed: 0,lat,lon,region,country,url
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
...,...,...,...,...,...
129966,,,,,
129967,,,,,
129968,48.3863,7.41669,Alsace grand cru,France,http://www.gresser.fr/
129969,,,,,


In [37]:
# Number of distinct winery names across all reviews.
# The count is computed outside the f-string: reusing the enclosing quote character
# inside a replacement field is only legal from Python 3.12 onwards.
n_winery_names = reviews['winery'].nunique()
print(f"{n_winery_names:,d}", 'unique winery names in', f"{reviews.shape[0]:,d}", 'reviews')

16,757 unique winery names in 129,971 reviews


In [38]:
# Restrict to reviews whose geocoding produced a latitude; the mask is built once
# and reused for both figures.
has_location = locations.lat.notna()
n_located_names = reviews.loc[has_location, 'winery'].nunique()
n_located_reviews = int(has_location.sum())
# Counts are computed outside the f-strings: reusing the enclosing quote character
# inside a replacement field is only legal from Python 3.12 onwards.
print(f"{n_located_names:,d}", 'unique winery names in', f"{n_located_reviews:,d}", 'winery locations')

1,463 unique winery names in 14,716 winery locations


### Load OSM Points of Interest Data