In [1]:
# libraries
import os
import pandas as pd
import numpy as np
import kagglehub

# project code modules
from utils.geocode_utils import load_country_code_lookup

## Download Data

**Load the reviews dataset**

In [2]:
fname = "winemag-data-130k-v2.csv"

# Download latest version; `path` is the local location kagglehub reports
path = kagglehub.dataset_download("christopheiv/winemagdata130k")
print("Path to dataset files:", path)
print()

# The first CSV column becomes the DataFrame index
reviews = pd.read_csv(os.path.join(path, fname), index_col=0)
# Outer double quotes keep this f-string valid on Python < 3.12, where the
# enclosing quote character cannot be reused inside a replacement field.
print(
    f"{reviews['country'].isna().sum():,d}", "reviews are missing location information"
)
print()
reviews.info()

Path to dataset files: /Users/patrick/.cache/kagglehub/datasets/christopheiv/winemagdata130k/versions/1

63 reviews are missing location information

<class 'pandas.core.frame.DataFrame'>
Index: 129971 entries, 0 to 129970
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   country                129908 non-null  object 
 1   description            129971 non-null  object 
 2   designation            92506 non-null   object 
 3   points                 129971 non-null  int64  
 4   price                  120975 non-null  float64
 5   province               129908 non-null  object 
 6   region_1               108724 non-null  object 
 7   region_2               50511 non-null   object 
 8   taster_name            103727 non-null  object 
 9   taster_twitter_handle  98758 non-null   object 
 10  title                  129971 non-null  object 
 11  variety                129970 non-null  object 
 1

**Load country codes**

In [3]:
# Download the country code dataset
path = kagglehub.dataset_download("juanumusic/countries-iso-codes")
print("Path to dataset files:", path)

fname = "wikipedia-iso-country-codes.csv"
df = pd.read_csv(os.path.join(path, fname))
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df.info()
df.head()

Path to dataset files: /Users/patrick/.cache/kagglehub/datasets/juanumusic/countries-iso-codes/versions/1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   English short name lower case  246 non-null    object
 1   Alpha-2 code                   245 non-null    object
 2   Alpha-3 code                   246 non-null    object
 3   Numeric code                   246 non-null    int64 
 4   ISO 3166-2                     246 non-null    object
dtypes: int64(1), object(4)
memory usage: 9.7+ KB
None


Unnamed: 0,English short name lower case,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ
4,American Samoa,AS,ASM,16,ISO 3166-2:AS


In [4]:
# Release the preview frame loaded in the previous cell; no later cell in this
# part of the notebook reads `df`.
del df

#### Alternate Location Datasets
**Kaggle datasets**
| path | file |
| --- | --- |
| `juanmah/world-cities` | `worldcities.csv` | 
|  `max-mind/world-cities-database` | `worldcitiespop.csv` |

**Other data sources**
- [world cities database](https://simplemaps.com/data/world-cities)
- [opendatasoft](https://public.opendatasoft.com/explore/dataset/geonames-all-cities-with-a-population-1000/export/?flg=en-us&disjunctive.cou_name_en&sort=name)
- [maxmind](https://www.kaggle.com/datasets/max-mind/world-cities-database) world cities database
- [geoapify](https://www.geoapify.com/download-all-the-cities-towns-villages/)
- [country codes](https://datahub.io/core/country-list#readme)
- [wine producing regions](https://en.wikipedia.org/wiki/List_of_wine-producing_regions)
- [vineyards](https://vineyards.com/)
- [old vine registry](https://www.oldvineregistry.org/faq)
- [LWIN](https://www.liv-ex.com/wwd/lwin/get-lwin-database/) database

## Add Country Code to Reviews Dataset

In [5]:
# Load the country lookup from the project helper module.
# presumably maps country names to 2-letter ISO codes (the check further down
# expects "France" -> "FR") — confirm against utils.geocode_utils
country_codes_lookup = load_country_code_lookup()


def get_country_code(name: str):
    """Return the entry stored for ``name`` in ``country_codes_lookup``.

    Args:
        name: Key to look up (a country name such as ``"France"``).

    Returns:
        The value held in ``country_codes_lookup`` for ``name``, or ``None``
        when ``name`` is not contained in the lookup.
    """
    if name not in country_codes_lookup:
        return None
    return country_codes_lookup[name]


# sanity-check the lookup with a known country: the result must equal the
# expected code and be exactly of type `str`
assert get_country_code("France") == "FR"
assert type(get_country_code("France")) is str

Add ISO country code to reviews

In [6]:
# Adds a `code` column to `reviews` by running get_country_code on every
# `country` value; values not found in the lookup become None.
reviews["code"] = reviews["country"].apply(get_country_code)

Classify as old world or new world wines

[What is the Difference Between Old World Wine and New World Wine?
](https://www.volioimports.com/what-is-the-difference-between-old-world-wine-and-new-world-wine/)

In [7]:
# Codes of the countries this notebook classifies as "old world"
old_world = list(
    map(get_country_code, ["Italy", "France", "Spain", "Portugal", "Germany"])
)
old_world

['IT', 'FR', 'ES', 'PT', 'DE']

In [8]:
# Series.isin already yields a boolean mask, so it is stored directly instead
# of being passed through np.where(mask, True, False). Rows whose code is not
# in `old_world` (including missing codes) are False.
reviews["old_world"] = reviews["code"].isin(old_world)
reviews.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,code,old_world
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,IT,True
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,PT,True
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,US,False
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,US,False
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,US,False


## Explore Winery data

In [9]:
# Outer double quotes keep these f-strings valid on Python < 3.12, where the
# enclosing quote character cannot be reused inside a replacement field.
print(f"{reviews['winery'].isna().sum():,d}", "reviews missing winery")
print(f"{reviews['description'].isna().sum():,d}", "reviews missing description")

0 reviews missing winery
0 reviews missing description


In [10]:
# Sorted array of the distinct, non-null winery names
wineries = np.unique(reviews["winery"].dropna())
print(
    "data set contains",
    f"{wineries.shape[0]:,d}",
    "distinct wineries",
)

data set contains 16,757 distinct wineries


## Explore Location Information

**Reviews missing location information**

In [11]:
# Winery name plus the region / province / country fields and the country
# code column added earlier in this notebook
location_cols = [
    "winery",
    "region_1",
    "region_2",
    "province",
    "country",
    "code",
]
reviews.loc[:, location_cols]

Unnamed: 0,winery,region_1,region_2,province,country,code
0,Nicosia,Etna,,Sicily & Sardinia,Italy,IT
1,Quinta dos Avidagos,,,Douro,Portugal,PT
2,Rainstorm,Willamette Valley,Willamette Valley,Oregon,US,US
3,St. Julian,Lake Michigan Shore,,Michigan,US,US
4,Sweet Cheeks,Willamette Valley,Willamette Valley,Oregon,US,US
...,...,...,...,...,...,...
129966,Dr. H. Thanisch (Erben Müller-Burggraef),,,Mosel,Germany,DE
129967,Citation,Oregon,Oregon Other,Oregon,US,US
129968,Domaine Gresser,Alsace,,Alsace,France,FR
129969,Domaine Marcel Deiss,Alsace,,Alsace,France,FR


In [12]:
# Count review rows whose location columns exactly repeat an earlier row.
# The first label has no trailing space: print() already separates its
# arguments with one space, and the extra one printed "there are  99,555".
print(
    "there are",
    f"{reviews.query('winery in @wineries')[location_cols].duplicated().sum():,d}",
    "duplicated winery locations",
)

there are  99,555 duplicated winery locations


In [13]:
# Distinct combinations of the location columns for rows whose winery is in `wineries`
locations = (
    reviews.query("winery in @wineries")
    .loc[:, location_cols]
    .copy()
    .drop_duplicates()
)
# Every row beyond one per winery name means that name appears with more than
# one distinct combination of location values
print(
    f"{len(locations) - len(wineries):,d}",
    "duplicate winery names in different locations",
)
locations

13,659 duplicate winery names in different locations


Unnamed: 0,winery,region_1,region_2,province,country,code
0,Nicosia,Etna,,Sicily & Sardinia,Italy,IT
1,Quinta dos Avidagos,,,Douro,Portugal,PT
2,Rainstorm,Willamette Valley,Willamette Valley,Oregon,US,US
3,St. Julian,Lake Michigan Shore,,Michigan,US,US
4,Sweet Cheeks,Willamette Valley,Willamette Valley,Oregon,US,US
...,...,...,...,...,...,...
129940,Standish,Mendocino,,California,US,US
129941,Apriori,Mendocino County,,California,US,US
129945,Birichino,Santa Ynez Valley,Central Coast,California,US,US
129947,Feudo Principi di Butera,Terre Siciliane,,Sicily & Sardinia,Italy,IT


In [14]:
# Summarise the de-duplicated location table: row count, per-column non-null
# counts and dtypes.
locations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30416 entries, 0 to 129952
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   winery    30416 non-null  object
 1   region_1  26697 non-null  object
 2   region_2  10023 non-null  object
 3   province  30389 non-null  object
 4   country   30389 non-null  object
 5   code      30389 non-null  object
dtypes: object(6)
memory usage: 1.6+ MB


In [15]:
# NOTE: the count is taken on `province`, the preview below filters on `code`
print(
    f"{locations['province'].isna().sum():,d}",
    "wineries without location information",
)
locations.loc[locations["code"].isna()].head()

27 wineries without location information


Unnamed: 0,winery,region_1,region_2,province,country,code
913,Gotsa Family Wines,,,,,
3131,Barton & Guestier,,,,,
4243,Kakhetia Traditional Winemaking,,,,,
9509,Tsililis,,,,,
9750,Ross-idi,,,,,


In [16]:
# Number of location rows per country code, largest first (top five shown)
(
    locations.groupby("code")[["winery"]]
    .count()
    .rename(columns={"winery": "count"})
    .sort_values(by="count", ascending=False)
    .head()
)

Unnamed: 0_level_0,count
code,Unnamed: 1_level_1
US,10925
FR,6606
IT,6042
ES,1695
AR,785


In [17]:
print("Missing location values:")
# One-row table: null count per location column, with the row labelled "count"
print(reviews.loc[:, location_cols].isna().sum().to_frame("count").T)
print()
reviews.loc[:, location_cols].info()

Missing location values:
       winery  region_1  region_2  province  country  code
count       0     21247     79460        63       63    63

<class 'pandas.core.frame.DataFrame'>
Index: 129971 entries, 0 to 129970
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   winery    129971 non-null  object
 1   region_1  108724 non-null  object
 2   region_2  50511 non-null   object
 3   province  129908 non-null  object
 4   country   129908 non-null  object
 5   code      129908 non-null  object
dtypes: object(6)
memory usage: 6.9+ MB


In [18]:
# Sorted array of the distinct, non-null `region_2` values.
# NOTE(review): `_2` is also the name IPython uses for the cached output of
# cell 2, so this assignment shadows it; a descriptive name would be safer,
# but the next cell's query refers to `@_2`.
_2 = np.unique(reviews["region_2"].dropna())

In [19]:
# Rows whose region_2 is one of the known region_2 values, shown with the
# shared `location_cols` list rather than a second hand-written copy of it.
reviews.query("region_2 in @_2")[location_cols]

Unnamed: 0,winery,region_1,region_2,province,country,code
2,Rainstorm,Willamette Valley,Willamette Valley,Oregon,US,US
4,Sweet Cheeks,Willamette Valley,Willamette Valley,Oregon,US,US
10,Kirkland Signature,Napa Valley,Napa,California,US,US
12,Louis M. Martini,Alexander Valley,Sonoma,California,US,US
14,Mirassou,Central Coast,Central Coast,California,US,US
...,...,...,...,...,...,...
129945,Birichino,Santa Ynez Valley,Central Coast,California,US,US
129949,Flora Springs,Napa Valley,Napa,California,US,US
129950,Hendry,Napa Valley,Napa,California,US,US
129952,Houdini,Chiles Valley,Napa,California,US,US


## Explore Winery Detail

In [20]:
# Rows that repeat an earlier row on every column except region_1
locations[
    locations.duplicated(subset=[c for c in locations.columns if c != "region_1"])
]

Unnamed: 0,winery,region_1,region_2,province,country,code
28,Terre di Giurfo,Cerasuolo di Vittoria,,Sicily & Sardinia,Italy,IT
34,Envolve,Sonoma Valley,Sonoma,California,US,US
74,Hindsight,Calistoga,Napa,California,US,US
95,Henry Fessy,Juliénas,,Beaujolais,France,FR
96,Henry Fessy,Régnié,,Beaujolais,France,FR
...,...,...,...,...,...,...
129940,Standish,Mendocino,,California,US,US
129941,Apriori,Mendocino County,,California,US,US
129945,Birichino,Santa Ynez Valley,Central Coast,California,US,US
129947,Feudo Principi di Butera,Terre Siciliane,,Sicily & Sardinia,Italy,IT


In [21]:
# Rows that repeat an earlier row on every column except region_2
locations[
    locations.duplicated(subset=[c for c in locations.columns if c != "region_2"])
]

Unnamed: 0,winery,region_1,region_2,province,country,code
1078,Bergström,Willamette Valley,Willamette Valley,Oregon,US,US
1086,Trisaetum,Willamette Valley,,Oregon,US,US
6030,Erath,Willamette Valley,Willamette Valley,Oregon,US,US
7140,Panther Creek,Willamette Valley,Willamette Valley,Oregon,US,US
7207,Fullerton,Willamette Valley,,Oregon,US,US
7217,Roco,Willamette Valley,,Oregon,US,US
9003,WillaKenzie Estate,Willamette Valley,Willamette Valley,Oregon,US,US
9026,Ken Wright,Willamette Valley,Willamette Valley,Oregon,US,US
9532,Elk Cove,Willamette Valley,,Oregon,US,US
10236,Raptor Ridge,Willamette Valley,,Oregon,US,US


In [22]:
# All distinct location rows recorded for this winery
locations[locations["winery"] == "Bergström"]

Unnamed: 0,winery,region_1,region_2,province,country,code
1069,Bergström,Dundee Hills,Willamette Valley,Oregon,US,US
1070,Bergström,Willamette Valley,,Oregon,US,US
1078,Bergström,Willamette Valley,Willamette Valley,Oregon,US,US
1504,Bergström,Eola-Amity Hills,Willamette Valley,Oregon,US,US
14947,Bergström,Chehalem Mountains,Willamette Valley,Oregon,US,US
27320,Bergström,Ribbon Ridge,Willamette Valley,Oregon,US,US


In [23]:
# All distinct location rows recorded for this winery
locations[locations["winery"] == "Terre di Giurfo"]

Unnamed: 0,winery,region_1,region_2,province,country,code
6,Terre di Giurfo,Vittoria,,Sicily & Sardinia,Italy,IT
28,Terre di Giurfo,Cerasuolo di Vittoria,,Sicily & Sardinia,Italy,IT
11858,Terre di Giurfo,Sicilia,,Sicily & Sardinia,Italy,IT
17177,Terre di Giurfo,Vittoria Frappato,,Sicily & Sardinia,Italy,IT


## Geocoding Strategy

**Geocoding Process**

1. use the `/forward` endpoint to get matches for winery points of interest (POI)

    `request`
    ```yaml
    GET: /search/searchbox/v1/forward?q=Terre di Giurfo&country=it&poi_category=winery&access_token=<MAPBOX_ACCESS_TOKEN> HTTP/1.1
    Host: api.mapbox.com
    ```
    `response`
    ```json
    {
      "type": "FeatureCollection",
      "features": [
          {
              "type": "Feature",
              "geometry": {
                  "coordinates": [
                      14.633219,
                      37.110816
                  ],
                  "type": "Point"
              },
              "properties": {
                  "name": "Terre di Giurfo",
                  "mapbox_id": "dXJuOm1ieHBvaTpmZjAzZTc4Mi1lNTlmLTQ2MWMtYmJmMC0zMmYyOWYyYjVlZTA",
                  "feature_type": "poi",
                  "address": "Contrada Giurfo",
                  "full_address": "Contrada Giurfo, 95040 Licodia Eubea, Italy",
                  "place_formatted": "95040 Licodia Eubea, Italy",
                  "context": {
                      "country": {
                          "name": "Italy",
                          "country_code": "IT",
                          "country_code_alpha_3": "ITA"
                      },
                      "postcode": {
                          "id": "dXJuOm1ieHBsYzpBakF1Y0E",
                          "name": "95040"
                      },
                      "place": {
                          "id": "dXJuOm1ieHBsYzpBYlpJY0E",
                          "name": "Licodia Eubea"
                      },
                      "street": {
                          "name": "contrada giurfo"
                      }
                  },
                  "coordinates": {
                      "latitude": 37.110816,
                      "longitude": 14.633219,
                      "routable_points": [
                          {
                              "name": "POI",
                              "latitude": 37.110982,
                              "longitude": 14.633116
                          }
                      ]
                  },
                  "language": "",
                  "maki": "bar",
                  "poi_category": [
                      "bar",
                      "food and drink",
                      "nightlife",
                      "winery"
                  ],
                  "poi_category_ids": [
                      "bar",
                      "food_and_drink",
                      "nightlife",
                      "winery"
                  ],
                  "external_ids": {
                      "dataplor": "437c0e04-ff9b-4a54-b317-0068d13bb704"
                  },
                  "metadata": {
                      "phone": "+393356238909",
                      "website": "http://www.terredigiurfo.it/",
                      "open_hours": {
                          "periods": [
                              {
                                  "open": {
                                      "day": 1,
                                      "time": "0900"
                                  },
                                  "close": {
                                      "day": 1,
                                      "time": "1700"
                                  }
                              },
                              {
                                  "open": {
                                      "day": 2,
                                      "time": "0900"
                                  },
                                  "close": {
                                      "day": 2,
                                      "time": "1700"
                                  }
                              },
                              {
                                  "open": {
                                      "day": 3,
                                      "time": "0900"
                                  },
                                  "close": {
                                      "day": 3,
                                      "time": "1700"
                                  }
                              },
                              {
                                  "open": {
                                      "day": 4,
                                      "time": "0900"
                                  },
                                  "close": {
                                      "day": 4,
                                      "time": "1700"
                                  }
                              },
                              {
                                  "open": {
                                      "day": 5,
                                      "time": "0900"
                                  },
                                  "close": {
                                      "day": 5,
                                      "time": "1700"
                                  }
                              },
                              {
                                  "open": {
                                      "day": 6,
                                      "time": "0900"
                                  },
                                  "close": {
                                      "day": 6,
                                      "time": "1700"
                                  }
                              }
                          ]
                      }
                  }
              }
          }
      ],
      "attribution": "© 2024 Mapbox and its suppliers. All rights reserved. Use of this data is subject to the Mapbox Terms of Service. (https://www.mapbox.com/about/maps/)",
      "response_id": "dvOsuPxXi66rMTotPSVuv-BpZJ9xTHgOSiD6hHRJi7Vq6i3HuJZVXU1yA_c44VoXtFopeXqOITLkugaQ-_2CpOC3t-8QyeJOzB0="
    }
    ```
    `response header`
    
    | Key | Value |
    |-----|-------|
    | X-Rate-Limit-Limit | 10 |
    | X-Rate-Limit-Interval | 1 |
    | X-Rate-Limit-Reset | 1734834269 |
    
    

In [24]:
# rate limit (the response headers above report a limit of 10 per interval of 1)
limit = 10  # requests allowed per second
per_minute = limit * 60  # requests allowed per minute
per_hour = per_minute * 60  # requests allowed per hour

# One geocode request per row of `locations`; the estimate is converted from
# hours to minutes and shown with no decimals.
print(
    "With a limit of",
    f"{per_hour:,d}",
    "geocode requests per hour, it will take",
    f"{len(locations) / per_hour * 60:,.0f}",
    "minutes to geocode all the wineries",
)

With a limit of 36,000 geocode requests per hour, it will take 51 minutes to geocode all the wineries
