In [1]:
import numpy as np
import pandas as pd
import swifter
import geopandas
from geodatasets import get_path
from ratelimit import limits, sleep_and_retry

from utils.geocode_utils import LOCATION

from random_address import real_random_address, real_random_address_by_state
from faker import Faker

from time import perf_counter_ns as timer

import plotly.express as px

In [None]:
# load reviews
import kagglehub
import os

# Fetch the winemag dataset through kagglehub and read the review CSV found under
# the location it returns.
fname = "winemag-data-130k-v2.csv"
path = kagglehub.dataset_download("christopheiv/winemagdata130k")
reviews = pd.read_csv(
  os.path.join(path, fname),
  index_col=0,  # the first CSV column becomes the row index
)
reviews.info()

reviews

In [None]:
# Top 5 countries ranked by the number of reviews that have a winery value.
locales = (
  reviews.groupby('country')['winery']
  .count()
  .rename('count')
  .sort_values(ascending=False)
  .head(5)
  .reset_index()
)
locales

In [None]:
# US reviews, excluding the two province labels 'America' and 'Washington-Oregon'.
us_reviews = reviews[
  (reviews.country == 'US')
  & ~reviews.province.isin(['America', 'Washington-Oregon'])
]
us_reviews.info()

In [5]:
s1 = set(us_reviews.sample(500).index)

In [6]:
# Draw a second 500-row sample and keep redrawing until it shares at least
# 10 index labels with s1.
s2 = set(us_reviews.sample(500).index)

while len(s1 & s2) < 10:
  s2 = set(us_reviews.sample(500).index)

In [None]:
# size of the overlap between the two samples
print(len(s1 & s2), 'items in common to test caching')

In [None]:
us_reviews.loc[list(s1)].head()

In [9]:
# Lookup table from a full US state name to its two-letter code.
# Besides the 50 states it also lists the District of Columbia and several
# territories (American Samoa, Guam, Northern Mariana Islands, Puerto Rico,
# US Minor Outlying Islands, US Virgin Islands).
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "Virgin Islands, U.S.": "VI",
}

In [10]:
# state codes for which the address lookup came back empty / non-empty
no_address, found = [], []


def geocode(row):
  """Record whether a random address is available for the row's state.

  Translates ``row['province']`` to its two-letter code (KeyError when the
  province is not a key of ``us_state_to_abbrev``), asks
  ``real_random_address_by_state`` for an address, and appends the code to
  ``no_address`` when the result is empty, otherwise to ``found``.
  Returns None.
  """
  state = us_state_to_abbrev[row['province']]
  address = real_random_address_by_state(state)
  if len(address) == 0:
    no_address.append(state)
  else:
    found.append(state)

  

In [None]:
us_reviews.loc[list(s1)].swifter.apply(geocode, axis=1)

In [None]:
np.unique(no_address)

In [None]:
any([s in found for s in np.unique(no_address)])

In [None]:
[real_random_address_by_state(s) for s in np.unique(no_address)]

In [15]:
location_cols = ['winery', 'region_1', 'region_2', 'province', 'country']

In [None]:
# rows with no location information at all: country, province and both regions missing
mask = reviews[['region_1', 'region_2', 'province', 'country']].isna().all(axis=1)
reviews.loc[mask, location_cols]

In [None]:
# rows where the country is the only location information present
mask = reviews.country.notna() & reviews[['region_1', 'region_2', 'province']].isna().all(axis=1)
reviews.loc[mask, location_cols]

In [None]:
# rows located by country and province only: neither region is given
mask = reviews[['country', 'province']].notna().all(axis=1) & reviews[['region_1', 'region_2']].isna().all(axis=1)
reviews.loc[mask, location_cols]

In [None]:
# location only contains country information, in redundant forms
def test(row):
  return (str(row.country) in str(row.province)) or (str(row.country) in str(row.region_1)) or (str(row.country) in str(row.region_2))

# flag every row for which test() reports a field that repeats the country name
mask = reviews[location_cols].swifter.apply(test, axis=1)
# NOTE(review): all three fields are cleared even when only one of them matched,
# which also discards an informative region_1/region_2 — confirm this is intended.
reviews.loc[mask,['region_1','region_2','province']] = None
reviews.loc[mask,location_cols]

In [None]:
def test(row):
  return (row.region_2 == row.region_1) or ('Other' in str(row.region_2))

# clear region_2 on the rows flagged by test(), then display those rows
mask = reviews.swifter.apply(test, axis=1)
reviews.loc[mask, 'region_2'] = None
reviews[mask]

In [None]:
def test(row):
  return (row.province is not None) and (row.region_1 == row.province) or ('Other' in str(row.region_1))

# clear region_1 on the rows flagged by test(), then display their location columns
mask = (reviews.swifter.apply(test, axis=1))
reviews.loc[mask, 'region_1'] = None
reviews.loc[mask,location_cols]

In [None]:
# clear region_1 wherever it mentions 'Vin Santo' or 'Vin Doux';
# na=False makes missing region_1 values count as "no match"
mask = reviews.region_1.str.contains('Vin Santo|Vin Doux', na=False)
reviews.loc[mask, 'region_1'] = None
reviews.loc[mask, location_cols]

In [None]:
np.unique(reviews[(reviews.region_1.notna())&(reviews.region_1.str.contains('Vin'))].region_1)

In [None]:
# Distinct region_1 values that do not contain 'Vin'. The element-wise negation (~)
# is required: `not all(...)` collapses the whole column into a single bool, which
# selected every non-null row instead of filtering.
np.unique(reviews[reviews.region_1.notna() & ~reviews.region_1.str.contains('Vin', na=False)].region_1)

In [None]:
np.unique(reviews[(reviews.region_2.notna())].region_2)

In [None]:
# (winery, country) pairs that occur exactly once among the rows having some
# region or province information; `expected` counts the rows that will receive a query
mask = (
  reviews[(reviews.region_1.notna()) | (reviews.region_2.notna()) | reviews.province.notna()]
  .groupby(['winery', 'country'])
  .count()[['title']]
  .rename(columns={'title': 'count'})
  .query('count == 1')
  .index
  .to_list()
)
expected = len(mask)
mask

In [None]:
# `mask` arrives here as the list of (winery, country) pairs that occur exactly once.
# A set makes each membership test O(1) instead of scanning the list for every row.
unique_pairs = set(mask)
# notna() treats NaN and None alike; the former `is not None` test was True for NaN,
# so it never filtered out rows lacking location information.
has_location = reviews[['region_1', 'region_2', 'province']].notna().any(axis=1)
is_unique_pair = pd.Series(
  [pair in unique_pairs for pair in zip(reviews.winery, reviews.country)],
  index=reviews.index,
)
mask = has_location & is_unique_pair
# a winery that is unique within its country is searched by name alone
reviews.loc[mask, 'q'] = reviews.loc[mask, 'winery']

In [28]:
assert reviews[reviews.q.notna()].shape[0] == expected

In [None]:
# rows still without a query and without any region/province: query by winery name alone
mask = reviews.q.isna() & reviews[['region_1', 'region_2', 'province']].isna().all(axis=1)
expected += len(reviews[mask])
reviews.loc[mask, 'q'] = reviews.loc[mask, 'winery']
assert len(reviews[reviews.q.notna()]) == expected
reviews[reviews.q.notna()]

In [None]:
reviews[(reviews.q.notna())&(reviews.country == 'US')]

In [None]:
# rows still without a query whose only sub-country information is the province:
# query by "winery, province"
mask = reviews.q.isna() & reviews.province.notna() & reviews[['region_1', 'region_2']].isna().all(axis=1)
expected += len(reviews[mask])
reviews.loc[mask, 'q'] = reviews.loc[mask, 'winery'].str.cat(reviews.loc[mask, 'province'], sep=', ')
assert len(reviews[reviews.q.notna()]) == expected
reviews.loc[mask]

In [None]:
reviews[reviews.q.notna()]

In [None]:
# remaining US rows: query by "winery, <most specific location available>"
mask = (reviews.q.isna())&(reviews.region_1.notna()|reviews.region_2.notna()|reviews.province.notna())&(reviews.country == 'US')
expected += reviews[mask].shape[0]
# Coalesce region_1 -> region_2 -> province. fillna() treats NaN and None alike,
# whereas the former `is not None` test let NaN through and ', '.join() then
# raised TypeError on a float.
best_location = (
  reviews.loc[mask, 'region_1']
  .fillna(reviews.loc[mask, 'region_2'])
  .fillna(reviews.loc[mask, 'province'])
)
reviews.loc[mask, 'q'] = reviews.loc[mask, 'winery'] + ', ' + best_location
assert reviews[reviews.q.notna()].shape[0] == expected
reviews.loc[mask]

In [None]:
# remaining rows whose country is not 'US' (a missing country also lands here):
# query by "winery, <most specific location available>"
mask = (reviews.q.isna())&(reviews.region_1.notna()|reviews.region_2.notna()|reviews.province.notna())&(reviews.country != 'US')
expected += reviews[mask].shape[0]
# Coalesce region_1 -> region_2 -> province. fillna() treats NaN and None alike,
# whereas the former `is not None` test let NaN through and ', '.join() then
# raised TypeError on a float.
best_location = (
  reviews.loc[mask, 'region_1']
  .fillna(reviews.loc[mask, 'region_2'])
  .fillna(reviews.loc[mask, 'province'])
)
reviews.loc[mask, 'q'] = reviews.loc[mask, 'winery'] + ', ' + best_location
assert reviews[reviews.q.notna()].shape[0] == expected
reviews.loc[mask]

In [None]:
# a missing designation falls back to the variety; the rows displayed afterwards
# are those where the designation is still missing
reviews['designation'] = reviews.designation.fillna(reviews.variety)
reviews.loc[reviews.designation.isna()]

In [None]:
reviews[(reviews.country == 'US') & (~reviews.province.isin(['America','Washington-Oregon']))]

In [37]:
def geocode_winery(row):
  """Mock geocoder: return a LOCATION built from a random address in the row's state.

  ``row.province`` is translated through ``us_state_to_abbrev`` (KeyError when it
  is not a key there). When the address returned for that state has no
  'coordinates' entry, an empty ``LOCATION()`` is returned.
  """
  location = real_random_address_by_state(us_state_to_abbrev[row.province])
  if 'coordinates' not in location:
    return LOCATION()
  # relies on the coordinates mapping yielding latitude first, then longitude
  lat, lon = location['coordinates'].values()
  return LOCATION(lat, lon, location)


In [None]:

tmp = reviews[(reviews.country == 'US') & (~reviews.province.isin(['America','Washington-Oregon']))].sample(100)
tmp.info()
tmp

In [None]:
geoloc = pd.DataFrame(tmp.swifter.apply(geocode_winery, axis=1).to_list(), index=tmp.index)
geoloc.info()
geoloc

In [None]:
pd.concat([tmp, geoloc], axis=1)

In [None]:
# Geocode every US review whose province is not 'America' / 'Washington-Oregon'.
# The filter is evaluated once so the geocoded rows and the index handed to the
# result frame are guaranteed to come from the same selection.
us_wineries = reviews[(reviews.country == 'US') & (~reviews.province.isin(['America','Washington-Oregon']))]
addresses = pd.DataFrame(
  us_wineries.swifter.apply(geocode_winery, axis=1).to_list(),
  index=us_wineries.index
)
addresses

In [None]:
pd.concat([reviews, addresses], axis=1)

In [None]:
reviews[(reviews.country == 'US') & (~reviews.province.isin(['America','Washington-Oregon']))].info()

In [44]:
from ratelimit import limits, sleep_and_retry
from runstats import Statistics

stats = Statistics()

# Timestamp (ns) at which the previous lookup finished; None until the first call.
# It must live outside the function: as a local it was reset to None on every call,
# so the stats.push() branch could never run.
_last_lookup_ns = None

# throttle search on geocode service
@sleep_and_retry
@limits(calls=10, period=1)  # Adjust rate limits as needed
def geocode(name:str, country:str, province:str = None):
  """Rate-limited mock geocoder returning a LOCATION for a US state.

  Args:
    name: winery name; not used by the mock lookup.
    country: country name; not used by the mock lookup.
    province: full state name, translated through ``us_state_to_abbrev``
      (KeyError when it is not a key there, including the default None).

  Returns:
    ``LOCATION(lat, lon, address)`` when the random address carries
    'coordinates', otherwise an empty ``LOCATION()``.

  Side effects:
    From the second call on, pushes to ``stats`` the nanoseconds elapsed
    between the end of the previous lookup and the start of this one.
  """
  global _last_lookup_ns
  if _last_lookup_ns is not None:
    stats.push(timer() - _last_lookup_ns)
  # Implement mock logic
  location = real_random_address_by_state(us_state_to_abbrev[province])
  _last_lookup_ns = timer()
  if 'coordinates' not in location:
    return LOCATION()
  # relies on the coordinates mapping yielding latitude first, then longitude
  lat, lon = location['coordinates'].values()
  return LOCATION(lat, lon, location)

In [45]:
from cachetools import LFUCache, cached
from shelved_cache import PersistentCache
from shelved_cache.keys import autotuple_hashkey

# LFU cache of up to 1024*32 entries wrapped in a PersistentCache keyed on `filename`
# (presumably stored on disk under that name so results survive kernel restarts — TODO confirm)
filename = 'winery-geolocation-cache'
pc = PersistentCache(LFUCache, filename, maxsize=1024*32)

# info=True is what makes get_geocode.cache_info() available
@cached(cache=pc, key=autotuple_hashkey, info=True)
def get_geocode(name:str, country:str, province:str = None):
  """Cached front end to geocode(); arguments are forwarded unchanged.

  The cache key is derived from the call arguments by ``autotuple_hashkey``,
  so repeated calls with the same arguments are served from ``pc``.
  """
  return geocode(name, country, province)

In [None]:
def geocode_winery(row):
  """Geocode one review row through the cached get_geocode().

  Passes ``row.winery``, ``row.country`` and ``row.province`` positionally.
  NOTE: this redefines the earlier mock ``geocode_winery`` in this notebook;
  once this cell has run, the earlier definition is shadowed.
  """
  return get_geocode(row.winery, row.country, row.province)

In [None]:
# Geocode every US review whose province is not 'America' / 'Washington-Oregon'.
# The filter is evaluated once so the geocoded rows and the index handed to the
# result frame are guaranteed to come from the same selection.
us_wineries = reviews[(reviews.country == 'US') & (~reviews.province.isin(['America','Washington-Oregon']))]
addresses = pd.DataFrame(
  us_wineries.swifter.apply(geocode_winery, axis=1).to_list(),
  index=us_wineries.index
)
addresses

In [None]:
get_geocode.cache_info()

In [None]:
import geopandas as gpd

# Attach the geocoded columns to the reviews and build a GeoDataFrame of points.
# Columns left over from a previous run of this cell are dropped first: re-running
# would otherwise duplicate them, and reviews.lon / reviews.lat would then return
# DataFrames instead of Series.
# NOTE(review): assumes no original review column shares a name with an
# `addresses` column — confirm against the LOCATION field names.
reviews = pd.concat(
  [reviews.drop(columns=addresses.columns, errors='ignore'), addresses],
  axis=1,
)
gdf = gpd.GeoDataFrame(
  reviews,
  geometry=gpd.points_from_xy(reviews.lon, reviews.lat)
)
gdf.info()
gdf