In [1]:
#import dependencies

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gmaps
import numpy as np
import pandas as pd
import requests
import time
from us import states
import json
from config import gkey

In [2]:
# Load the wine review dataset from the CSV in the working directory.
# The file is decoded as ISO-8859-1 (Latin-1).

wine_df = pd.read_csv(
    "winemag-data-130k-v2.csv",
    encoding="ISO-8859-1",
)
wine_df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkíæ Bianco,87,,Sicily & Sardinia,Etna,,Kerin Oäó»Keefe,@kerinokeefe,Nicosia 2013 Vulkíæ Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwineæ,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwineæ,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
# Keep only the columns used downstream; the reviewer name and Twitter handle are not needed.
wine_df = wine_df[['country', 'description', 'designation', 'points', 'price',
                   'province', 'region_1', 'region_2', 'title', 'variety', 'winery']]

# Drop reviews that are missing a province, a score or a price: they can be
# neither grouped by province nor averaged.
wine_df = wine_df.dropna(subset=['province', 'points', 'price'])

# Collapse to one row per province (mean points, mean price) so that each
# province is geocoded only once, which cuts down on API calls.
# The numeric columns are selected explicitly: calling mean() on a group that
# still contains text columns raises TypeError on pandas >= 2.0.
wine_provinces = wine_df.groupby('province', as_index=False)[['points', 'price']].mean()
wine_provinces.head()

Unnamed: 0,province,points,price
0,Achaia,85.8,23.0
1,Aconcagua Costa,87.62963,23.074074
2,Aconcagua Valley,88.192982,40.298246
3,Aegean,88.954545,33.409091
4,Agioritikos,86.333333,24.0


In [4]:
# Look up latitude/longitude for every province with the Google Geocoding API.
# A failed lookup does not raise: the API answers HTTP 200 with an empty
# "results" list, so that case is handled explicitly below.
#
# The address and key are passed through `params` so that requests URL-encodes
# them (province names contain spaces, '&' and non-ASCII characters), and the
# request URL is never printed because it contains the API key.
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

lat = []
lon = []
provinces = []

for province in wine_provinces['province']:
    try:
        response = requests.get(
            GEOCODE_URL,
            params={'address': province, 'key': gkey},
            timeout=10,  # do not let one stalled request hang the whole loop
        )
    except requests.RequestException as err:
        # Only the exception type is shown: its message can embed the full URL (and key).
        print('Got error on %s (%s)' % (province, type(err).__name__))
        continue

    if not response.ok:
        print('Got error on %s' % province)
        print(response)
        continue

    print(f'processing {province}')
    geo_data = response.json()
    results = geo_data.get('results')
    if not results:
        print('Skipping %s because no data' % province)
        # A status other than ZERO_RESULTS (e.g. a quota or key problem) means the
        # province was not really "not found"; surface it so it can be retried.
        status = geo_data.get('status')
        if status not in (None, 'ZERO_RESULTS'):
            print('  geocoder status: %s' % status)
        continue

    # The first result is taken as the match. NOTE(review): only the bare province
    # name is sent, so ambiguous names can resolve to an unrelated place.
    location = results[0]['geometry']['location']
    lat.append(location['lat'])
    lon.append(location['lng'])
    provinces.append(province)

processing Achaia
processing Aconcagua Costa
processing Aconcagua Valley
processing Aegean
processing Agioritikos
Skipping Agioritikos because no data
processing Ahr
processing Alenquer
processing Alentejano
processing Alentejo
processing Alenteo
processing Algarve
processing Alsace
processing America
processing Amindeo
processing Amyndeon
processing Andalucia
processing Ankara
processing Apalta
processing Arcadia
processing Arizona
processing Armenia
processing Atalanti Valley
processing Atlantida
Skipping Atlantida because no data
processing Attica
processing Australia Other
processing Austria
processing Awatere Valley
processing Baden
processing Bairrada
processing Beaujolais
Skipping Beaujolais because no data
processing Beira Atlantico
processing Beira Interior
processing Beiras
processing Bekaa Valley
processing Beotia
processing Black Sea Coastal
processing Bordeaux
processing Bot River
processing Brazil
processing Brda
processing Breede River Valley
processing Breedekloof
proce

processing Podunavlje
Skipping Podunavlje because no data
processing Polkadraai Hills
processing Port
processing Portugal
processing Portuguese Table Wine
processing Primorska
processing Progreso
processing Provence
processing Puente Alto
processing Rapel Valley
processing Rapsani
processing Recas
processing Requinoa
processing Retsina
processing Rheingau
processing Rheinhessen
Skipping Rheinhessen because no data
processing Rhode Island
processing RhíÇne Valley
Skipping RhíÇne Valley because no data
processing Ribatejano
Skipping Ribatejano because no data
processing Ribatejo
processing Rio Claro
processing Robertson
processing Romania
processing Sagrada Familia
processing Samos
processing Samson
processing San Antonio
processing San Antonio de las Minas Valley
processing San Clemente
processing San Jose
processing San Vicente
processing Santa Catarina
processing Santa Cruz
processing Santorini
processing Sebes
processing Serra Gaí_cha
Skipping Serra Gaí_cha because no data
processing

In [5]:
# Attach the geocoded coordinates and the per-province averages to every review,
# then build a one-row-per-province table with just what the maps need.
# Both tables are saved as CSV so the API calls do not have to be rerun every time.
geo_dict = {
    "province": provinces,
    "lat": lat,
    "lon": lon
}
geo_df = pd.DataFrame(geo_dict)

# Left-merge, then drop the reviews whose province could not be geocoded.
full_set = wine_df.merge(geo_df, how="left", on="province")
full_set = full_set.dropna(subset=['lat'])

# The second merge brings in the province means; pandas suffixes the clashing
# columns with _x (per review) and _y (per province), which are renamed here.
full_set = full_set.merge(wine_provinces, how="left", on="province")
full_set = full_set.rename(columns={
    "points_x": "points",
    "price_x": "price",
    "points_y": "province average points",
    "price_y": "province average price",
})

full_set.to_csv("full_dataset.csv", index=False, header=True)

# Average only the numeric columns: mean() over the text columns
# (description, title, ...) raises TypeError on pandas >= 2.0.
map_columns = ["points", "price", "lat", "lon",
               "province average points", "province average price"]
map_set = full_set.groupby("province", as_index=False)[map_columns].mean()

# "Utility" is average points per dollar of average price.
map_set["province average utility"] = map_set["province average points"]/map_set["province average price"]

# Display-formatted columns; note that "points" and "province average utility"
# become strings here and are cast back to float when the CSV is read.
map_set["price dollars"] = map_set["price"].map("${:.2f}".format)
map_set["points"] = map_set["points"].map("{:.2f}".format)
map_set["province average utility"] = map_set["province average utility"].map("{:.2f}".format)
map_set.to_csv("map_dataset.csv", index=False, header=True)
map_set.head()

Unnamed: 0,province,points,price,lat,lon,province average points,province average price,province average utility,price dollars
0,Achaia,85.8,23.0,38.115873,21.952249,85.8,23.0,3.73,$23.00
1,Aconcagua Costa,87.63,23.074074,-32.653179,-70.010867,87.62963,23.074074,3.8,$23.07
2,Aconcagua Valley,88.19,40.298246,-32.653179,-70.010868,88.192982,40.298246,2.19,$40.30
3,Aegean,88.95,33.409091,39.019184,25.268555,88.954545,33.409091,2.66,$33.41
4,Ahr,90.22,59.111111,41.122313,-73.372417,90.222222,59.111111,1.53,$59.11


In [6]:
# Reload the saved CSVs so the maps can be drawn without rerunning the API calls.
full_csv = pd.read_csv("full_dataset.csv")
map_csv = pd.read_csv("map_dataset.csv")

# Heatmap of average rating by province -- the redder the better.
gmaps.configure(api_key=gkey)
locations = map_csv[["lat", "lon"]].astype(float)
ratings = map_csv["points"].astype(float)

fig1 = gmaps.figure()

# The layer options are passed once, to the constructor; the original set the
# same three attributes again afterwards, which was redundant.
heat_layer = gmaps.heatmap_layer(
    locations,
    weights=ratings,
    dissipating=False,
    max_intensity=100,
    point_radius=1,
)
fig1.add_layer(heat_layer)

fig1

Figure(layout=FigureLayout(height='420px'))

In [7]:
# Map with a popup info box for each province: average utility, average points
# and average price. Wine-colored dots are used rather than markers because
# markers were overcrowded.

data_dict = map_csv.to_dict('records')

# One <dt>/<dd> pair per field. The placeholders are column names of map_csv;
# every <dd> is closed (the Average Utility row was previously left open).
info_box_template = """
<dl>
<dt>Province</dt><dd>{province}</dd>
<dt>Average Utility</dt><dd>{province average utility}</dd>
<dt>Average Rating</dt><dd>{points}</dd>
<dt>Average Price</dt><dd>{price dollars}</dd>
</dl>
"""
# Each record is a dict keyed by column name, so it can be unpacked into format().
wine_info = [info_box_template.format(**row) for row in data_dict]

fig2 = gmaps.figure()
marker_layer = gmaps.symbol_layer(
    locations,
    fill_color='maroon',
    stroke_color='maroon',
    scale=2,
    info_box_content=wine_info,
)

fig2.add_layer(marker_layer)
fig2



Figure(layout=FigureLayout(height='420px'))

In [8]:
# Heatmap of average price by province.

prices = map_csv["price"].astype(float)

fig3 = gmaps.figure()

# The layer options are passed once, to the constructor; the original set the
# same three attributes again afterwards, which was redundant.
heat_layer = gmaps.heatmap_layer(
    locations,
    weights=prices,
    dissipating=False,
    max_intensity=100,
    point_radius=1,
)
fig3.add_layer(heat_layer)

fig3


Figure(layout=FigureLayout(height='420px'))

In [11]:
# Heatmap of average utility (points per dollar) by province.
utility = map_csv["province average utility"].astype(float)

fig4 = gmaps.figure()

# The weights are multiplied by 10 before plotting -- presumably to bring the
# small utility values closer to the max_intensity of 100; confirm the intent.
# The layer options are passed once, to the constructor; the original set the
# same three attributes again afterwards, which was redundant.
heat_layer = gmaps.heatmap_layer(
    locations,
    weights=utility*10,
    dissipating=False,
    max_intensity=100,
    point_radius=1,
)
fig4.add_layer(heat_layer)

fig4

Figure(layout=FigureLayout(height='420px'))