# American Viticultural Area Scrape

The following was used to scrape American Viticultural Areas (AVAs) from [Wikipedia](https://en.wikipedia.org/wiki/List_of_American_Viticultural_Areas) @ May 1, 2019.


In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
# not condoned
import warnings
warnings.filterwarnings("ignore")

In [2]:
# We chose this page to obtain a comprehensive list of AVAs
# URL of page to be scraped
url = 'https://en.wikipedia.org/wiki/List_of_American_Viticultural_Areas'

In [3]:
# Retrieve page with the requests module
response = requests.get(url, verify = False)

In [4]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Examine the results, then determine element that contains sought info
# print(soup.prettify())

In [6]:
# State, Region (if available), and AVA names/links were listed sequentially within a page content div
# so traverse the content sibling to sibling looking for matching tag names, paring off extra string content

results = soup.find('div', id="mw-content-text")
h2s = results.find_all('h2') #.find_all('span',class_='mw-headline')
avalist=[]
linkL = []

for h2 in h2s:
    wstate = h2.find('span',class_='mw-headline')
    if wstate!=None:
        if wstate.text.lower()=="references":
            #end of page content
            break 
        mystate=wstate.text
        region=""
        next = h2.next_sibling
        while next.name != "h2":
            # get Regions (h3) or AVAs (li or li>a) until next State (h2 > span.mw-headline)
            if (next.name == "h3"):
                region = next.text.split("[edit]")[0]
            elif (next.name=='ul'):
                avas=next.find_all('li')
                for a in avas:
                    ava = a.find('a')
                    if (ava==None):
                        ava = a.text
                        href=""
                    else:
                        href = ava["href"]
                        ava = ava.text
                    ava=ava.split(" AVA")[0]
                    # save AVA - link list for further content
                    linkL.append({"ava":ava, "state":mystate, "link":href})
                    print(f"state: {mystate}, region: {region}, ava: {ava}" )
                    # append record to main list
                    avalist.append({"state":mystate, "region":region, "ava":ava})
            next= next.next_sibling


state: Arizona, region: , ava: Sonoita
state: Arizona, region: , ava: Willcox
state: Arkansas, region: , ava: Altus
state: Arkansas, region: , ava: Arkansas Mountain
state: Arkansas, region: , ava: Ozark Mountain
state: California, region: Central Coast and Santa Cruz Mountains, ava: Arroyo Grande Valley
state: California, region: Central Coast and Santa Cruz Mountains, ava: Arroyo Seco
state: California, region: Central Coast and Santa Cruz Mountains, ava: Ballard Canyon
state: California, region: Central Coast and Santa Cruz Mountains, ava: Ben Lomond Mountain
state: California, region: Central Coast and Santa Cruz Mountains, ava: Carmel Valley
state: California, region: Central Coast and Santa Cruz Mountains, ava: Central Coast
state: California, region: Central Coast and Santa Cruz Mountains, ava: Chalone
state: California, region: Central Coast and Santa Cruz Mountains, ava: Cienega Valley
state: California, region: Central Coast and Santa Cruz Mountains, ava: Edna Valley
state: C

In [7]:
# Save to dataframe and .csv

avadf=pd.DataFrame(avalist, columns=["state","region","ava"])
ava_file = os.path.join("saveData","avalist.csv")
avadf.to_csv(ava_file)
avadf.head()

Unnamed: 0,state,region,ava
0,Arizona,,Sonoita
1,Arizona,,Willcox
2,Arkansas,,Altus
3,Arkansas,,Arkansas Mountain
4,Arkansas,,Ozark Mountain


## Follow the AVA links

For most AVAs recorded above, there is a link to a wiki page with table summary of AVA climate, area, and grapes to scrape.

In [21]:
linkL

[{'ava': 'Sonoita', 'state': 'Arizona', 'link': '/wiki/Sonoita_AVA'},
 {'ava': 'Willcox',
  'state': 'Arizona',
  'link': '/w/index.php?title=Willcox_AVA&action=edit&redlink=1'},
 {'ava': 'Altus', 'state': 'Arkansas', 'link': '/wiki/Altus_AVA'},
 {'ava': 'Arkansas Mountain',
  'state': 'Arkansas',
  'link': '/wiki/Arkansas_Mountain_AVA'},
 {'ava': 'Ozark Mountain',
  'state': 'Arkansas',
  'link': '/wiki/Ozark_Mountain_AVA'},
 {'ava': 'Arroyo Grande Valley',
  'state': 'California',
  'link': '/wiki/Arroyo_Grande_Valley_AVA'},
 {'ava': 'Arroyo Seco',
  'state': 'California',
  'link': '/wiki/Arroyo_Seco_AVA'},
 {'ava': 'Ballard Canyon',
  'state': 'California',
  'link': '/w/index.php?title=Ballard_Canyon&action=edit&redlink=1'},
 {'ava': 'Ben Lomond Mountain',
  'state': 'California',
  'link': '/wiki/Ben_Lomond_Mountain_AVA'},
 {'ava': 'Carmel Valley',
  'state': 'California',
  'link': '/wiki/Carmel_Valley_AVA'},
 {'ava': 'Central Coast',
  'state': 'California',
  'link': '/wiki/Ce

In [8]:
# Loop scrape of each AVA link

avadetails=[]
baseurl = "https://en.wikipedia.org"
myrec = linkL[0]
for myrec in linkL:
    link = myrec['link']
    ava = myrec['ava']
    mystate = myrec['state']
    year=""
    climate=""
    area=""
    grapeL=[]
    if link!="":
        aresp = requests.get(baseurl+link, verify = False)
        asoup = BeautifulSoup(aresp.text, 'html.parser')
        # geo = asoup.find('span', 'geo-dec').text.split(" ")
        # lat = float(geo[0].split('°')[0])
        # lng = float(geo[1].split('°')[0])
        info = asoup.find('table', 'infobox')
        if info!= None:
            yearh = info.find('th',text="Year established")
            if yearh!=None:
                year=yearh.next_sibling.text.split('[')[0]
            climth = info.find('a', title="Winkler index")
            if climth!=None:
                climate = climth.parent.next_sibling.text
            areah = info.find('th',text="Total area")
            if areah==None:
                areah = info.find('th',text="Size of planted vineyards")
            if areah!=None:
                area = areah.next_sibling.text.split('[')[0]
            grapes = info.find('th',text="Grapes produced")
            if grapes==None:
                grapes = info.find('th',text="Varietals produced")
            if grapes!=None:
                glist = grapes.next_sibling.find_all('a')
                for g in glist:
                    if (g.parent.name!="sup"):
                        grapeL.append(g.text)
                        
    grapevals = "|".join(grapeL)
    adict={"ava":ava, "state":mystate, "year":year, "climate":climate, "area": area, "grapes":grapevals}
    print(mystate+" - "+ava+":")
    print(f"year:{year}, climate:{climate}, area: {area}, grapes:{grapevals}")
    print("-----")
    avadetails.append(adict)

Arizona - Sonoita:
year:1984, climate:Subtropical continental, area: 208,000 acres (84,200 ha), grapes:Cabernet Franc|Cabernet Sauvignon|Chardonnay|Malvasia|Merlot|Mourvedre|Petite Sirah|Pinot noir|Riesling|Sangiovese|Sauvignon blanc|Syrah|Tempranillo|Viognier|Zinfandel
-----
Arizona - Willcox:
year:, climate:, area: , grapes:
-----
Arkansas - Altus:
year:1984, climate:Humid subtropical, area: 208,000 acres (84,175 ha), grapes:Cabernet Sauvignon|Chardonnay|Kerner|Müller-Thurgau|Muscadine|Niagara|Oraniensteiner|Scheurebe|Vignoles
-----
Arkansas - Arkansas Mountain:
year:1986, climate:Continental/humid subtropical, area: 2,880,000 acres (1,165,000 ha), grapes:
-----
Arkansas - Ozark Mountain:
year:1986, amended 1988, climate:Continental/humid subtropical, area: 3,520,000 acres (1,424,493 ha), grapes:Catawba|Chambourcin|Chardonel|Concord|Norton|St. Vincent|Vidal blanc|Vignoles|Villard noir
-----
California - Arroyo Grande Valley:
year:1990, climate:, area: 42,880 acres (17,353 ha), grapes

California - York Mountain:
year:1983, amended in 1987, climate:, area: 9,360 acres (3,788 ha), grapes:Cabernet Sauvignon|Chardonnay|Petit Verdot|Pinot noir|Syrah|Viognier
-----
California - Alta Mesa:
year:2006, climate:, area: 55,400 acres (224 km2), grapes:Zinfandel|Syrah|Cabernet Sauvignon|Cabernet Franc|Merlot
-----
California - Borden Ranch:
year:2006, climate:, area: 70,000 acres (28,328 ha), grapes:Cabernet Sauvignon|Chardonnay|Merlot|Syrah|Zinfandel
-----
California - Capay Valley:
year:2002, climate:, area: 102,400 acres (41,440 ha), grapes:Cabernet Franc|Cabernet Sauvignon|Syrah|Mourvedre|Tempranillo|Viognier
-----
California - Clarksburg:
year:1984, climate:, area: 64,640 acres (26,159 ha), grapes:Albarino|Cabernet Sauvignon|Chardonnay|Chenin blanc|Gewurztraminer|Grenache blanc|Malvasia|Merlot|Orange Muscat|Petite Sirah|Pinot gris|Riesling|Sauvignon blanc|Syrah|Tempranillo|Verdelho|Viognier
-----
California - Clements Hills:
year:2006, climate:, area: 85,400 acres (34,560 h

California - Mendocino Ridge:
year:1997, climate:, area: 262,400 acres (1,100 km2), grapes:Pinot Noir|Zinfandel|Syrah|Sauvignon Blanc|Chardonnay|Merlot|Riesling|Albariño|Grüner Veltliner
-----
California - Moon Mountain District Sonoma County:
year:, climate:, area: , grapes:
-----
California - Mt. Veeder:
year:1993 , climate:, area: 15,000 acres (6,100 ha) , grapes:Cabernet Franc|Cabernet Sauvignon|Chardonnay|Malbec|Merlot|Petite Sirah|Syrah|Viognier|Zinfandel
-----
California - Napa Valley:
year:1981, climate:Mediterranean, area: 43,000 acres (174 km2), grapes:Cabernet Sauvignon|Merlot|Cabernet Franc|Pinot noir|Zinfandel|Chardonnay|Sauvignon blanc
-----
California - North Coast:
year:1983, climate:Mediterranean/maritime, area: 3,000,000 acres (12,000 km2), grapes:Barbera|Cabernet Franc|Cabernet Sauvignon|Carignane|Chardonnay|Dolcetto|Gamay noir|Gewurztraminer|Lagrein|Malbec|Merlot|Muscat Canelli|Petit Verdot|Petite Sirah|Pinot Meunier|Pinot noir|Sangiovese|Sauvignon blanc|Semillon|Sy

California - Malibu Coast:
year:, climate:, area: , grapes:
-----
California - Ramona Valley:
year:2006, climate:Mediterranean, area: 89,000 acres (360 km2) , grapes:
-----
California - Saddle Rock-Malibu:
year:2006, climate:Mediterranean, area: 2,100 acres (8 km2), grapes:
-----
California - San Pasqual Valley:
year:1981, climate:Mediterranean, area: 9,000 acres (3,642 ha), grapes:Merlot|Sangiovese|Syrah|Viognier
-----
California - Sierra Pelona Valley:
year:2010, climate:Arid, area: 9.7 square miles (25 km2), grapes:
-----
California - South Coast:
year:1985, climate:Mediterranean, area: 3,000 acres (12 km2), grapes:Cabernet Franc|Cabernet Sauvignon|Chardonnay|Lenoir|Merlot|Montepulciano|Muscat Canelli|Petit Verdot|Petite Sirah|Pinot gris|Riesling|Sangiovese|Sauvignon blanc|Symphony|Syrah|Tempranillo|Trebbiano|Viognier|Zinfandel
-----
California - Temecula Valley:
year:1984 (amended in 1986, 1987, and 2004), climate:Mediterranean, Semi-arid, area: 33,000 acres (134 km2), grapes:Black

New Jersey - Warren Hills:
year:1988, climate:Continental, area: 144,640 acres (58,534 ha), grapes:
-----
New Mexico - Mesilla Valley:
year:1985, climate:, area: 280,000 acres (1,133 km2), grapes:Black Muscat|Cabernet Sauvignon|Chardonnay|Dolcetto|Malvasia|Merlot|Mourvedre|Muscat of Alexandria|Primitivo|Riesling|Sangiovese|Viognier|Zinfandel
-----
New Mexico - Middle Rio Grande Valley:
year:1988, climate:Continental, area: 278,400 acres (112,664 ha), grapes:
-----
New Mexico - Mimbres Valley:
year:1985, climate:Arid, area: 636,800 acres (257,704 ha), grapes:Cabernet Sauvignon|Chardonnay|Merlot|Syrah
-----
New York - Cayuga Lake:
year:1988, climate:Continental, area: 460 acres (186 ha), grapes:Baco noir|Cabernet Franc|Cabernet Sauvignon|Catawba|Cayuga|Chambourcin|Chancellor|Chardonnay|Concord|Delaware|Diamond|Gewurztraminer|Isabella|Ives noir|Lemberger|Marechal Foch|Melody|Merlot|Niagara|Pinot gris|Pinot noir|Riesling|Sangiovese|Seyval blanc|Syrah|Traminette|Vidal blanc|Vignoles|Viognie

Oregon - McMinnville:
year:2005, climate:Maritime, area: 40,500 acres (16,390 ha), grapes:Pinot blanc|Pinot gris|Pinot noir|Riesling
-----
Oregon - Red Hill Douglas County, Oregon:
year:2005, climate:Maritime, area: 5,500 acres (2,226 ha), grapes:
-----
Oregon - Ribbon Ridge:
year:2005, climate:Maritime, area: 3,350 acres (1,356 ha), grapes:Auxerrois Blanc|Chardonnay|Muscat Canelli|Pinot gris|Pinot noir|Gamay Noir
-----
Oregon - Rogue Valley:
year:2000, climate:Maritime, area: 1,100 acres (450 ha), grapes:Cabernet Franc|Cabernet Sauvignon|Chardonnay|Dolcetto|Gewurztraminer|Grenache|Malbec|Merlot|Pinot blanc|Pinot gris|Pinot noir|Sangiovese|Sauvignon blanc|Semillon|Syrah|Tempranillo|Viognier
-----
Oregon - Snake River Valley:
year:2007, climate:Continental, area: 8,263 square miles (21,401 km2), 5,280,000 acres (2,140,000 ha) , grapes:Cabernet Franc|Cabernet Sauvignon|Canadice|Chardonnay|Cinsault|Gewurztraminer|Grenache|Lemberger|Malbec|Merlot|Mourvedre|Riesling|Syrah
-----
Oregon - Sou

Washington - Horse Heaven Hills:
year:2005, climate:Continental/maritime, area: 570,000 acres (230,671 ha), grapes:Barbera|Cabernet Franc|Cabernet Sauvignon|Chenin blanc|Grenache|Malbec|Marsanne|Merlot|Mourvedre|Petit Verdot|Riesling|Roussanne|Sauvignon blanc|Syrah|Viognier|Zinfandel
-----
Washington - Lake Chelan:
year:, climate:, area: 24,040 acres (9,730 ha), grapes:
-----
Washington - Lewis-Clark Valley:
year:, climate:, area: , grapes:
-----
Washington - Naches Heights:
year:2012, climate:Continental, area: 13,254 acres (53.64 km2), grapes:Cabernet Franc|Cabernet Sauvignon|Malbec|Merlot|Petite Verdot|Semillon|Sauvignon blanc|Syrah|Mourvedre|Viognier|Barbera|Nebbiolo|Sangiovese|Sagrantino|Pinot grigio|White Muscat|Souzao|Tinta Cao|Touriga Nacional|Tinta Roriz
-----
Washington - Puget Sound:
year:1995, climate:Temperate, Maritime, area: 100 acres (40 ha), grapes:Madeleine Angevine|Madeleine Sylvaner|Müller-Thurgau|Pinot gris|Pinot noir|Siegerrebe
-----
Washington - Rattlesnake Hills

In [9]:
# save ava details to dataframe
adetailsdf=pd.DataFrame(avadetails, columns=["ava","state","year","area","climate","grapes"])
adetailsdf.head()

Unnamed: 0,ava,state,year,area,climate,grapes
0,Sonoita,Arizona,1984,"208,000 acres (84,200 ha)",Subtropical continental,Cabernet Franc|Cabernet Sauvignon|Chardonnay|M...
1,Willcox,Arizona,,,,
2,Altus,Arkansas,1984,"208,000 acres (84,175 ha)",Humid subtropical,Cabernet Sauvignon|Chardonnay|Kerner|Müller-Th...
3,Arkansas Mountain,Arkansas,1986,"2,880,000 acres (1,165,000 ha)",Continental/humid subtropical,
4,Ozark Mountain,Arkansas,"1986, amended 1988","3,520,000 acres (1,424,493 ha)",Continental/humid subtropical,Catawba|Chambourcin|Chardonel|Concord|Norton|S...


## Fill in some blanks

To decrease "grapes" blanks to smaller percentage, manual fill of data especially for multi-state AVAs
Looked up at [https://www.wine-searcher.com/regions](https://www.wine-searcher.com/regions)

In [10]:
# url returned pages in non-standard format

adetailsdf.loc[adetailsdf["ava"]=='Upper Mississippi River Valley', "grapes"]="Chardonel|Edelweiss|La Crosse|Marechal Foch|Frontenac|Marquette|Saint Croix"
adetailsdf.loc[adetailsdf["ava"]=='Upper Hiwassee Highlands', "grapes"] = "Vidal Blanc|Chambourcin|Cabernet Sauvignon|Chardonnay"
adetailsdf.loc[adetailsdf["ava"]=='Cumberland Valley', "grapes"] = "Chardonnay|Cabernet Franc|Vidal Blanc|Niagara"
adetailsdf.loc[adetailsdf["ava"]=='Central Delaware Valley', "grapes"] = "Chardonnay|Riesling|Cabernet Sauvignon|Delaware"
adetailsdf.loc[adetailsdf["ava"]=='High Valley', "grapes"]="Red Bordeaux|Syrah|Pinot grigio|Pinot noir|Chardonnay|Riesling|Sauvignon blanc|Gewurztraminer"
adetailsdf.loc[adetailsdf["ava"]=='Wisconsin Ledge', "grapes"]="Frontenac|Niagara|Marechal Foch"
adetailsdf.loc[adetailsdf["ava"]=='Ballard Canyon', "grapes"]="Syrah|Grenache"
adetailsdf.loc[adetailsdf["ava"]=='Ballard Canyon', "year"]="2013"
adetailsdf.loc[adetailsdf["ava"]=='Arkansas Mountain', "grapes"]="Muscadine|Niagara|Catawba"
adetailsdf.loc[adetailsdf["ava"]=='Eagle Foothills', "grapes"]="Cabernet Sauvignon|Merlot|Syrah|Mourvedre|Chardonnay|Riesling"
adetailsdf.loc[adetailsdf["ava"]=='Indiana Uplands', "grapes"]="Chambourcin|Ravat Vignoles|Chardonel|Traminette"
adetailsdf.loc[adetailsdf["ava"]=='Tip of the Mitt', "grapes"]="Frontenac|Marquette|La Crescent"
adetailsdf.loc[adetailsdf["ava"]=='Alexandria Lakes', "grapes"]="Frontenac|La Crescent|Traminette"
adetailsdf.loc[adetailsdf["ava"]=='Outer Coastal Plain', "grapes"]="Chardonnay|Cabernet Sauvignon|Vidal|Chambourcin"
adetailsdf.loc[adetailsdf["ava"]=='Middle Rio Grande Valley', "grapes"]="Zinfandel|Cabernet Sauvignon|Syrah|Riesling"
adetailsdf.loc[adetailsdf["ava"]=='Haw River Valley', "grapes"]="Cabernet Franc|Syrah|Chardonnay|Traminette"
adetailsdf.loc[adetailsdf["ava"]=='Loramie Creek', "grapes"]="Baco Noir"
adetailsdf.loc[adetailsdf["ava"]=='Lewis-Clark Valley', "grapes"]="Cabernet Sauvignon|Cabernet Franc|Merlot|Chardonnay|Syrah|Viogner"
adetailsdf.loc[adetailsdf["ava"]=='Texoma', "grapes"]="Cabernet Sauvignon|Merlot|Pinot Noir|Chardonnay"
adetailsdf.loc[adetailsdf["ava"]=='The Rocks District of Milton-Freewater', "grapes"]="Syrah"
adetailsdf.loc[adetailsdf["ava"]=='Warren Hills', "grapes"]="Vidal Blanc|Seyval Blanc|Pinot Noir|Chardonnay"
adetailsdf.loc[adetailsdf["ava"]=='Ramona Valley', "grapes"]="Cabernet Sauvignon|Petite Sirah|Tempranillo|Syrah|Barbera|Zinfandel"
adetailsdf.loc[adetailsdf["ava"]=='Antelope Valley of the California High Desert', "grapes"]="Tempranillo|Zinfandel|Syrah|Viognier"
adetailsdf.loc[adetailsdf["ava"]=='Moon Mountain District Sonoma County', "grapes"]="Cabernet Sauvignon|Syrah"
adetailsdf.loc[adetailsdf["ava"]=='Lamorinda', 'year']='2016'
adetailsdf.loc[adetailsdf["ava"]=='Lamorinda', 'grapes']='Cabernet Sauvignon|Pinot Noir'

In [11]:
# fix hypen/n-dash problem causing same AVA to appear as different
adetailsdf.loc[ 204, "ava"] = adetailsdf.loc[ 245]["ava"]
adetailsdf.loc[adetailsdf["ava"]=='The Rocks District of Milton-Freewater', "grapes"]="Syrah"

In [12]:
# remove state tacked on
adetailsdf.loc[adetailsdf["ava"]=='Red Hill Douglas County, Oregon', "ava"]="Red Hill Douglas County"
adetailsdf.loc[adetailsdf["ava"]=='Red Hill Douglas County', "grapes"]="Pinot Noir|Chardonnay|Riesling"

In [13]:
#save to csv
details_file = os.path.join("saveData","avadetails.csv")
adetailsdf.to_csv(details_file)

## Common Varietals

Get data for common varietals from [http://www.wines.com/wine-varietals/](http://www.wines.com/wine-varietals/)


In [14]:
# get data from page

varlink = "http://www.wines.com/wine-varietals/"
vresp = requests.get(varlink, verify = False)
vsoup = BeautifulSoup(vresp.text, 'html.parser')
varnameL = vsoup.find_all("h5")
varietals=[]
if varnameL!=None:
    for v in varnameL:
        name = v.text
        descnode = v.next_sibling
        desc = ""
        while descnode.name!="p":
            descnode = descnode.next_sibling
            if descnode.name == 'h5':
                break
        if (descnode.name=="p"):
            desc = descnode.text
        else:
            desc = ""
        print(f"name: {name}, desc: {desc}")
        varietals.append({"name":name, "desc":desc})


name: Albariño, desc: Spanish white wine grape that makes crisp, refreshing, and light-bodied wines.
name: Aligoté, desc: White wine grape grown in Burgundy making medium-bodied, crisp, dry wines with spicy character.
name: Amarone, desc: From Italy’s Veneto Region a strong, dry, long- lived red, made from a blend of partially dried red grapes.
name: Arneis, desc: A light-bodied dry wine the Piedmont Region of Italy
name: Asti Spumante, desc: From the Piedmont Region of Italy, A semidry sparkling wine produced from the Moscato di Canelli grape in the village of Asti
name: Auslese, desc: German white wine from grapes that are very ripe and thus high in sugar
name: Banylus, desc: A French wine made from late-harvest Grenache grapes and served with chocolate or dishes with a hint of sweetness. By law the wine must contain 15 percent alcohol.
name: Barbaresco, desc: A red wine from the Piedmont Region of Italy, made from Nebbiolo grapes it is lighter than Barolo.
name: Bardolino, desc: A l

In [15]:
# save to dataframe
varietalsdf=pd.DataFrame(varietals, columns=["name","desc"])
varietalsdf.head()

Unnamed: 0,name,desc
0,Albariño,"Spanish white wine grape that makes crisp, ref..."
1,Aligoté,White wine grape grown in Burgundy making medi...
2,Amarone,"From Italy’s Veneto Region a strong, dry, long..."
3,Arneis,A light-bodied dry wine the Piedmont Region of...
4,Asti Spumante,"From the Piedmont Region of Italy, A semidry s..."


In [16]:
# save to csv
var_file = os.path.join("saveData","varietals.csv")
varietalsdf.to_csv(var_file)

## Cheese Pairings

varietal - cheese pairings from [https://winemonger.com/blog/the-wine-and-cheese-pairing-guide-grape-by-grape-cheese-by-cheese/2012/05/17/](https://winemonger.com/blog/the-wine-and-cheese-pairing-guide-grape-by-grape-cheese-by-cheese/2012/05/17/)

In [17]:
# get data from page
clink = "https://winemonger.com/blog/the-wine-and-cheese-pairing-guide-grape-by-grape-cheese-by-cheese/2012/05/17/"
cresp = requests.get(clink, verify = False)
csoup = BeautifulSoup(cresp.text, 'html.parser')
parL = csoup.find_all('p')
pairingL = []

for p in parL:
    pstr= p.find('strong')
    if pstr==None:
        continue
    clist = p.text.split(os.linesep)
    var = clist.pop(0)
    if var=="ST.LAURENT":
        var="ST. LAURENT"
    if len(clist)<1:
        continue
    ntxt=clist[0]
    if ntxt.endswith(':'):
        del clist[0]
    if len(clist)<1:
        np = p.next_sibling
        while np.name!="p":
            np = np.next_sibling
        if '\xa0' in np.text:
            np = np.next_sibling
            while np.name!="p":
                np = np.next_sibling
        clist = np.text.split(os.linesep)
        print(f"{var} took the next para")
        print(clist)
    else:
        print(f"{var}")
        print(clist)
    print("------")
    pairingL.append({"name":var, "cheese":"|".join(clist)})
    


['JavaScript seems to be disabled in your browser.', '                    You must have JavaScript enabled in your browser to utilize the functionality of this website.                ']
------
BARBERA
['Abbaye de Belloc', 'Banon', 'Fiore Sardo', 'Fontina', 'Grana Padano', 'Lancashire', 'Ossau-Iraty', 'Piave', 'Taleggio']
------
BLAUFRANKISCH
['Cantalet', 'Feta', 'Limburger (esp. from Bavaria)', 'Monterey Jack', 'Gouda (smoked, aged, or straight up)', 'Pepper Jack', 'Piave', 'Provolone', 'Smoked cheeses', 'Sublimity (washed rind cheese from Oregon)', 'Washed Curd']
------
CABERNET SAUVIGNON
['Abbaye de Belloc', 'Ardrahan', 'Bra Tenero', 'Chalosse', 'Cheddar (sharp)', 'Comte', 'Danish Blue', 'Gouda (aged)', 'Llangloffan', 'Le Moulis', 'Ouray', 'Reblochon', 'San Andreas', 'Tome de Couserans']
------
CHARDONNAY
['Affidelice', 'Alpine Shepard', 'Bel Paese', 'Bucheron', 'Brie', 'Cambazola', 'Cantal', 'Cashel Blue', 'Chaource', 'Cotija', 'Dry Jack']
------
CHENIN BLANC
['Blue Castello', 'Ca

In [18]:
# save pairingL to dataframe
pairingdf=pd.DataFrame(pairingL, columns=["name","cheese"])
pairingdf.head()

Unnamed: 0,name,cheese
0,,JavaScript seems to be disabled in your browse...
1,BARBERA,Abbaye de Belloc|Banon|Fiore Sardo|Fontina|Gra...
2,BLAUFRANKISCH,Cantalet|Feta|Limburger (esp. from Bavaria)|Mo...
3,CABERNET SAUVIGNON,Abbaye de Belloc|Ardrahan|Bra Tenero|Chalosse|...
4,CHARDONNAY,Affidelice|Alpine Shepard|Bel Paese|Bucheron|B...


In [19]:
# nothing pairs well with disabled JavaScript
pairingdf = pairingdf.drop(0)
pairingdf

Unnamed: 0,name,cheese
1,BARBERA,Abbaye de Belloc|Banon|Fiore Sardo|Fontina|Gra...
2,BLAUFRANKISCH,Cantalet|Feta|Limburger (esp. from Bavaria)|Mo...
3,CABERNET SAUVIGNON,Abbaye de Belloc|Ardrahan|Bra Tenero|Chalosse|...
4,CHARDONNAY,Affidelice|Alpine Shepard|Bel Paese|Bucheron|B...
5,CHENIN BLANC,Blue Castello|Camembert|Derby|Fouchtra|Graddos...
6,GAMAY,Beemster XO|Brie|Camembert|Cheddar|Comte|Durru...
7,GEWURZTRAMINER,Alpine Shepard|Ardrahan|Boursin|Chevre|Durrus|...
8,GRUNER VELTLINER,Appenzeller|Brin D’Amour|Caerphilly|Chimay|Dou...
9,MALBEC,Cashel Blue|Iberico|Manchego|Mimolette|Taleggio
10,MERLOT,Abbaye de Belloc|Alpine Shepard|Cantalet|Camem...


In [20]:
# save to csv
cfile = os.path.join("saveData","pairing.csv")
pairingdf.to_csv(cfile)