In [1]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
import json
from collections import Counter, defaultdict
import shutil
import urllib.request
import urllib

In [2]:
# Default page fetched by get_links().
URL = "https://en.wikipedia.org/wiki/List_of_cocktails"
# check_pattern() accepts an href only when this pattern matches at its start.
ALLOWED = re.compile('/wiki/')
# check_pattern() rejects an href when any of these fragments occurs anywhere in it.
NOT_ALLOWED = re.compile('History|List|:|drinks|#')
# Default section headline text that get_links() passes to find_prev_link()
# to determine the link at which collection stops.
END_BLOCK = "Other"

def check_pattern(link, allowed=ALLOWED, not_allowed=NOT_ALLOWED):
    """
    Decide whether an href should be kept.
    :param link: href string, or None
    :param allowed: compiled pattern that must match at the start of the href
    :param not_allowed: compiled pattern that must not occur anywhere in the href
    :return: True when the href is kept, False otherwise
    """
    if link is None:
        return False
    if not allowed.match(link):
        return False
    return not_allowed.search(link) is None
    

def get_id(soup, header_name):
    """
    Find the id of the headline element whose text equals header_name.
    :param soup: parsed document (or sub-tree) to search
    :param header_name: exact headline text to look for
    :return: value of the headline's ``id`` attribute, or None when no
             <h2> contains a matching ``mw-headline`` element
    """
    for heading in soup.find_all('h2'):
        # Some <h2> elements carry no 'mw-headline' child; skip them instead
        # of indexing into an empty result list.
        headline = heading.find(class_='mw-headline')
        if headline is not None and headline.text == header_name:
            return headline.get('id')
    return None
                
def find_prev_link(soup, header_name):
    """
    Return the href of the last anchor that precedes the <h2> following
    the section headline named header_name.
    :param soup: parsed document (or sub-tree) to search
    :param header_name: headline text identifying the section
    :return: href of that anchor (None if the anchor has no href)
    :raises ValueError: when the headline, the following <h2>, or a
                        preceding anchor cannot be found
    """
    header_id = get_id(soup, header_name)
    if header_id is None:
        # Searching with id=None would not look for the section at all, so
        # fail loudly instead of continuing with an unrelated element.
        raise ValueError(f'Section headline "{header_name}" was not found')

    end_block = soup.find(id=header_id)
    next_header = end_block.find_next('h2') if end_block is not None else None
    if next_header is None:
        raise ValueError(f'No <h2> follows the section "{header_name}"')

    prev_anchor = next_header.find_previous('a')
    if prev_anchor is None:
        raise ValueError(f'No link precedes the <h2> after section "{header_name}"')
    return prev_anchor.get('href')


def get_list_items(soup):
    """
    Collect the first anchor of every <li> whose href passes check_pattern().
    :param soup: parsed document (or sub-tree) to search
    :return: list of anchor elements, in document order
    """
    first_anchors = (item.find('a') for item in soup.find_all('li'))
    return [
        anchor
        for anchor in first_anchors
        if anchor is not None and check_pattern(anchor.get('href'))
    ]
                
    
def get_links(url=URL, header=END_BLOCK):
    """
    Scrape the list page and map link text to href for every accepted
    list-item link, stopping after the link returned by find_prev_link().
    :param url: page to download
    :param header: section headline text used to locate the stop link
    :return: dict {link text: href}; each href is recorded only once
    :raises requests.HTTPError: when the page request returns an error status
    :raises ValueError: when the page has no element with id="content"
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser').find(id='content')
    if soup is None:
        raise ValueError(f'No element with id="content" found at {url}')

    # Remove the table of contents when there is one, so its <li> entries
    # are not collected; a page without it is not an error.
    toc = soup.find(id='toc')
    if toc is not None:
        toc.extract()

    last_link = find_prev_link(soup, header)

    links = {}
    seen_hrefs = set()  # constant-time duplicate check instead of scanning links.values()
    for anchor in get_list_items(soup):
        href = anchor.get('href')
        if href not in seen_hrefs:
            seen_hrefs.add(href)
            # .string is None when the anchor wraps more than one child node;
            # fall back to the concatenated text so the key is never None.
            name = anchor.string if anchor.string is not None else anchor.text
            links[name] = href
        if href == last_link:
            break

    return links
        

In [3]:
links = get_links()


In [4]:
len(links)

280

In [14]:
# Names present in `links` that are missing from `links2`.
# NOTE(review): `links2` is assigned in a cell that appears further down
# (In [12]); on a fresh kernel that cell has to run before this one.
[x for x in links.keys() if x not in links2.keys()]

['Vargtass']

In [12]:
# Load the previously saved link mapping; the context manager closes the
# file handle as soon as the JSON has been read.
with open('cocktails_wiki.json', 'rb') as links_file:
    links2 = json.load(links_file)

In [10]:
len(links2)

301

In [111]:
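# TODO: add a parameter controlling whether existing data is rewritten
# TODO: tests
# TODO: get ingredients, build a table showing which fields are present, add the picture caption to the data
# TODO: download pictures
# TODO: get the first paragraph of every cocktail article as a summary
# TODO: get meanings for ingredients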


False

In [83]:
# Persist the scraped link mapping; the context manager guarantees the file
# is flushed and closed once the dump finishes.
with open('cocktails_wiki.json', 'w') as links_file:
    json.dump(links, links_file)

In [8]:
def count_unique_rows(links):
    """
    Count how often each infobox header text occurs across the linked pages.
    :param links: dict whose values are page paths appended to
                  "https://en.wikipedia.org"
    :return: defaultdict(int) mapping <th> text to its number of occurrences
    """
    header_counts = defaultdict(int)
    for page_path in links.values():
        page = requests.get("https://en.wikipedia.org" + page_path)
        parsed = BeautifulSoup(page.content, 'html.parser')
        infobox = parsed.find("table", {"class": "infobox"})
        if not infobox:
            # Page has no infobox table; nothing to count.
            continue
        for header_cell in infobox.find_all('th'):
            header_counts[header_cell.text] += 1
    return header_counts

        

In [81]:
rows = count_unique_rows(links)

In [82]:
rows

defaultdict(int,
            {'Cocktail': 150,
             'Type': 217,
             'Primary alcohol by volume': 177,
             'Standard drinkware': 200,
             'Served': 190,
             'Commonly used ingredients': 134,
             'Preparation': 195,
             'Standard garnish': 121,
             'IBA official cocktail': 66,
             'IBA specifiedingredients': 66,
             'Timing': 44,
             'Common alcohol(s)': 5,
             'Notes': 35,
             'Ingredients as listed at CocktailDB': 4,
             'The Day After (Edvard Munch, 1894-95)': 1,
             'Product type': 1,
             'Owner': 1,
             'Country': 2,
             'Introduced': 2,
             'Previous owners': 1,
             'Website': 1,
             'Country of origin': 1,
             'Official name': 1,
             'Year established': 1,
             'No. of vineyards': 1,
             'Wine produced': 1,
             'Comments': 1,
             'Course': 1,


In [22]:
# TODO: id for pictures from CocktailDB
# TODO: 1st paragraph as a summary
# TODO: info on alcohols, with pictures
# TODO: info on bartender terminology
# TODO: info on glass types, with pictures

In [75]:
# Default `path` prefix of parse_infobox(); save_img() concatenates it with
# the image name to build the download target.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PATH_DATA ='/Users/marina/dev/barchik/data'

# Lower-cased field names that format_data() keeps by default; every other
# key of the parsed infobox dict is deleted.
DESC_FIELDS = ['type','primary alcohol by volume','standard drinkware', 'served', 'preparation','standard garnish', \
               'timing','iba official cocktail', 'ingredients', 'alcohol by volume']


def img_link(tbl):
    """
    Return the image URL found in the first <td> of a table.
    :param tbl: table element exposing find_all('td')
    :return: URL of the last ``srcset`` candidate when ``srcset`` is present
             and non-empty (presumably the highest-density variant — confirm),
             otherwise the first token of ``src``; None when the table has no
             <td>, the first <td> has no <img>, or the image has no usable URL
    """
    cells = tbl.find_all('td')
    if not cells:
        # Table without any data cell: there is no first cell to inspect.
        return None

    img = cells[0].find('img')
    if img is None:
        return None

    srcset = img.attrs.get('srcset')
    if srcset:
        # Candidates are comma separated, each being "<url> <descriptor>".
        last_candidate = srcset.split(',')[-1].split()
        if last_candidate:
            return last_candidate[0]

    src = img.attrs.get('src')
    src_tokens = src.split() if src else []
    return src_tokens[0] if src_tokens else None

def save_img(img, img_name, path):
    """
    Download an image to ``path + img_name + '.JPG'``.
    :param img: image URL; "https:" is prepended unless it already starts
                with an http(s) scheme
    :param img_name: name appended to path; may contain directory separators
    :param path: prefix of the target file name
    """
    import os

    # Only prefix the scheme when it is missing, so an absolute URL is not
    # turned into "https:https://...".
    url = img if img.startswith(('http://', 'https://')) else "https:" + img
    target = path + img_name + '.JPG'

    # img_name can contain sub-directories (parse_infobox passes the page
    # path, e.g. '/wiki/...'), so make sure the directory exists first.
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # NOTE(review): '.JPG' is appended regardless of the actual image format.
    urllib.request.urlretrieve(url, target)
    

def format_data(desc_dict, fields = DESC_FIELDS):
    """
    Normalise a parsed infobox dict in place.
    - every ingredient alias key is renamed to 'ingredients'
      (when several are present, the last one in dict order wins)
    - 'iba official cocktail' becomes 1 when the key was present, else 0
    - every key not listed in fields is deleted
    :param desc_dict: dict of lower-cased infobox field names to values;
                      an empty dict is returned unchanged
    :param fields: field names to keep
    :return: the same dict object, modified
    """
    if not desc_dict:
        return desc_dict

    ingredient_aliases = ('commonly used ingredients', 'main ingredients',
                          'ingredients as listed at cocktaildb', 'iba specifiedingredients')

    # Iterate over a snapshot of the keys: popping and inserting keys while
    # iterating the dict itself raises RuntimeError on current Python.
    for key in list(desc_dict):
        if key in ingredient_aliases:
            desc_dict['ingredients'] = desc_dict.pop(key)

    desc_dict['iba official cocktail'] = 1 if 'iba official cocktail' in desc_dict else 0

    for key in [k for k in desc_dict if k not in fields]:
        del desc_dict[key]

    return desc_dict
    


def _infobox_to_dict(tbl):
    """Pair each header cell with the data cell of the same table row."""
    table_data = {}
    for row in tbl.find_all('tr'):
        header = row.find('th')
        if header is None:
            # Row without a header cell (e.g. an image-only row): no field name.
            continue
        key = header.text.strip().lower()
        cell = row.find('td')
        if cell is None:
            # Header-only row: record the key so presence checks such as
            # 'iba official cocktail' still work, without clobbering a value.
            table_data.setdefault(key, '')
        else:
            table_data[key] = cell.text
    return table_data


def parse_infobox(links, path=PATH_DATA):
    """
    Download every linked page, parse its infobox table and save its image.
    :param links: dict {cocktail name: page path appended to
                  "https://en.wikipedia.org"}
    :param path: prefix handed to save_img() for downloaded images
    :return: defaultdict(dict) mapping cocktail name to the dict produced by
             format_data(); pages without an infobox table are left out
    """
    cocktail_data = defaultdict(dict)

    for name, page_path in links.items():
        response = requests.get("https://en.wikipedia.org" + page_path, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        tbl = soup.find("table", {"class": "infobox"})
        if tbl is None:
            continue

        # Headers and values are paired row by row. Zipping all <th> with all
        # <td> positionally shifts every field whenever header-only rows and
        # data-only rows do not balance out.
        table_data = _infobox_to_dict(tbl)
        table_data['photo'] = img_link(tbl)
        if table_data['photo']:
            save_img(table_data['photo'], page_path, path)

        cocktail_data[name] = format_data(table_data)
    return cocktail_data


In [77]:
data = parse_infobox(links)

Death in the Afternoon
Black and Tan
Black Velvet
Boilermaker
Hangman's Blood
Irish Car Bomb
Michelada
Porchcrawler
Sake Bomb
Shandy
Snakebite
U-Boot
B & B
The Blenheim
Blow my Skull Off
Brandy Alexander
Brandy Manhattan
Brandy Sour (Cyprus)
Brandy Sour
Chicago Cocktail
Curacao Punch
Diki-Diki
Four Score
French Connection
Hennchata
Horse's Neck
Incredible Hulk
Jack Rose
Panama
Paradise
Pisco Sour
Porto flip
Savoy Affair
Savoy Corpse Reviver
Sazerac
Sidecar
Singapore Sling
Stinger
Tom and Jerry
Caipirinha
20th Century
Alexander
Angel Face
Aviation
Bee's Knees
Bijou
Blackthorn
Bloody Margaret
Bramble
Breakfast martini
Bronx
Casino
Cloister
Clover Club Cocktail
Cooperstown Cocktail
Corpse Reviver #2
Damn the Weather
French 75
Derby
Gibson
Gimlet
Gin and tonic
Gin Fizz
Gin pahit
Gin sour
Greyhound
Hanky-Panky
John Collins
The Last Word
Lime Rickey
Long Island Iced Tea
Lorraine
Martini
Mickey Slim
Monkey Gland
My Fair Lady
Negroni
Old Etonian
Pegu
Pimm's Cup
Pink Gin
Pink Lady
Ramos Gin Fiz

In [None]:
'Type', 'Primary alcohol by volume', 'Standard drinkware', 'Served', 'Commonly used ingredients', 'Preparation', 'Standard garnish', 'IBA specifiedingredients', 'Timing', 
'Main ingredients', 'Ingredients', 'IBA official cocktail', 'Notes', 
'Alcohol by volume',
'Ingredients as listed at CocktailDB'

In [1]:
https://en.wikipedia.org/wiki/File:Irish_Car_Bomb.jpg
https

SyntaxError: invalid syntax (<ipython-input-1-570b9a9a6a9a>, line 1)

In [100]:
def format_df(dfs):
    """
    Turn a two-column label/value table into a dict.
    The first table is used unless it has fewer than 3 rows, in which case
    the second one is used. The row labelled 0 is discarded; for every
    remaining row the first column supplies the key and the second column
    the value.
    :param dfs: list of pandas DataFrames
    :return: dict {label: value}
    """
    table = dfs[1] if dfs[0].shape[0] < 3 else dfs[0]

    # Transpose so the labels form the first row, dropping the original row 0.
    flipped = table.T.drop(0, axis=1).reset_index(drop=True)
    flipped.columns = flipped.iloc[0]
    flipped = flipped.drop(flipped.index[0])

    result = {}
    for label, column in flipped.to_dict().items():
        # After the reset, index 1 holds the original second column.
        result[label] = column[1]
    return result

def get_data(item):
    """
    Download a page and convert one of its HTML tables with format_df().
    :param item: page path appended to "https://en.wikipedia.org"
    :return: dict produced by format_df(), or None when the page has no
             parsable table or the table has an unexpected shape
             (in that case 'err' and the item are printed)
    """
    import io

    url = "https://en.wikipedia.org"+item
    content = requests.get(url).content

    try:
        # Wrap the bytes in a file-like object: passing literal HTML to
        # read_html is deprecated in recent pandas versions.
        dfs = pd.read_html(io.BytesIO(content))
        return format_df(dfs)
    except Exception:
        # Best effort per page, but unlike a bare `except:` this still lets
        # KeyboardInterrupt / SystemExit stop a long scraping run.
        print('err', item)
        return None


    

In [33]:
import pandas as pd

def check_report(df):
    """
    Validate that a report has every required column:
    - country:   country name
    - state:     state name if reporting regional stats and None otherwise
    - confirmed: number of confirmed cases
    - deaths:    number of deaths
    - recovered: number of recovered patients
    :param df:   pandas DataFrame to check
    :raises ValueError: naming the first required column that is missing
    """
    required = ('country', 'state', 'confirmed', 'deaths', 'recovered')
    first_missing = next((name for name in required if name not in df.columns), None)
    if first_missing is not None:
        raise ValueError(f'Required report field "{first_missing}" is not in the report')



In [None]:
def get_report_countries():
    """
    Get data from Wikipedia page with
    COVID-19 statistics for each country
    :return: Pandas DataFrame with the columns country, confirmed, deaths,
             recovered and state (state is always None)
    :raises ValueError: from check_report() when a required column is missing
    """
    # NOTE(review): `utils` is not imported anywhere in the visible notebook —
    # confirm which module provides these helpers.
    url = 'https://en.wikipedia.org/wiki/2019%E2%80%9320_coronavirus_pandemic_by_country_and_territory'
    raw = utils.get_wiki_table_df(url, 'Countries, territories, and')

    # Columns 1..4 of the raw table are taken as country and the three counters.
    df = pd.DataFrame(raw.values[:, 1:5], columns=['country', 'confirmed', 'deaths', 'recovered'])

    # Work on explicit copies so the assignments below modify an independent
    # frame rather than a filtered slice (avoids SettingWithCopyWarning).
    df = df[~df['country'].isna()].copy()
    df['country'] = df['country'].apply(utils.clean_territory_name)

    # Discard rows whose country text is longer than 40 characters; written
    # as a negation so rows with an undefined length are kept, as before.
    df = df[~(df['country'].str.len() > 40)].copy()

    df = utils.wiki_table_df_numeric_column_clean(df, ['confirmed', 'deaths', 'recovered'])
    df['state'] = None
    check_report(df)
    return df


In [None]:
def get_wiki_table_df(page_url, match_string):
    """
    Get table from Wiki page by page url
    and unique string that must appear in table
    :param page_url: page url
    :param match_string: match string
    :return: the first pandas DataFrame whose string representation contains
             match_string, or None when no table matches
    """
    import io

    response = requests.get(page_url)
    # Wrap the bytes in a file-like object: passing literal HTML to
    # read_html is deprecated in recent pandas versions.
    tables = pd.read_html(io.BytesIO(response.content))
    for table in tables:
        # NOTE(review): str(table) is the display repr, which pandas may
        # truncate for large tables — a match inside the elided part is missed.
        if match_string in str(table):
            return table
    # Nothing matched: report that explicitly instead of silently returning
    # whichever table happened to be last on the page.
    return None

In [141]:
links['Pisco Sour']


'/wiki/Pisco_Sour'

In [125]:
res = {k:get_data(k) for k in links.values()}

err /wiki/Moloko_Plus


  # This is added back by InteractiveShellApp.init_path()


err /wiki/20th_Century_(cocktail)
err /wiki/Bloody_Margaret
err /wiki/Ancient_Mariner_(cocktail)
err /wiki/Fish_House_Punch
err /wiki/Grog
err /wiki/Jungle_Bird
err /wiki/Q.B._Cooler
err /wiki/Royal_Bermuda_Yacht_Club#Cocktail
err /wiki/Sumatra_Kula
err /wiki/Test_Pilot_(cocktail)
err /wiki/Ti%27_Punch
err /wiki/Mexican_martini
err /wiki/Harlem_Mugger
err /wiki/Cocktails_with_cacha%C3%A7a#Batida_(Shaken)
err /wiki/Bay_Breeze
err /wiki/Glowtini
err /wiki/Rose_Kennedy_Cocktail
err /wiki/Jungle_Juice
err /wiki/Whisky_Mac
err /wiki/Whiskey_smash
err /wiki/Oatmeal_Cookie_(cocktail)
err /wiki/Revelation_(cocktail)
err /wiki/Boomerang_(cocktail)
err /wiki/Brut_Cocktail
err /wiki/Caju_Amigo
err /wiki/Cocktails_with_cacha%C3%A7a#Leite_de_Onça_(Jaguar_Milk)
err /wiki/Cocktails_with_cacha%C3%A7a#Quentão_(Hot_Stuff)
err /wiki/Rabo-de-galo
err /wiki/Fuzzy_navel
err /wiki/Tamango_(Italian_cocktail)


In [139]:
url = "https://en.wikipedia.org"+'/wiki/Tamango_(Italian_cocktail)'
content = requests.get(url).content

In [30]:
url = "https://en.wikipedia.org/wiki/List_of_cocktails"
# content2 = requests.get(url).content

In [31]:
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [133]:
res = {k:v for k,v in res.items() if v is not None and len(v)>2}

In [155]:
# didn't make it
[x for x in links.values() if x not in res.keys() ]

['/wiki/Moloko_Plus',
 '/wiki/Beer',
 '/wiki/Boilermaker_(beer_cocktail)',
 '/wiki/Hangman%27s_Blood',
 '/wiki/Porchcrawler',
 '/wiki/Shandy',
 '/wiki/U-Boot_(beer_cocktail)',
 '/wiki/Duo_and_trio_cocktails#B_and_B',
 '/wiki/Blow_my_Skull_Off',
 '/wiki/Curacao_Punch',
 '/wiki/Diki-Diki_(cocktail)',
 '/wiki/Amaretto#French_Connection',
 '/wiki/Hennchata',
 '/wiki/Panama_(cocktail)#Trios',
 '/wiki/20th_Century_(cocktail)',
 '/wiki/Bloody_Margaret',
 '/wiki/Gin_Fizz',
 '/wiki/Gin_pahit',
 '/wiki/Hanky-Panky_cocktail',
 '/wiki/Pimm%27s_Cup_(cocktail)',
 '/wiki/Shirley_Temple_cocktail#Variations',
 '/wiki/Ancient_Mariner_(cocktail)',
 '/wiki/Caribou_Lou',
 '/wiki/Cobra%27s_Fang',
 '/wiki/Fish_House_Punch',
 '/wiki/Fluffy_Critter',
 '/wiki/Grog',
 '/wiki/Jungle_Bird',
 '/wiki/Mr._Bali_Hai',
 '/wiki/Q.B._Cooler',
 '/wiki/Royal_Bermuda_Yacht_Club#Cocktail',
 '/wiki/Sumatra_Kula',
 '/wiki/Test_Pilot_(cocktail)',
 '/wiki/Ti%27_Punch',
 '/wiki/Tschunk',
 '/wiki/Tequila#Cocktails',
 '/wiki/Duo_and

In [154]:
res

{'/wiki/Death_in_the_Afternoon_(cocktail)': {'Type': 'Wine cocktail',
  'Primary alcohol by volume': 'Champagne Absinthe',
  'Standard drinkware': 'Champagne flute',
  'Death in the Afternoon recipe at DrinkBoy': 'Death in the Afternoon recipe at DrinkBoy'},
 '/wiki/Black_and_Tan': {'Type': 'Mixed drink',
  'Served': 'Neat; undiluted and without ice',
  'Standard drinkware': 'Pint glass',
  'Commonly used ingredients': 'Pale ale or lager and stout or porter'},
 '/wiki/Black_Velvet_(beer_cocktail)': {'Type': 'Mixed drink',
  'Served': 'Straight',
  'Standard drinkware': 'Pilsner glass',
  'Commonly used ingredients': 'Stout and Champagne',
  'Preparation': 'Mix equal parts stout and Champagne'},
 '/wiki/Irish_Car_Bomb': {'Type': 'Beer cocktail',
  'Standard drinkware': 'A pub glass and a shot glass.',
  'Commonly used ingredients': 'coffee, baileys and crea',
  'Preparation': 'The whiskey is floated on top of the Irish cream in a shot glass, and the shot glass is then dropped into the s

In [134]:
len(res)

228

In [145]:
# Parse every HTML table in `content2`.
# NOTE(review): the only assignment of `content2` in the visible notebook is
# commented out, so this cell fails on a fresh kernel — confirm the source.
dfs = pd.read_html(content2)

In [147]:
dfs[0]

Unnamed: 0,IBA official cocktail,IBA official cocktail.1
0,Peruvian pisco sour,Peruvian pisco sour
1,Type,Cocktail
2,Primary alcohol by volume,Pisco
3,Served,Straight up; without ice
4,Standard garnish,Angostura bitters (1 dash)
5,Standard drinkware,Old Fashioned glass
6,IBA specifiedingredients,4.5cl Pisco 3cl lime juice 2cl simple syrup 1 ...
7,Preparation,Vigorously shake contents in a cocktail shaker...
8,Timing,All day
9,Pisco Sour recipe at International Bartenders ...,Pisco Sour recipe at International Bartenders ...


In [270]:
non = {k:v for k,v in res.items() if 'Type' not in v and 'Preparation' not in v }

In [136]:
res = {k:v for k,v in res.items() if 'Type' in v or 'Preparation' in v or 'Primary alcohol by volume' in v}

In [138]:
res

{'/wiki/Death_in_the_Afternoon_(cocktail)': {'Type': 'Wine cocktail',
  'Primary alcohol by volume': 'Champagne Absinthe',
  'Standard drinkware': 'Champagne flute',
  'Death in the Afternoon recipe at DrinkBoy': 'Death in the Afternoon recipe at DrinkBoy'},
 '/wiki/Black_and_Tan': {'Type': 'Mixed drink',
  'Served': 'Neat; undiluted and without ice',
  'Standard drinkware': 'Pint glass',
  'Commonly used ingredients': 'Pale ale or lager and stout or porter'},
 '/wiki/Black_Velvet_(beer_cocktail)': {'Type': 'Mixed drink',
  'Served': 'Straight',
  'Standard drinkware': 'Pilsner glass',
  'Commonly used ingredients': 'Stout and Champagne',
  'Preparation': 'Mix equal parts stout and Champagne'},
 '/wiki/Irish_Car_Bomb': {'Type': 'Beer cocktail',
  'Standard drinkware': 'A pub glass and a shot glass.',
  'Commonly used ingredients': 'coffee, baileys and crea',
  'Preparation': 'The whiskey is floated on top of the Irish cream in a shot glass, and the shot glass is then dropped into the s

In [279]:
[k for k in res.keys() if  'Common alcohol(s)' in res[k]]

['/wiki/Buck_(cocktail)',
 '/wiki/Gin_sour',
 '/wiki/Sour_(cocktail)#White_Lady',
 '/wiki/Sour_(cocktail)',
 '/wiki/Tequila_sour',
 '/wiki/Flip_(cocktail)',
 '/wiki/White_Lady_(cocktail)',
 '/wiki/Sour_(cocktail)#Other_sours']

In [280]:
res['/wiki/Sour_(cocktail)']

{'Type': 'Cocktail family',
 'Common alcohol(s)': 'Gin Bourbon whiskey Brandy Pisco Rum Amaretto',
 'Notes': 'See the article for specifics.'}

In [None]:
'Cocktail family'

In [None]:
 'Common alcohol(s)'


In [276]:
set([x for el in res.values() for x in el.keys() if 'International Bartenders Association' not in x and len(x)<30 ])

{'Areas',
 'Black tea',
 'Blended or flavoured teas',
 'By country',
 'Common alcohol(s)',
 'Commonly used ingredients',
 'Country of origin',
 'Culture',
 'Customs',
 'Fermented tea',
 'General',
 'Green tea',
 'Health',
 'History',
 'IBA specifiedingredients',
 'May be served flaming',
 'Notes',
 'Oolong tea',
 'Preparation',
 'Primary alcohol by volume',
 'Production and distribution',
 'Reference [1]',
 'See also',
 'Served',
 'Standard drinkware',
 'Standard garnish',
 'Tea-based drinks',
 'Timing',
 'Type',
 'Variants',
 'White tea',
 'Yellow tea'}

In [257]:
# Entries of `found` that contain 'List' or 'History'.
# NOTE(review): `found` is not defined anywhere in the visible notebook.
forb = [x for x in found if 'List' in x or 'History' in x] 

In [175]:
# Scratch version of the reshaping done in format_df(): transpose, drop the
# original row 0, promote the first row to column labels.
# NOTE(review): `df` is not assigned at top level in the visible notebook.
d = df.T.drop(0, axis=1).reset_index(drop=True)
d.columns = d.iloc[0]
d = d.drop(d.index[0])
# Drop the last column when its label mentions the International Bartenders Association.
if 'International Bartenders Association' in d.columns[-1]:
    d = d.drop(d.columns[-1], axis=1)
# Add a placeholder when the table has no 'Timing' field.
if 'Timing' not in d.columns:
    d['Timing'] = 'Not Specified'

# d.columns = ['Type', 'Primary alcohol by volume', 'Served', 'Standard garnish', 
#              'Standard drinkware', 'Ingredients', 'Preparation', 'Timing']   
# # d.columns = ['Type', 'Primary alcohol by volume', 'Served', 'Standard garnish',
# #    'Standard drinkware', 'Ingredients', 'Preparation']
# {k:v[1] for k,v in d.to_dict().items()}

In [177]:
d.columns

Index(['Type', 'Primary alcohol by volume', 'Served', 'Standard drinkware',
       'IBA specifiedingredients', 'Preparation', 'Timing'],
      dtype='object', name=0)

In [161]:
{k:v[1] for k,v in d.to_dict().items()}

{'Type': 'Cocktail',
 'Primary alcohol by volume': 'Gin',
 'Served': 'Straight up; without ice',
 'Standard garnish': 'Lemon twist and maraschino cherry',
 'Standard drinkware': 'Cocktail glass',
 'Ingredients': '4 cl gin (Old Tom) 1 cl Maraschino 1 cl fresh lemon juice 2 dashes orange bitters',
 'Preparation': 'Pour all ingredients into shaker with ice cubes. Shake well. Strain into chilled cocktail glass. Garnish with a lemon twist and a maraschino cherry. Serve without a straw.',
 'Timing': 'All Day'}

In [138]:
url = "https://en.wikipedia.org"+'/wiki/Pisco_sour'
content = requests.get(url).content