In [13]:
import wikipedia as wp
import pandas as pd
import requests
from bs4 import BeautifulSoup
import gender_guesser.detector as gender

## GET PAGE

In [14]:
# Fetch the Wikipedia page that lists national anthems.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# an error page be parsed by the cells below.
r = requests.get('https://en.wikipedia.org/wiki/List_of_national_anthems', timeout=30)
r.raise_for_status()

In [15]:
# Build a searchable parse tree from the raw response bytes with the html5lib parser.
soup = BeautifulSoup(markup=r.content, features='html5lib')

## GET INFO OUT

In [16]:
# Collect every <tr> element found in the parsed document.
# find_all is the current Beautiful Soup 4 spelling; findAll is a deprecated alias.
trs = soup.find_all('tr')

In [17]:
# country can be parsed like this
def get_country(tr):
    """Return the text of the first ``<a>`` tag found inside ``tr``.

    Parameters
    ----------
    tr : object exposing ``find_all(name)`` (e.g. a Beautiful Soup ``<tr>`` tag).

    Returns
    -------
    The ``.text`` of the first anchor, or the placeholder ``'No country'`` when
    the row contains no anchor or ``tr`` cannot be searched (e.g. it is ``None``).
    """
    try:
        return tr.find_all('a')[0].text
    # Only the expected "nothing to extract" failures map to the placeholder;
    # KeyboardInterrupt, SystemExit and unrelated bugs are no longer swallowed.
    except (IndexError, AttributeError):
        return 'No country'

# other attributes can be parsed like this
def get_nth_td_text(tr, n):
    """Return the text of the ``n``-th ``<td>`` cell inside ``tr``.

    Parameters
    ----------
    tr : object exposing ``find_all(name)`` (e.g. a Beautiful Soup ``<tr>`` tag).
    n : int
        Position of the wanted cell among the row's ``<td>`` elements; it is
        used as a plain list index, so negative values count from the end.

    Returns
    -------
    The ``.text`` of the selected cell, or the placeholder ``'N/A'`` when the
    row has no such cell or ``tr`` cannot be searched (e.g. it is ``None``).
    """
    try:
        return tr.find_all('td')[n].text
    # Only the expected "cell is missing" failures map to the placeholder;
    # KeyboardInterrupt, SystemExit and unrelated bugs are no longer swallowed.
    except (IndexError, AttributeError):
        return 'N/A'
        

In [18]:
# Task-specific trimming of the scraped rows.
# NOTE(review): the offsets (skip 2 leading rows, drop 10 + 28 trailing rows) are
# presumably tuned to the page layout at scrape time - re-check them if the page
# has changed.
cut_val = -28

# Slice once and reuse the result: [2:] drops the two leading rows and
# [:cut_val - 10] drops the last 38 rows (the same rows as [:-10][:cut_val]).
data_rows = trs[2:][:cut_val - 10]

countries = [get_country(row) for row in data_rows]
# double quotes are stripped from the anthem titles
anthems = [get_nth_td_text(row, 0).replace('"', '') for row in data_rows]
years = [get_nth_td_text(row, 1) for row in data_rows]
lyrics = [get_nth_td_text(row, 2) for row in data_rows]
composers = [get_nth_td_text(row, 3) for row in data_rows]

### CHECK QUALITY

In [19]:
# Every column list must line up row-for-row before the DataFrame is built.
# On failure, report each list's length instead of an empty message.
column_lengths = {
    'countries': len(countries),
    'years': len(years),
    'anthems': len(anthems),
    'lyrics': len(lyrics),
    'composers': len(composers),
}
assert len(set(column_lengths.values())) == 1, f'column length mismatch: {column_lengths}'

## CREATE DF

In [20]:
# Assemble the table in a single step: countries become the index and the
# scraped lists become the columns, in the order given here.
df = pd.DataFrame(
    {
        'year': years,
        'anthem': anthems,
        'lyricist': lyrics,
        'composer': composers,
    },
    index=countries,
)

In [21]:
# Preview the first five rows to sanity-check the start of the parsed table.
df.head(n=5)

Unnamed: 0,year,anthem,lyricist,composer
Afghanistan,2006,Millī Surūd (National Anthem),Abdul Bari Jahani,Babrak Wassa
Albania,1912,Betimi mbi Flamur(The Pledge on the Flag),Aleksandër Stavre Drenova,Ciprian Porumbescu
Algeria,1962,Kassaman (We Pledge),Moufdi Zakaria,Mohamed Fawzi
Andorra,1914,El Gran Carlemany(The Great Charlemagne),Enric Marfany Bons,Juan Benlloch y Vivó
Angola,1975,Angola Avante(Forward Angola),Manuel Rui Alves Monteiro,Rui Alberto Vieira Dias Mingas [pt]


In [22]:
# Preview the last five rows to confirm the trailing rows were trimmed as intended.
df.tail(n=5)

Unnamed: 0,year,anthem,lyricist,composer
Venezuela,1881,Gloria al Bravo Pueblo(Glory to the Brave People),Vicente Salias,Juan José Landaeta
Vietnam,1945,Tiến Quân Ca(Marching Song),Văn Cao,Văn Cao
Yemen,1990,National anthem of Yemen,Abdallah Abdulwahab Noman,Ayoob Tarish
Zambia,1964,"Stand and Sing of Zambia, Proud and Free",collectively,Enoch Sontonga
Zimbabwe,1994,Simudzai Mureza WeZimbabwe(O Lift High The Ban...,Solomon Mutswairo,Fred Changundega


## GUESS THE GENDER

In [23]:
# instantiate the detector (its get_gender method is used below to label first names)
d = gender.Detector()

In [24]:
# this function adds a guessed-gender column for one name column
def guess_col_gender(col, suff='_gender', df=df, d=d):
    """Add a ``{col}{suff}`` column holding the guessed gender of each name in ``df[col]``.

    Parameters
    ----------
    col : str
        Name of the column in ``df`` that holds full names.
    suff : str
        Suffix appended to ``col`` to form the name of the new column.
    df : pandas.DataFrame
        Frame to annotate. It is modified IN PLACE and also returned.
    d : object with a ``get_gender(first_name)`` method
        Detector used to label each first name.

    Returns
    -------
    The same ``df`` object, now carrying the extra column.

    Notes
    -----
    The defaults for ``df`` and ``d`` are bound when this ``def`` is executed,
    so they refer to whatever the globals ``df`` and ``d`` were at that moment;
    re-run this cell (or pass them explicitly) after rebuilding either one.
    """
    def _first_name(value):
        # split() without an argument splits on any whitespace run and ignores
        # leading/trailing whitespace, so ' Ann Lee' or 'Ann\n' still yields
        # 'Ann'. Blank or non-string cells yield '' instead of raising.
        tokens = value.split() if isinstance(value, str) else []
        return tokens[0] if tokens else ''

    # guess a label for every row, in row order
    genders = [d.get_gender(_first_name(value)) for value in df[col].tolist()]
    # store the labels next to the source column
    df[f'{col}{suff}'] = genders

    return df

In [25]:
# Guess genders for lyricists first, then for composers on the resulting frame,
# so df_g ends up with both *_gender columns.
df_g = guess_col_gender('lyricist')
df_g = guess_col_gender('composer', df=df_g)

In [26]:
# Persist the annotated table; with to_csv defaults the country index is written
# as the first column.
df_g.to_csv(path_or_buf='anthems-gender-scrape.csv')