# Web Scraping Wikipedia for Birth and Death dates

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from difflib import SequenceMatcher
from unidecode import unidecode
from datetime import datetime

In [None]:
# Wikipedia page that lists composers alphabetically by name.
composers_url = 'https://en.wikipedia.org/wiki/List_of_composers_by_name'

# A timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the later
# cells quietly parse an error page.
page = requests.get(composers_url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup
# pick whichever parser happens to be installed, so results could vary by
# machine.
composers_full_page = BeautifulSoup(page.text, 'html.parser')

### Finding elements on Wikipedia page based on html tag

In [None]:
# Collect every <div> whose class list includes "div-col".
comp_main_section = composers_full_page.find_all(name='div', attrs={'class': 'div-col'})

In [None]:
# Flatten the <li> elements of every matched section into one list,
# keeping the order in which they appear on the page.
comp_list_items = [
    entry
    for section in comp_main_section
    for entry in section.find_all('li')
]

### Extracting strings (name + years)

In [None]:
# Visible text of each list item, with surrounding whitespace removed.
composer_list_strings = []
for entry in comp_list_items:
    composer_list_strings.append(entry.get_text().strip())

## Manipulating extracted strings

### Separating names from years

In [None]:
# Split each entry at its first '(': the text before it is the composer's name
# (transliterated to ASCII), everything from the '(' onwards is kept as the raw
# "years" text. Splitting once handles any number of parenthesised groups, so
# entries with three or more groups no longer lose their years.
name_strings = []
year_strings = []

for entry in composer_list_strings:
    name_part, paren, remainder = entry.partition('(')
    name_strings.append(unidecode(name_part.strip()))
    # "ERROR" marks an entry with no parenthesised part at all; it contains no
    # digits, so no year can be extracted from it.
    year_strings.append(paren + remainder if paren else "ERROR")

### Extracting only numerical characters from 'year' strings

In [None]:
# Will hold the digits-only form of each entry's year text.
numbers_only = list()

In [None]:
# Drop bracketed footnote markers such as "[2]" first, so their digits are not
# fused onto a year, then strip every remaining non-digit character.
# The list is rebuilt (not appended to) so re-running the cell cannot
# duplicate entries.
numbers_only = [
    re.sub(r"\D", "", re.sub(r"\[[^\]]*\]", "", years))
    for years in year_strings
]

### Converting century format (12th) to 1200

In [None]:
# A two-digit value is a century number (e.g. "12" from "12th");
# pad it with "00" to get a four-digit year. The list is updated in place.
numbers_only[:] = [
    digits + "00" if len(digits) == 2 else digits
    for digits in numbers_only
]

### Converting the single B.C. date, 128 B.C. (Limenius), to 0128

In [None]:
# Limenius is dated 128 B.C.; store the year zero-padded to four digits so the
# four-character split further down reads it as a single year.
# The entry is located by name: a fixed list position would overwrite a
# different composer (or raise IndexError) as soon as the Wikipedia list gains
# or loses an entry.
for position, name in enumerate(name_strings):
    if name.strip().lower().startswith('limenius'):
        numbers_only[position] = '0128'

### Converting century+footnote 12th[2] to 1200

In [None]:
# A three-digit value is treated as a century number followed by one footnote
# digit: keep the first two digits and pad with "00". Updated in place.
numbers_only[:] = [
    digits[:2] + "00" if len(digits) == 3 else digits
    for digits in numbers_only
]

### Converting multiple years (more than 8 characters) to only the first and last

In [None]:
# Anything longer than eight digits holds more than two years:
# keep only the first four and the last four digits. Updated in place.
numbers_only[:] = [
    digits[:4] + digits[-4:] if len(digits) > 8 else digits
    for digits in numbers_only
]

### Splitting numbers based on number of characters

In [None]:
# Cut each digit string into consecutive four-character groups;
# a trailing remainder shorter than four characters is discarded by the regex.
four_chars = re.compile('....')
split_string = []
for digits in numbers_only:
    split_string.append(four_chars.findall(digits))

### Creating lists for year of birth and year of death

In [None]:
# Parallel lists: one birth year and one death year per composer entry.
year_birth, year_death = [], []

In [None]:
# One group  -> birth year only; two groups -> birth and death year;
# any other count -> both left as empty strings.
for groups in split_string:
    group_count = len(groups)
    born = groups[0] if group_count in (1, 2) else ''
    died = groups[1] if group_count == 2 else ''
    year_birth.append(born)
    year_death.append(died)
        

In [None]:
# Sanity check: both lists should have the same length.
print(f"{len(year_birth)} {len(year_death)}")

### Creating dataframe and populating with name, year of birth, year of death, and link

In [None]:
from urllib.parse import urljoin

# Link for each entry: the first hyperlink inside its <li>, made absolute
# against the list page URL. Entries without any link get an empty string so
# the list stays aligned with the name and year lists.
composer_list_links = []
for item in comp_list_items:
    anchor = item.find('a', href=True)
    composer_list_links.append(urljoin(composers_url, anchor['href']) if anchor else '')

# One row per list entry: name, birth year, death year and article link.
wiki_data = pd.DataFrame(
    list(zip(name_strings, year_birth, year_death, composer_list_links)),
    columns=['Name', 'Birth', 'Death', 'Links'],
)

## Cross-referencing names from database with Wikipedia list of composers

### Loading data and extracting only the composer name

In [None]:
# Load the programme/works spreadsheet from the working directory.
composers_data = pd.read_excel(io="./programs_works.xlsx")

In [None]:
# Keep each composer name once, then renumber the rows from 0.
unique_names = composers_data['nameComposer'].drop_duplicates()
composers_only_data = unique_names.reset_index(drop=True)

In [None]:
# Wrap the de-duplicated names in a DataFrame.
composers_only_data = pd.DataFrame(data=composers_only_data)

### Creating lists for the names in the original set and the Wikipedia extracted set

In [None]:
# Names scraped from Wikipedia, as a plain Python list.
wiki_list = wiki_data.loc[:, 'Name'].tolist()

In [None]:
# Composer names from the spreadsheet, as a plain Python list.
data_list = composers_only_data.loc[:, 'nameComposer'].tolist()

In [None]:
# Lower-case the spreadsheet names so the comparison ignores case.
lowered_data_names = []
for name in data_list:
    lowered_data_names.append(name.lower())
data_list = lowered_data_names

In [None]:
# Lower-case the Wikipedia names so the comparison ignores case.
lowered_wiki_names = []
for name in wiki_list:
    lowered_wiki_names.append(name.lower())
wiki_list = lowered_wiki_names

### Comparing each name on our data with each name on the Wikipedia list of composers

In [None]:
# One row per spreadsheet name, filled in by the matching loop.
result_list = []
# Number of spreadsheet names with no Wikipedia match.
false_count = 0
# Match flag read by the matching loop; it must exist before the first
# iteration, otherwise an unmatched first name raises NameError.
found = False
        

In [None]:
# For every spreadsheet name, take the first Wikipedia name that contains it
# or is contained in it. Each result row is
# [spreadsheet name, spreadsheet index, wiki name, wiki index];
# unmatched names get "NOT FOUND" and an empty wiki index.
# NOTE(review): substring matching accepts the first hit, which may be a
# different composer sharing part of the name — confirm this is acceptable.
for b, d in enumerate(data_list):
    # Reset per name, so the check below never reads an undefined flag on the
    # first iteration or a stale one from the previous name.
    found = False
    for c, w in enumerate(wiki_list):
        if d in w or w in d:
            result_list.append([d, b, w, c])
            found = True
            break
    if not found:
        # Same four-field shape as a matched row.
        result_list.append([d, b, "NOT FOUND", ""])

        #Debugging falses
        print(found, b, d)

        false_count += 1

### Using index to retrieve Birth and Death year from the previously created lists (year_birth and year_death)

In [None]:
# Birth year for each result row, looked up by the matched Wikipedia index;
# rows with an empty index (no match) get an empty string.
year_birth_list = [
    year_birth[row[3]] if row[3] != "" else ''
    for row in result_list
]

In [None]:
# Death year for each result row, looked up by the matched Wikipedia index;
# rows with an empty index (no match) get an empty string.
year_death_list = [
    year_death[row[3]] if row[3] != "" else ''
    for row in result_list
]

### Creating new data frame to include birth and death dates

In [None]:
# Start the output table from the de-duplicated composer names.
composers_years_data = pd.DataFrame(data=composers_only_data)

In [None]:
# Add the birth years as a column; year_birth_list holds one value per row of
# result_list, so its length must equal the number of rows in the table.
composers_years_data['Birth'] = year_birth_list

In [None]:
# Add the death years as a column; year_death_list holds one value per row of
# result_list, so its length must equal the number of rows in the table.
composers_years_data['Death'] = year_death_list

In [None]:
# Write the table (including its index column) to a CSV file in the working directory.
composers_years_data.to_csv(path_or_buf='composers_years_data.csv')