# 1) Data Scraping and Data Cleaning:

 In this notebook, we scrape data about celebrities (actors, actresses, directors, comedians, singers, musicians, dancers, models, writers, playwrights, photographers, journalists, etc.) from Wikipedia. The purpose of this project is to estimate the divorce rate and the life expectancy among celebrities. We also attempt to build models that predict divorce and life expectancy for celebrities based on the extracted features.

Importing required libraries:

In [1]:
import numpy as np
import pandas as pd
import requests
import bs4
import re
import time
import concurrent.futures
pd.set_option('display.max_columns', None)

Defining required functions:

In [87]:
def get_HTML_content(url, timeout=30):

    '''Download a page and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the scraper
        indefinitely.

    Returns
    -------
    bs4.BeautifulSoup or None
        The response body parsed with the 'lxml' parser when the status code
        is 200; ``None`` for every other status code.

    Raises
    ------
    requests.exceptions.RequestException
        Network failures (including timeouts) are not caught here and
        propagate to the caller.
    '''

    response = requests.get(url, timeout=timeout)

    # Any non-200 answer (404 included) is reported to the caller as None.
    if response.status_code != 200:
        return None

    return bs4.BeautifulSoup(response.text, 'lxml')

def get_all_subcategories_for_each_category(url):

    '''Collect the subcategory links listed on a category page.

    Parameters
    ----------
    url : str
        Address of the category page.

    Returns
    -------
    tuple of (list of str, list of str)
        Absolute urls of the subcategories and, in the same order, the text
        of their links. Both lists are empty when the page could not be
        fetched.
    '''

    url_base = "https://en.wikipedia.org"
    soup = get_HTML_content(url)

    url_subcategories = []
    titles_subcategories = []

    # get_HTML_content returns None for any non-200 answer.
    if soup is None:
        return url_subcategories, titles_subcategories

    for section in soup.find_all('div', class_="CategoryTreeSection"):
        anchor = section.find('a')
        href = anchor.get('href') if anchor is not None else None

        # Skip sections without a usable link instead of failing on them.
        if not href:
            continue

        url_subcategories.append(url_base + href)
        titles_subcategories.append(anchor.text)

    return url_subcategories, titles_subcategories

def get_all_pages_for_each_subcategory(url, title):

    '''Follow the "next page" links of a subcategory and collect every page url.

    Parameters
    ----------
    url : str
        Address of the first page of the subcategory.
    title : str
        Link text of the subcategory; "next page" links are looked up by the
        attribute ``title="Category:<title>"``.

    Returns
    -------
    list of str
        The first url followed by the url of every further page that was
        reached. Collection stops at the first page that cannot be fetched,
        has no "mw-pages" section, or offers no "next page" link.
    '''

    url_base = "https://en.wikipedia.org"
    urls_subcategories_all_pages = [url]

    while True:

        soup = get_HTML_content(url)
        if soup is None:
            break

        pages_section = soup.find('div', id="mw-pages")
        if pages_section is None:
            break

        # Keep the last matching "next page" link found in the section.
        row_next = None
        for row in pages_section.find_all(title=f"Category:{title}"):
            if 'next page' in row.text:
                row_next = row

        href = row_next.get('href') if row_next is not None else None
        if not href:
            break

        url = url_base + href

        # Guard against a link that points back to an already visited page,
        # which would otherwise loop forever.
        if url in urls_subcategories_all_pages:
            break

        urls_subcategories_all_pages.append(url)

    return urls_subcategories_all_pages

def get_all_subcategories(url):

    '''Return the urls of all pages of all subcategories of a category.

    Parameters
    ----------
    url : str
        Address of the top-level category page.

    Returns
    -------
    list of str
        Page urls of every subcategory, multi-page subcategories included.
        A subcategory whose pages cannot be collected is skipped.
    '''

    urls_subcategories_all = []
    urls_subcategories, titles_subcategories = get_all_subcategories_for_each_category(url)

    for subcategory_url, subcategory_title in zip(urls_subcategories, titles_subcategories):
        try:
            pages = get_all_pages_for_each_subcategory(subcategory_url, subcategory_title)
        except Exception:
            # Best effort: one failing subcategory must not abort the others.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            continue

        urls_subcategories_all.extend(pages)

    return urls_subcategories_all

def get_the_link_for_profiles(urls):

    '''Extract the profile link and the display name of every listed person.

    Parameters
    ----------
    urls : iterable of str
        Category page urls to read.

    Returns
    -------
    tuple of (list of str, list of str)
        Absolute profile urls and, in the same order, the link text cut at
        the first "(". An entry is skipped only when both its link and its
        name were already collected.
    '''

    url_base = "https://en.wikipedia.org"

    links = []
    names = []

    # Sets mirror the two lists so the duplicate check is O(1) per entry.
    seen_links = set()
    seen_names = set()

    for url in urls:
        soup = get_HTML_content(url)
        if soup is None:
            continue

        pages_section = soup.find('div', {'id': 'mw-pages'})
        if pages_section is None:
            continue

        for row in pages_section.find_all('li'):
            anchor = row.find('a')
            href = anchor.get('href') if anchor is not None else None

            # A list item without a usable link is skipped on its own; it
            # no longer discards the remaining items of the page.
            if not href:
                continue

            name = anchor.text.split('(', 1)[0]
            link = url_base + href

            if link not in seen_links or name not in seen_names:
                links.append(link)
                names.append(name)
                seen_links.add(link)
                seen_names.add(name)

    return links, names

def _first_date_in(row, date_formats):

    '''Return the first substring of the row's HTML matching one of the
        date patterns (tried in order), or NaN when none matches'''

    html = str(row)
    for date_format in date_formats:
        match = re.search(date_format, html)
        if match:
            return match.group()
    return np.nan


def _class_text(row, css_class):

    '''Return the text of the first element with the given class inside
        the row, or NaN when the row or the element is missing'''

    try:
        return row.find(class_=css_class).text
    except AttributeError:
        return np.nan


def get_the_data_from_table(soup, name):

    '''Read birth, death, marriage and children data from a page's infobox.

    Parameters
    ----------
    soup : bs4.BeautifulSoup or None
        Parsed profile page. With ``None`` (or a page without a table of
        class 'infobox biography vcard') every extracted value is NaN.
    name : str
        Fallback name, returned unchanged unless the "Born" row contains a
        non-empty element of class 'nickname'.

    Returns
    -------
    tuple
        ``(row_born, row_died, row_spouse, row_children, name,
        date_of_birth, place_of_birth, date_of_death, place_of_death,
        date_of_marriage, date_of_divorce, number_of_marriage,
        number_of_divorce, number_of_children, marriages_end_in_death,
        name_of_spouse, soup_spouse)``. The four rows are the matching
        infobox rows or ``None``; missing values are ``np.nan``;
        ``soup_spouse`` is the parsed page of the spouse or ``None``.
    '''

    url_base = "https://en.wikipedia.org"

    date_formats = [r"\b[A-Za-z]+\s\d{1,2},\s\d{4}\b",
                    r"\d{1,2}\s[A-Za-z]+\s\d{4}",
                    r"\b\d{4}\b"]

    row_born = None
    row_died = None
    row_spouse = None
    row_children = None

    infobox = None
    if soup is not None:
        infobox = soup.find('table', class_='infobox biography vcard')
    rows = infobox.find_all('tr') if infobox is not None else []

    # Here we get the corresponding information about the Birth, Death,
    # Spouse and Children from the infobox, keyed on the first token of
    # each row's text.
    for row in rows:
        tokens = row.text.split()

        # Rows holding only whitespace have no first token; skipping them
        # keeps the scan going instead of aborting it.
        if not tokens:
            continue

        label = tokens[0]
        if 'Born' in label:
            row_born = row
        elif 'Died' in label:
            row_died = row
        elif 'Spouse' in label:
            row_spouse = row
        elif 'Children' in label:
            row_children = row
            break

    # Prefer the name given in the infobox over the link text.
    nickname = _class_text(row_born, 'nickname')
    if isinstance(nickname, str) and nickname != '':
        name = nickname

    date_of_birth = _first_date_in(row_born, date_formats)
    place_of_birth = _class_text(row_born, 'birthplace')
    date_of_death = _first_date_in(row_died, date_formats)
    place_of_death = _class_text(row_died, 'deathplace')

    name_of_spouse = np.nan
    date_of_marriage = np.nan
    date_of_divorce = np.nan
    number_of_marriage = np.nan
    number_of_divorce = np.nan
    marriages_end_in_death = np.nan

    if row_spouse is not None:

        try:
            name_of_spouse = row_spouse.find('td').text.split('(', 1)[0].strip().replace('\n', '')
        except AttributeError:
            name_of_spouse = np.nan

        marriage_entries = row_spouse.find_all(class_='marriage-display-ws')

        # A spouse row without marriage entries still counts as one marriage.
        number_of_marriage = len(marriage_entries) if marriage_entries else 1

        if marriage_entries:

            # Dates of the first listed marriage.
            first_marriage = marriage_entries[0].text
            years = re.findall(r'\d{4}', first_marriage)

            if years:
                date_of_marriage = years[0]

            # Two years mean the marriage ended; it counts as a divorce
            # unless the entry says it ended with a death.
            if len(years) == 2 and not any(keyword in first_marriage for keyword in ['died', 'death']):
                date_of_divorce = years[1]

            # Divorces = ended marriages minus those ended by a death. The
            # last listed marriage is treated as ended when it shows two
            # years or one of the keywords; otherwise it is still ongoing.
            marriages_end_in_death = row_spouse.text.count('died') + row_spouse.text.count('death')

            last_marriage = marriage_entries[number_of_marriage - 1].text
            years = re.findall(r'\d{4}', last_marriage)

            keywords = ['div', 'sep', 'ann', 'divorced', 'divorce', 'annulled', 'separated', 'died', 'death']

            if len(years) == 2 or any(keyword in last_marriage for keyword in keywords):
                number_of_divorce = number_of_marriage - marriages_end_in_death
            else:
                number_of_divorce = number_of_marriage - marriages_end_in_death - 1

    try:
        number_of_children = row_children.find('td').text.split('[', 1)[0]
    except AttributeError:
        number_of_children = np.nan

    # The spouse's page is fetched only when the first link of the spouse
    # row contains one of the words of the spouse's name after the first.
    soup_spouse = None

    if row_spouse is not None and isinstance(name_of_spouse, str):
        anchors = row_spouse.find_all('a')
        href_spouse = anchors[0].get('href') if anchors else None
        name_parts = name_of_spouse.replace('\u200b', '').split()[1:]

        if href_spouse and any(part in href_spouse for part in name_parts):
            try:
                soup_spouse = get_HTML_content(url_base + href_spouse)
            except Exception:
                # Best effort: a failed download leaves the spouse page out.
                soup_spouse = None

    return row_born, row_died, row_spouse, row_children, name, date_of_birth, place_of_birth, date_of_death, place_of_death, date_of_marriage, date_of_divorce, number_of_marriage, number_of_divorce, number_of_children, marriages_end_in_death, name_of_spouse, soup_spouse


def get_data_for_celebrity_and_spouse(links, names, sex, profession):

    '''Scrape every profile link and return the data as a single dataframe.

    Parameters
    ----------
    links : list of str
        Profile page urls.
    names : list of str
        Fallback names, in the same order as ``links``.
    sex : str
        Value stored in the 'Sex' column of every row.
    profession : str
        Value stored in the 'Profession' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per link with the columns Name, Date_of_Birth,
        Place_of_Birth, Date_of_Death, Place_of_Death, Name_of_Spouse,
        Date_of_Marriage, Date_of_Divorce, Number_of_Marriage,
        Number_of_Divorce, Number_of_Children, Marriages_End_in_Death,
        Date_of_Birth_Spouse, Place_of_Birth_Spouse, Date_of_Death_Spouse,
        Place_of_Death_Spouse, Number_of_Marriage_Spouse,
        Number_of_Divorce_Spouse, Number_of_Children_Spouse,
        Name_of_Spouse_Spouse, Sex and Profession. The date columns of the
        celebrity and the birth/death dates of the spouse are converted
        with ``pd.to_datetime(errors='coerce')``.
    '''

    column_names = ['Name',
                    'Date_of_Birth',
                    'Place_of_Birth',
                    'Date_of_Death',
                    'Place_of_Death',
                    'Name_of_Spouse',
                    'Date_of_Marriage',
                    'Date_of_Divorce',
                    'Number_of_Marriage',
                    'Number_of_Divorce',
                    'Number_of_Children',
                    'Marriages_End_in_Death',
                    'Date_of_Birth_Spouse',
                    'Place_of_Birth_Spouse',
                    'Date_of_Death_Spouse',
                    'Place_of_Death_Spouse',
                    'Number_of_Marriage_Spouse',
                    'Number_of_Divorce_Spouse',
                    'Number_of_Children_Spouse',
                    'Name_of_Spouse_Spouse']

    data = {column: [] for column in column_names}

    max_retries = 3

    # Here we iterate over all the links
    for link, fallback_name in zip(links, names):

        # Retry mechanism to get past request rate limiting. Every failed
        # attempt counts, whether it raised or returned None (non-200
        # status), so the loop always terminates.
        soup = None
        for attempt in range(max_retries):
            try:
                soup = get_HTML_content(link)
            except Exception:
                soup = None

            if soup is not None:
                break

            if attempt < max_retries - 1:
                time.sleep(5)

        # With soup still None the extraction below yields NaN values.
        (_, _, _, _,
         name, date_of_birth, place_of_birth, date_of_death, place_of_death,
         date_of_marriage, date_of_divorce, number_of_marriage,
         number_of_divorce, number_of_children, marriages_end_in_death,
         name_of_spouse, soup_spouse) = get_the_data_from_table(soup, fallback_name)

        # Same extraction on the spouse's page; the name it returns replaces
        # the spouse name read from the celebrity's infobox.
        (_, _, _, _,
         name_of_spouse, date_of_birth_spouse, place_of_birth_spouse,
         date_of_death_spouse, place_of_death_spouse, _, _,
         number_of_marriage_spouse, number_of_divorce_spouse,
         number_of_children_spouse, _,
         name_of_spouse_spouse, _) = get_the_data_from_table(soup_spouse, name_of_spouse)

        row = (name,
               date_of_birth,
               place_of_birth,
               date_of_death,
               place_of_death,
               name_of_spouse,
               date_of_marriage,
               date_of_divorce,
               number_of_marriage,
               number_of_divorce,
               number_of_children,
               marriages_end_in_death,
               date_of_birth_spouse,
               place_of_birth_spouse,
               date_of_death_spouse,
               place_of_death_spouse,
               number_of_marriage_spouse,
               number_of_divorce_spouse,
               number_of_children_spouse,
               name_of_spouse_spouse)

        for column, value in zip(column_names, row):
            data[column].append(value)

    # Scalars are broadcast by pandas to every row.
    data['Sex'] = sex
    data['Profession'] = profession

    df = pd.DataFrame(data)

    date_columns = ["Date_of_Birth",
                    "Date_of_Death",
                    "Date_of_Marriage",
                    "Date_of_Divorce",
                    "Date_of_Birth_Spouse",
                    "Date_of_Death_Spouse"]

    for column in date_columns:
        try:
            df[column] = pd.to_datetime(df[column], errors='coerce')
        except (ValueError, TypeError, OverflowError):
            # Best effort: a column that cannot be converted keeps its raw
            # values and the remaining columns are still processed.
            continue

    return df

def scraper(url, sex, profession, limit=100):

    '''Scrape one category into a single dataframe.

    Parameters
    ----------
    url : str
        Address of the category page.
    sex : str
        Value stored in the 'Sex' column.
    profession : str
        Value stored in the 'Profession' column.
    limit : int or None, optional
        Maximum number of profiles to scrape, taken from the start of the
        collected list. The default of 100 keeps the previous hard-coded
        cap; pass ``None`` to scrape every profile.

    Returns
    -------
    pandas.DataFrame
        The dataframe built by ``get_data_for_celebrity_and_spouse``.
    '''

    urls = get_all_subcategories(url)
    links, names = get_the_link_for_profiles(urls)

    if limit is not None:
        links = links[:limit]
        names = names[:limit]

    return get_data_for_celebrity_and_spouse(links, names, sex, profession)


def scraper_with_multithreading(url_female, url_male, profession):

    '''Scrape the female and the male category of a profession in two
        threads and return both results stacked in one dataframe (female
        rows first, index renumbered from zero)'''

    with concurrent.futures.ThreadPoolExecutor() as pool:
        jobs = [pool.submit(scraper, category_url, sex, profession)
                for category_url, sex in ((url_female, 'F'), (url_male, 'M'))]

    # Leaving the with-block waits for both jobs; result() re-raises any
    # exception raised inside a job.
    frames = [job.result() for job in jobs]

    return pd.concat(frames, ignore_index=True)

def scraper_with_multithreading_using_links(links_female, names_female, links_male, names_male, profession):

    '''Scrape already collected female and male profile links in two
        threads and return both results stacked in one dataframe (female
        rows first, index renumbered from zero)'''

    with concurrent.futures.ThreadPoolExecutor() as pool:
        job_female = pool.submit(get_data_for_celebrity_and_spouse,
                                 links_female, names_female, 'F', profession)
        job_male = pool.submit(get_data_for_celebrity_and_spouse,
                               links_male, names_male, 'M', profession)

    # Leaving the with-block waits for both jobs; result() re-raises any
    # exception raised inside a job.
    frames = [job_female.result(), job_male.result()]

    return pd.concat(frames, ignore_index=True)

## 1.1) Scraping data for models

In [7]:
# Scrape the female and male model categories and print the elapsed
# wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:Female_models_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:Male_models_by_nationality"
models = scraper_with_multithreading(url_female, url_male, 'model')
end = time.time()
elapsed = end - start
print(elapsed)

1955.9075269699097


In [67]:
# Save the scraped models to disk without the dataframe index.
models.to_csv('Models.csv', index=False)

## 1.2) Scraping data for dancers

In [9]:
# Scrape the female and male dancer categories and print the elapsed
# wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:Female_dancers_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:Male_dancers_by_nationality"
dancers = scraper_with_multithreading(url_female, url_male, 'dancer')
end = time.time()
elapsed = end - start
print(elapsed)

554.0225760936737


In [68]:
# Save the scraped dancers to disk without the dataframe index.
dancers.to_csv('Dancers.csv', index=False)

## 1.3) Scraping data for journalists

In [11]:
# Scrape the women and male journalist categories and print the elapsed
# wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:Women_journalists_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:Male_journalists_by_nationality"
journalists = scraper_with_multithreading(url_female, url_male, 'journalist')
end = time.time()
elapsed = end - start
print(elapsed)

2804.695547103882


In [69]:
# Save the scraped journalists to disk without the dataframe index.
journalists.to_csv('Journalists.csv', index=False)

## 1.4) Scraping data for actors and actresses

In [13]:
# Scrape the film actress and male film actor categories and print the
# elapsed wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:Film_actresses_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:Male_film_actors_by_nationality"
actors_actress = scraper_with_multithreading(url_female, url_male, 'actor/actress')
end = time.time()
elapsed = end - start
print(elapsed)

9560.27544093132


In [70]:
# Save the scraped actors and actresses to disk without the dataframe index.
actors_actress.to_csv('Actors-actress.csv', index=False)

## 1.5) Scraping data for singers

In [15]:
# Scrape the 21st-century women and male singer categories and print the
# elapsed wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:21st-century_women_singers_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:21st-century_male_singers_by_nationality"
singers = scraper_with_multithreading(url_female, url_male, 'singer')
end = time.time()
elapsed = end - start
print(elapsed)

3048.709326028824


In [71]:
# Save the scraped singers to disk without the dataframe index.
singers.to_csv('Singers.csv', index=False)

## 1.6) Scraping data for writers

In [17]:
# Scrape the 21st-century women and male writer categories and print the
# elapsed wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:21st-century_women_writers_by_nationality"
# The male url previously pointed at the male *singers* category, so the
# male rows labelled as writers were actually singers.
# NOTE(review): confirm that this category page exists on Wikipedia.
url_male = "https://en.wikipedia.org/wiki/Category:21st-century_male_writers_by_nationality"
writers = scraper_with_multithreading(url_female, url_male, 'writers')
end = time.time()
elapsed = end - start
print(elapsed)

3611.731799840927


In [18]:
# Save the scraped writers to disk without the dataframe index.
writers.to_csv('Writers.csv', index=False)

## 1.7) Scraping data for musicians

In [21]:
# Scrape the 21st-century women and male musician categories and print the
# elapsed wall-clock time in seconds.
start = time.time()
# Both urls previously pointed at other professions (women writers and male
# singers), so the rows labelled as musicians were not musicians.
# NOTE(review): confirm that both category pages exist on Wikipedia.
url_female = "https://en.wikipedia.org/wiki/Category:21st-century_women_musicians_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:21st-century_male_musicians_by_nationality"
musicians = scraper_with_multithreading(url_female, url_male, 'musician')
end = time.time()
elapsed = end - start
print(elapsed)

4816.020472049713


In [23]:
# Save the scraped musicians to disk without the dataframe index.
musicians.to_csv('Musicians.csv', index=False)

## 1.8) Scraping data for comedians

In [24]:
# Scrape the women and male comedian categories and print the elapsed
# wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:Women_comedians_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:Male_comedians_by_nationality"
comedians = scraper_with_multithreading(url_female, url_male, 'comedians')
end = time.time()
elapsed = end - start
print(elapsed)

929.846773147583


In [25]:
# Save the scraped comedians to disk without the dataframe index.
comedians.to_csv('Comedians.csv', index=False)

## 1.9) Scraping data for playwrights

In [26]:
# Scrape the women and male dramatist/playwright categories and print the
# elapsed wall-clock time in seconds.
start = time.time()
url_female = "https://en.wikipedia.org/wiki/Category:Women_dramatists_and_playwrights_by_nationality"
url_male = "https://en.wikipedia.org/wiki/Category:Male_dramatists_and_playwrights_by_nationality"
playwrights = scraper_with_multithreading(url_female, url_male, 'playwrights')
end = time.time()
elapsed = end - start
print(elapsed)

1091.908532857895


In [27]:
# Save the scraped playwrights to disk without the dataframe index.
playwrights.to_csv('Playwrights.csv', index=False)

## 1.10) Scraping data for directors

In [28]:
# Scrape film directors. Only a women's category and an all-directors
# category are used here, so the male list is derived as "everyone in the
# total list whose link is not in the women's list".
start = time.time()
url_female = 'https://en.wikipedia.org/wiki/Category:Women_film_directors'
url_total = 'https://en.wikipedia.org/wiki/Category:Film_directors_by_nationality'
urls = get_all_subcategories(url_total)
links_total, names_total = get_the_link_for_profiles(urls)

urls = get_all_subcategories(url_female)
links_female, names_female = get_the_link_for_profiles(urls)

# A set makes each membership test O(1) instead of a scan of the list.
links_female_lookup = set(links_female)

links_male = []
names_male = []

for link, name in zip(links_total, names_total):
    if link not in links_female_lookup:
        links_male.append(link)
        names_male.append(name)

directors = scraper_with_multithreading_using_links(links_female, names_female, links_male, names_male, 'director')

end = time.time()
elapsed = end - start
print(elapsed)

3753.587508916855


In [29]:
# Save the scraped directors to disk without the dataframe index.
directors.to_csv('Directors.csv', index=False)

## 1.11) Scraping data for photographers

In [30]:
# Scrape photographers. Only a women's category and an all-photographers
# category are used here, so the male list is derived as "everyone in the
# total list whose link is not in the women's list".
start = time.time()
url_female = 'https://en.wikipedia.org/wiki/Category:Women_photographers_by_nationality'
url_total = 'https://en.wikipedia.org/wiki/Category:Photographers_by_nationality'
urls = get_all_subcategories(url_total)
links_total, names_total = get_the_link_for_profiles(urls)

urls = get_all_subcategories(url_female)
links_female, names_female = get_the_link_for_profiles(urls)

# A set makes each membership test O(1) instead of a scan of the list.
links_female_lookup = set(links_female)

links_male = []
names_male = []

for link, name in zip(links_total, names_total):
    if link not in links_female_lookup:
        links_male.append(link)
        names_male.append(name)

# The profession label was previously 'director' (copied from the directors
# cell), which mislabelled every photographer row.
photographers = scraper_with_multithreading_using_links(links_female, names_female, links_male, names_male, 'photographer')

end = time.time()
elapsed = end - start
print(elapsed)

1342.5910720825195


In [31]:
# Save the scraped photographers to disk without the dataframe index.
photographers.to_csv('Photographers.csv', index=False)