### Load all of the packages

In [1]:
from bs4 import BeautifulSoup
import requests
import sys
import pandas as pd
from geopy.geocoders import Nominatim

### Download the HTML of a Search-Results Page for a Given Name

In [2]:
def get_file(first_name="George", last_name="Washington", page="1", timeout=30):
    """Download one page of Find a Grave memorial-search results for a name.

    Args:
        first_name: Value sent as the ``firstname`` query parameter.
        last_name: Value sent as the ``lastname`` query parameter.
        page: Result page to request (string or int), sent as ``page``.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The ``requests`` response object for the page (status code 200).

    Raises:
        SystemExit: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection errors, and now timeouts).
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    # Percent-encode the caller-supplied parts so that names containing
    # spaces, "&", "#", "=" or non-ASCII letters cannot break or rewrite
    # the query string. Plain names such as "George" are left unchanged.
    first = quote_plus(str(first_name))
    last = quote_plus(str(last_name))
    page_number = quote_plus(str(page))

    # define search url based on user input
    url = (
        f"https://www.findagrave.com/memorial/search?firstname={first}&"
        f"middlename=&lastname={last}&birthyear=&birthyearfilter=&"
        "deathyear=&deathyearfilter=&location=&locationId=&memorialid=&mcid=&"
        f"linkedToName=&datefilter=&orderby=b&plot=&page={page_number}#sr-1075")

    # Without a timeout requests can block indefinitely on a stalled server.
    page_HTML = requests.get(url, timeout=timeout)  # HTML from URL

    # successful url request code is 200
    if page_HTML.status_code != 200:
        sys.exit("Connection Failed.")  # stop execution and return error

    return page_HTML


### Go Through All Entries for a Certain Name and Return a Dataframe

In [4]:
def _tag_text(tag):
    """Return the single string held by *tag* as a plain ``str``, or None.

    None is returned when *tag* itself is None or when ``tag.string`` is None.
    """
    if tag is None or tag.string is None:
        return None
    # str() yields an ordinary string instead of a BeautifulSoup string
    # object, so the stored value does not keep the parsed page alive.
    return str(tag.string)


def _parse_memorial(grave_info, first_name, last_name):
    """Convert one "memorial-item" tag into a row dict.

    Returns None when the entry has no usable name text or the name does not
    contain both *first_name* and *last_name*.
    """
    # First check name tag exists, contains text and includes both names
    name = _tag_text(grave_info.find("i"))
    if name is None or first_name not in name or last_name not in name:
        return None

    # Birth/death text; missing, empty or "unknown" dates are recorded as "NA"
    date = _tag_text(grave_info.find(class_="birthDeathDates"))
    if date is None or "unknown" in date:
        date = "NA"

    # Cemetery address from <p class="addr-cemet">; "NA" when the tag is absent
    address_tag = grave_info.find("p", attrs={"class": "addr-cemet"})
    if address_tag is None:
        grave_address = "NA"
    else:
        # May be None when the tag has no single string; such rows are
        # removed by dropna() in get_info, exactly as before.
        grave_address = _tag_text(address_tag)

    # Split "birth–death" at the first en dash. A date without the dash
    # (e.g. "NA") gets a None death date, so dropna() discards the row.
    birth, separator, death = date.partition("–")

    return {
        "Names": name,
        "Dates": date,
        "Location of Grave": grave_address,
        "Birth Date": birth,
        "Death Date": death if separator else None,
    }


def get_info(first_name="George", last_name="Washington"):
    """Collect every search result for a name into a dataframe.

    Walks all result pages returned by ``get_file`` and keeps the entries
    whose name text contains both *first_name* and *last_name*.

    Args:
        first_name: First name to search for and to require in each entry.
        last_name: Last name to search for and to require in each entry.

    Returns:
        A ``pandas.DataFrame`` with the columns "Names", "Dates",
        "Location of Grave", "Birth Date" and "Death Date". Rows containing
        any missing value are dropped. The frame is empty (but keeps these
        columns) when nothing matches.
    """
    # get HTML of the first page and parse it into Beautiful Soup format
    first_page = get_file(first_name, last_name, "1")
    parsed = BeautifulSoup(first_page.content, "html.parser")

    # The element with id "gotoPage" carries the page count in its "max"
    # attribute. Fall back to a single page when it is missing
    # (presumably because the results fit on one page -- TODO confirm).
    pager = parsed.find(id="gotoPage")
    if pager is not None and pager.has_attr("max"):
        max_page = int(pager["max"])
    else:
        max_page = 1

    # Rows are gathered in a list and turned into a dataframe once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # table on every call.
    rows = []

    # iterate through each page
    for page_number in range(1, max_page + 1):
        # Page 1 is already parsed above; only download the later pages.
        if page_number > 1:
            page = get_file(first_name, last_name, str(page_number))
            parsed = BeautifulSoup(page.content, "html.parser")

        # Every tag with class "memorial-item" describes one grave entry
        for grave_info in parsed.find_all(class_="memorial-item"):
            row = _parse_memorial(grave_info, first_name, last_name)
            if row is not None:
                rows.append(row)

    data_table = pd.DataFrame(
        rows,
        columns=["Names", "Dates", "Location of Grave",
                 "Birth Date", "Death Date"],
    )

    # get rid of any NA values
    return data_table.dropna()

In [5]:
#Scrape all result pages using get_info's default name (George Washington)
data = get_info()


In [6]:
#grab only the years from the dates
#Strip surrounding whitespace first so both columns end up as bare 4-character
#years; the previous 5-character slice kept a padding character in the birth
#year (presumably the space before the "–" separator).
data["Death Date"] = data["Death Date"].str.strip().str[-4:]
data["Birth Date"] = data["Birth Date"].str.strip().str[-4:]


In [27]:
#count how many records share each "Birth Date" value (one count per year)
birth_stats = data["Birth Date"].value_counts()

In [28]:
#The counts are a Series whose index holds the counted values. Turn it into a
#dataframe and move that index into an ordinary column in one chained step.
birth_stats = birth_stats.to_frame().reset_index()


In [29]:
#rename columns for better clarity
#Names are assigned by position because the labels produced by value_counts()
#followed by reset_index() depend on the pandas version: "index"/"Birth Date"
#before 2.0, "Birth Date"/"count" from 2.0 on. A label-based rename would
#mislabel the year column on newer versions and leave no "Year" column.
birth_stats.columns = ["Year", "Birth Count"]

In [30]:
#keep only the rows whose Year is purely numeric (optional surrounding spaces)
#This drops anything containing a letter, as before, and also drops empty or
#otherwise non-numeric values that would make the integer conversion fail.
birth_stats = birth_stats[birth_stats["Year"].str.match(r"^\s*\d+\s*$", na=False)]

In [31]:
#convert all of the Year and count data to integer
#(astype(int) raises ValueError if any remaining value is not numeric)
birth_stats = birth_stats.astype(int)

In [16]:
#Write the dataframe to a csv file: header row included, index column left out
#(need to change as this isn't the edited one)
data.to_csv(
    r'/home/phillip/github/deceased-george-washington-stats/DataFrames/test1.csv',
    header=True,
    index=False,
)