<a href="https://colab.research.google.com/github/marinwitherspoon/English-monarchs/blob/main/Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Scrape every "wikitable" on the Wikipedia list of English monarchs and stack
# the Name / Birth / Death / Claim columns into a single DataFrame, df_all.
from io import StringIO  # read_html expects a file-like object in recent pandas

# url of scrape
url = 'https://en.wikipedia.org/wiki/List_of_English_monarchs'

# Get the HTML content using requests.
# A timeout stops the cell from hanging forever, and raise_for_status() makes
# an HTTP error fail loudly here instead of silently parsing an error page.
response = requests.get(
    url,
    headers={'User-Agent': 'English-monarchs-notebook/1.0 (https://github.com/marinwitherspoon/English-monarchs)'},
    timeout=30,
)
response.raise_for_status()
html_content = response.text

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

all_king_tables = soup.find_all('table', {'class': "wikitable"})

wanted_columns = ['Name', 'Birth', 'Death', 'Claim']

# Collect one frame per table and concatenate once at the end; growing a
# DataFrame inside the loop copies all accumulated rows on every iteration.
frames = []
for table in all_king_tables:
  df = pd.read_html(StringIO(str(table)))[0]

  # Some tables have no 'Claim' column; take what exists and let the final
  # reindex add the missing column as NaN.
  if 'Claim' in df.columns:
    selected_columns = wanted_columns
  else:
    selected_columns = ['Name', 'Birth', 'Death']

  frames.append(df[selected_columns])

if frames:
  df_all = pd.concat(frames, axis=0, ignore_index=True).reindex(columns=wanted_columns)
else:
  # No matching tables on the page: keep the expected (empty) schema.
  df_all = pd.DataFrame(columns=wanted_columns)

#  clean data  ##########################

# Remove the rows at positions 18, 25 and 48, which hold invalid data, then
# renumber the remaining rows 0..n-1 so later positional fixes line up.
invalid_rows = [18, 25, 48]
df_all = df_all.drop(index=invalid_rows).reset_index(drop=True)

# - clean up name column -------------------
# Strip bracketed footnote markers such as "[12]" from the name text.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would make this a silent no-op.
df_all['Name'] = df_all['Name'].str.replace(r'\[.*?\]', '', regex=True)

# Split once, at the first " <digit>" or "[n]" marker: the left part is the
# name, the remainder is kept in 'desc' for the date extraction further down.
# `n` is passed by keyword because it is keyword-only in pandas >= 2.0.
# NOTE(review): `\s\d` consumes the first digit after the name, so 'desc'
# starts one character late - confirm this is intended.
df_all[['Name', 'desc']] = df_all['Name'].str.split(r'\s\d|\[\d+\]', n=1, expand=True)

# Drop repeated words while keeping first-seen order. dict.fromkeys preserves
# insertion order, unlike sorting a set by substring position, which can
# misorder a word that also appears inside an earlier word.
df_all['Name'] = df_all['Name'].apply(lambda name: ' '.join(dict.fromkeys(name.split())))

# - extract reign dates from the description -------------------
# Capture "<year>" optionally followed by "– [day] [month] <year>".
df_all['dates'] = df_all['desc'].str.extract(r'(\s*\d{3,4}\s*(?:–\s*\d{0,2}\s*\w*\s*\d{3,4})?)', expand=False)

# Manual overrides for two rows. .loc writes straight into df_all; the
# chained form df_all['dates'][29] = ... may write to a temporary copy.
df_all.loc[29, 'dates'] = '1307 – 1327'
df_all.loc[3, 'dates'] = '927 – 939'

# Remove days and months so only "<start> – <end>" remains.
# regex=True is explicit because pandas >= 2.0 defaults to literal matching.
df_all['dates'] = df_all['dates'].str.replace(r'\s+\d{0,2}\s+\w*\s+', ' ', regex=True)

# Split each range on the en dash into integer start and end years.
start_years = []
end_years = []
for reign in df_all['dates']:
  date_range = reign.strip().split('–')
  start_years.append(int(date_range[0].strip()))
  end_years.append(int(date_range[1].strip()))

# dtypes deliberately match what the previous row-by-row assignment produced
# (plain Python ints for 'start', float for 'end').
# NOTE(review): the plotting cell feeds differences of these columns to
# T.forward - confirm it accepts numpy integers before changing the dtypes.
df_all['start'] = pd.Series(start_years, index=df_all.index, dtype=object)
df_all['end'] = pd.Series(end_years, index=df_all.index, dtype=float)

# - clean up Birth and Death columns -------------------
# Keep only the first 3- or 4-digit number found in each cell (the year) and
# store it as an integer. Birth is processed first, then Death.
for year_column in ('Birth', 'Death'):
  first_year = df_all[year_column].str.extract(r'(\d{3,4})')[0]
  df_all[year_column] = first_year.astype(int)

# - clean up Claim column -------------------
# Collapse everything between a "/" and the next "of" down to just "of".
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text unchanged.
df_all['Claim'] = df_all['Claim'].str.replace(r'/.*?of', 'of', regex=True)

# Extract the relationship word ending in son/daughter (case-insensitive),
# including any leading word characters and hyphenated prefixes.
df_all['ClaimRelation'] = df_all['Claim'].str.extract(
    r'(\w*(?:-\w+)*(?:Son|Daughter))\s+', flags=re.IGNORECASE, expand=False)

# Extract the name that follows "son of" / "daughter of", allowing a regnal
# numeral or an "of <place>" / "the <epithet>" suffix.
df_all['Claim'] = df_all['Claim'].str.extract(
    r'(?:Son|Daughter)\s+of\s+(\w+\s*(?:[IV]+|of\s+\w+|the\s+\w+)*)', flags=re.IGNORECASE, expand=False)

# Cut trailing text that is glued onto the name: from a capital letter that
# directly follows a character other than a space, 'I' or 'V', to the end.
df_all['Claim'] = df_all['Claim'].str.replace(r'(?<=[^ ])(?<![IV])([A-Z])(.*?)$', '', regex=True)

# Manually correct data. .loc writes straight into df_all; the chained form
# df_all['Claim'][10] = ... may write to a temporary copy.
# NOTE(review): row 10 gets the literal string 'NaN', not a missing value -
# confirm that is what later consumers expect.
df_all.loc[10, 'Claim'] = 'NaN'
df_all.loc[45, 'Claim'] = 'spouse'

In [None]:
!pip3 install ColabTurtlePlus

In [24]:
import ColabTurtlePlus.Turtle as T

# Canvas geometry: one horizontal unit per year, with a 10-unit margin on
# each side of the span from the earliest birth to the latest death.
margin = 10
date_range = max(df_all['Death']) - min(df_all['Birth']) + 2 * margin
y = 1000

T.reset()
screen = T.Screen()
screen.setup(date_range, y)
# World coordinates run from (0, 0) to (date_range, y).
screen.setworldcoordinates(0, 0, date_range, y)

# Move to the starting point near the top-left with the pen up, so the move
# itself leaves no line, then set the pen width.
T.penup()
T.speed(15)
T.goto(margin, y - margin)
T.pensize(10)


# T.pendown()
# T.forward(int(df_all['start'][0]) - int(df_all['Birth'][0]))
# T.color("yellow")
# T.forward(df_all['end'][0] - df_all['start'][0])
# if df_all['Death'][0] - df_all['end'][0] != 0:
#   T.color("black")
#   T.forward(df_all['Death'][0] - df_all['end'][0])
# T.penup()
# T.color("black")
# T.right(90)
# T.forward(15)
# T.right(90)
# T.forward(int(df_all['Death'][0]) - int(df_all['Birth'][0]))
# T.right(180)
# T.forward(df_all['start'][1] - df_all['start'][0])

# Draw one horizontal life-line per monarch, one row below the previous:
#   birth -> reign start : current pen colour (black after the first row)
#   reign start -> end   : orange
#   reign end -> death   : black (skipped when the monarch died in office)
row_gap = 12  # vertical distance between consecutive rows

# Convert to plain Python ints once so T.forward always receives built-in
# numbers rather than numpy scalars.
lifelines = [
    (int(birth), int(start), int(end), int(death))
    for birth, start, end, death in zip(
        df_all['Birth'], df_all['start'], df_all['end'], df_all['Death'])
]

for i, (birth, start, end, death) in enumerate(lifelines):
  T.pendown()
  T.forward(start - birth)
  T.color("orange")
  T.forward(end - start)
  if death != end:
    T.color("black")
    T.forward(death - end)

  # Pen up: step down one row and walk back to this bar's left (birth) end.
  T.penup()
  T.color("black")
  T.right(90)
  T.forward(row_gap)
  T.right(90)
  T.forward(death - birth)
  T.right(180)

  # Each bar starts at the birth year, so shift by the difference in birth
  # years. Shifting by reign-start years would misalign rows on the time axis.
  if i + 1 < len(lifelines):
    T.forward(lifelines[i + 1][0] - birth)