<a href="https://colab.research.google.com/github/marinwitherspoon/English-monarchs/blob/main/Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = 'https://en.wikipedia.org/wiki/List_of_English_monarchs'

# Fetch the page. A timeout stops the cell from hanging forever, and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the HTML and collect every table carrying the "wikitable" class
soup = BeautifulSoup(html_content, 'html.parser')
all_king_tables = soup.find_all('table',{'class':"wikitable"})

# 'Claim' is optional: only some of the tables have that column.
required_columns = ['Name', 'Birth', 'Death']
output_columns = required_columns + ['Claim']

# Collect one frame per table and concatenate once at the end, rather than
# growing df_all inside the loop.
table_frames = []
for table in all_king_tables:
  # read_html expects a file-like object; passing literal HTML is deprecated.
  table_df = pd.read_html(StringIO(str(table)))[0]

  if 'Claim' in table_df.columns:
    selected_columns = output_columns
  else:
    selected_columns = required_columns

  # Selecting by name raises KeyError if a table lacks a required column.
  table_frames.append(table_df[selected_columns])

if not table_frames:
  raise ValueError('No "wikitable" tables found at ' + url)

# Tables without a 'Claim' column get NaN there; reindex pins the column order.
df_all = pd.concat(table_frames, axis=0, ignore_index=True).reindex(columns=output_columns)

#  clean data  ##########################

# Delete rows with invalid data, then renumber the rows.
# NOTE(review): these are hard-coded row labels of the scraped table; they
# presumably need re-checking whenever the source page changes.
df_all = df_all.drop([18, 25, 48]).reset_index(drop=True)

# - clean up name column -------------------
# Remove bracketed footnote markers from the name text.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would make this replacement a no-op.
df_all['Name'] = df_all['Name'].str.replace(r'\[.*?\]', '', regex=True)

# Split each cell once, at the first "whitespace + digit" (or footnote marker):
# the left part is the name, the right part is the reign description.
# NOTE: the matched digit is consumed by the split, so 'desc' starts one
# character into the first number. The date extraction and the manual
# overrides below operate on that output, so the pattern is left unchanged.
df_all[['Name', 'desc']] = df_all['Name'].str.split(r'\s\d|\[\d+\]', n=1, expand=True)

# Drop repeated words from the name, keeping the first occurrence of each word
# in its original position.
df_all['Name'] = df_all['Name'].apply(lambda name: ' '.join(dict.fromkeys(name.split())))

# Extract the reign dates from the description
df_all['dates'] = df_all['desc'].str.extract(
    r'(\s*\d{3,4}\s*(?:–\s*\d{0,2}\s*\w*\s*\d{3,4})?)', expand=False)
# Manual overrides for two rows (.loc writes into df_all itself; chained
# indexing such as df_all['dates'][29] may write to a temporary copy).
df_all.loc[29, 'dates'] = '1307 – 1327'
df_all.loc[3, 'dates'] = '927 – 939'

# Remove days and months so only "<year> – <year>" remains
df_all['dates'] = df_all['dates'].str.replace(r'\s+\d{0,2}\s+\w*\s+', ' ', regex=True)

# Split the range on the en dash into the first and last year of the reign.
reign_years = df_all['dates'].str.strip().str.split('–', expand=True)
df_all['start'] = reign_years[0].str.strip().astype(int)
# 'end' is kept as float, matching the dtype this column had before
# (displayed as e.g. 899.0) for the cells that consume it.
df_all['end'] = reign_years[1].str.strip().astype(float)

# - clean up Birth column -------------------
# Keep only the first 3-4 digit number (the year)
df_all['Birth'] = df_all['Birth'].str.extract(r'(\d{3,4})', expand=False).astype(int)

# - clean up Death column -------------------
df_all['Death'] = df_all['Death'].str.extract(r'(\d{3,4})', expand=False).astype(int)

# - clean up Claim column -------------------
df_all['Claim'] = df_all['Claim'].str.replace(r'/.*?of', 'of', regex=True)
# Extract the relationship word (e.g. "Son", or a hyphenated form ending in
# Son/Daughter), case-insensitively
df_all['ClaimRelation'] = df_all['Claim'].str.extract(
    r'(\w*(?:-\w+)*(?:Son|Daughter))\s+', flags=re.IGNORECASE, expand=False)
# Extract the name that follows "Son of" / "Daughter of", then trim trailing
# text that starts at a capital letter glued to the preceding character
df_all['Claim'] = df_all['Claim'].str.extract(
    r'(?:Son|Daughter)\s+of\s+(\w+\s*(?:[IV]+|of\s+\w+|the\s+\w+)*)', flags=re.IGNORECASE, expand=False)
df_all['Claim'] = df_all['Claim'].str.replace(r'(?<=[^ ])(?<![IV])([A-Z])(.*?)$', '', regex=True)

# Manually correct data
# NOTE(review): 'NaN' here is the literal string, not a missing value —
# confirm whether a real missing marker was intended.
df_all.loc[10, 'Claim'] = 'NaN'
df_all.loc[45, 'Claim'] = 'spouse'

# Sort the table by the first year of reign. A stable sort keeps monarchs who
# share a start year in their original table order, so the result is
# deterministic.
df_all = df_all.sort_values(by='start', kind='stable').reset_index(drop=True)

df_all.loc[0, 'Name'] = 'Alfred the Great'

In [2]:
!pip3 install ColabTurtlePlus

Collecting ColabTurtlePlus
  Downloading ColabTurtlePlus-2.0.1-py3-none-any.whl (31 kB)
Installing collected packages: ColabTurtlePlus
Successfully installed ColabTurtlePlus-2.0.1


In [3]:
import ColabTurtlePlus.Turtle as T

def sqline(length, depth, color):
  """Draw a filled bar and advance the turtle to its far end.

  The bar is `length` units long in the direction the turtle is facing and
  `depth` units across, centred on the turtle's starting position. The
  outline is traced by turning left once and then right at every corner, so
  the turtle finishes the outline back at its starting point with its
  original heading, and is then moved `length` units forward.

  length: extent of the bar along the current heading.
  depth:  extent of the bar across the current heading.
  color:  value passed to T.color() before filling.
  """
  half_depth = depth/2

  T.color(color)
  T.begin_fill()

  # Step sideways to one long edge of the bar.
  T.left(90)
  T.forward(half_depth)

  # Trace the outline: long edge, full width, long edge back, half width
  # back to the centre line.
  for side in (length, depth, length, half_depth):
    T.right(90)
    T.forward(side)

  # Face the original heading again, close the fill, and move to the bar's end.
  T.right(90)
  T.end_fill()
  T.forward(length)

Put clearscreen() as the first line in a cell (after the import command) to re-run turtle commands in the cell


In [21]:
import ColabTurtlePlus.Turtle as T
# Horizontal extent of the drawing: the span from the earliest birth year to
# the latest death year, plus 20.
earliest_birth = min(df_all['Birth'])
latest_death = max(df_all['Death'])
date_range = latest_death - earliest_birth + 20

# Canvas height
y = 600

# Start from a fresh turtle state, then set the screen size and the world
# coordinate system.
T.reset()
screen = T.Screen()
screen.setup(date_range, y)
screen.setworldcoordinates(0, 0, date_range+100, y+10)

# Lift the pen for positioning moves and set the drawing speed.
# NOTE(review): confirm 15 is within the speed range the library accepts.
T.penup()
T.speed(15)

#### make time line
# T.goto(10, 10)
# T.left(90)
# T.pendown()

# for i in range(9,10 + round(date_range/100)):
#   T.pensize(2)
#   T.color('dark grey')
#   T.goto(i*100-int(df_all['start'][0]), 10)
#   T.write(i*100, font = ('Arial', 8, 'normal'))
#   T.pendown()
#   T.forward(y)
#   T.penup()
#   for j in range(1,4):
#     T.color('grey')
#     if j == 2:
#       T.pensize(1)
#     else:
#       T.pensize(.5)
#     T.goto(i*100+25*j-int(df_all['start'][0]), 10)
#     T.write(i*100+25*j, font = ('Arial', 8, 'normal'))
#     T.pendown()
#     T.forward(y)
#     T.penup()
# -----------------------------------
T.goto(10, y-10)
T.pensize(1)

BAR_DEPTH = 8      # thickness of each life/reign bar
ROW_SPACING = 10   # vertical distance between consecutive monarchs
LEFT_MARGIN = 10   # horizontal offset of the earliest birth year
LABEL_GAP = 10     # gap between the end of a bar and the name label

# Measure x positions from the earliest birth year, the same reference that
# the canvas width (date_range) is computed from.
birth_origin = min(df_all['Birth'])

# Walk the rows positionally so the loop does not depend on the index labels.
timeline_rows = zip(df_all['Name'], df_all['Birth'], df_all['Death'],
                    df_all['start'], df_all['end'])

for row_number, (name, birth, death, start, end) in enumerate(timeline_rows):
  # Work with plain Python ints so every turtle call receives the same type.
  birth, death, start, end = int(birth), int(death), int(start), int(end)

  T.goto(birth - birth_origin + LEFT_MARGIN, y - ROW_SPACING*row_number)
  T.pendown()
  # Life before the reign, the reign itself, then life after the reign;
  # each sqline call ends where the next segment begins.
  sqline(start - birth, BAR_DEPTH, 'black')
  sqline(end - start, BAR_DEPTH, 'orange')
  if death != end:
    sqline(death - end, BAR_DEPTH, 'black')
  T.penup()
  # Leave a small gap, then label the row with the monarch's name.
  T.forward(LABEL_GAP)
  T.color("black")
  T.write(name, font = ('Arial', 8, 'normal'))

# add connection
# change claim to generation count
# if string contains "(1st reign)" look for others and loop through once more

In [8]:
# Show a bounded preview instead of rendering the whole table
df_all.head(10)

Unnamed: 0,Name,Birth,Death,Claim,desc,dates,start,end,ClaimRelation
0,Alfred the Great (King of Wessex from,849,899,Æthelwulf of Wessex,71) c. 886 – 26 October 899 (13 years),886 – 899,886,899.0,Son
1,Edward the Elder,874,924,Alfred,"6 October 899 – 17 July 924 (24 years, 266 days)",899 – 924,899,924.0,Son
2,Ælfweard c.,901,924,Edward the Elder,7 July 924 – 2 August 924 (16 days),924 – 924,924,924.0,Son
3,Æthelstan the Glorious,894,939,Edward the Elder,24 King of the Anglo-Saxons (924–927) – King o...,927 – 939,927,939.0,Son
4,Edmund I the Magnificent,921,946,Edward the Elder,"7 October 939 – 26 May 946 (6 years, 212 days)",939 – 946,939,946.0,Son
5,Eadred,923,955,Edward the Elder,"6 May 946 – 23 November 955 (9 years, 182 days)",946 – 955,946,955.0,Son
6,Eadwig All-Fair,940,959,Edmund I,"3 November 955 – 1 October 959 (3 years, 313 d...",955 – 959,955,959.0,Son
7,Edgar the Peaceful,943,975,Edmund I,"October 959 – 8 July 975 (15 years, 281 days)",959 – 975,959,975.0,Son
8,Edward the Martyr,962,978,Edgar the Peaceful,"July 975 – 18 March 978 (2 years, 254 days)",975 – 978,975,978.0,Son
9,(1st reign) Æthelred the Unready,966,1016,Edgar the Peaceful,8 March 978 – 1013 (34–35 years),978 – 1013,978,1013.0,Son
