![My Image](MacBook.jpg)

### Scraping data from a website.

In [25]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "https://www.ucl.ac.uk/governance-compliance/academic-structure/academic-units"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')


def _is_faculty_link(a):
    """Return True when the whole text of link `a` is bold.

    The parser treats such links as faculty headings. Two markup variants are
    accepted: a <strong> nested inside the link, or the link nested inside a
    <strong>. In both cases the bold text must equal the full link text, so a
    link that is only partly bold is not treated as a heading.
    """
    link_text = a.text.strip()
    inner_strong = a.find('strong')
    if inner_strong is not None and inner_strong.text.strip() == link_text:
        return True
    outer_strong = a.find_parent('strong')
    return outer_strong is not None and outer_strong.text.strip() == link_text


def _iter_unit_links(soup):
    """Yield every <a> tag to be classified, in the order the scrape visits them.

    First the links of the first <p> of each 'middle-split__column2' section,
    then the links of every <p> of each 'middle-split__column1' section.
    """
    for section in soup.find_all('section', class_='middle-split__column2'):
        first_p = section.find('p')
        # A section without any paragraph has nothing to contribute.
        if first_p is None:
            continue
        yield from first_p.find_all('a')
    for section in soup.find_all('section', class_='middle-split__column1'):
        for p in section.find_all('p'):
            yield from p.find_all('a')


# One dict per faculty: {'faculty': <name>, 'departments': [<name>, ...]}.
faculty_list = []
current_faculty = None
current_department = []

# Single pass over both columns. A heading link closes the faculty collected
# so far and opens a new one; any other link is a department of the faculty
# that is currently open.
for a in _iter_unit_links(soup):
    if _is_faculty_link(a):
        if current_faculty:
            faculty_list.append({'faculty': current_faculty, 'departments': current_department})
            current_department = []
        current_faculty = a.get_text().strip()
    else:
        current_department.append(a.get_text().strip())

# Flush the faculty that is still open exactly once, after *all* links have
# been seen. Flushing between the two columns would record that faculty twice
# and leave the very last faculty unrecorded.
if current_faculty:
    faculty_list.append({'faculty': current_faculty, 'departments': current_department})

if not faculty_list:
    raise RuntimeError(
        "No faculty headings were found at " + url + "; the page layout may have changed."
    )


# Turn the per-faculty records into a long table: the frame starts with one
# row per faculty holding a list of departments, and explode() unpacks each
# list into one row per (faculty, department) pair.
df = pd.DataFrame(faculty_list).explode('departments')

# Save the long table in both CSV and Excel form, leaving the index out.
for writer, target in ((df.to_csv, 'faculty_departments.csv'),
                       (df.to_excel, 'faculty_departments.xlsx')):
    writer(target, index=False)

# Load the faculty/department table, keep only the two columns of interest
# and give them the capitalised labels used by the rest of the notebook.
# NOTE(review): 'faculty_departments_transformed.csv' is not written by any
# code visible here; presumably it is an edited copy of
# 'faculty_departments.csv' -- confirm and document where it comes from.
column_labels = {'faculty': 'Faculty', 'departments': 'Department'}
structure = (
    pd.read_csv('faculty_departments_transformed.csv')
    .loc[:, list(column_labels)]
    .rename(columns=column_labels)
)
structure

Unnamed: 0,Faculty,Department
0,Faculty of Mathematical and Physical Sciences,Chemistry
1,Faculty of Mathematical and Physical Sciences,Earth Sciences
2,Faculty of Mathematical and Physical Sciences,Mathematics
3,Faculty of Mathematical and Physical Sciences,Physics and Astronomy
4,Faculty of Mathematical and Physical Sciences,Science and Technology Studies
...,...,...
69,Faculty of Engineering Sciences,"Science, Technology, Engineering and Public Po..."
70,Faculty of Engineering Sciences,Security and Crime Science
71,Faculty of Engineering Sciences,UCL School of Management
72,Faculty of Laws,Faculty of Laws


In [2]:
# Read the per-department table from disk and show it as the cell output.
department = pd.read_csv(filepath_or_buffer='department.csv')
department

Unnamed: 0,Department,Male,Female,Other,Total
0,Institute of the Americas,95,105,0,200
1,Anthropology,140,525,0,665
2,Arts and Sciences BASc,110,400,0,510
3,Institute of Archaeology,105,330,5,440
4,Bartlett Real Estate Institute,0,0,0,0
...,...,...,...,...,...
80,UCL Queen Square Institute of Neurology,150,275,0,425
81,Institute of Ophthalmology,80,165,0,245
82,Division of Surgery and Interventional Science,260,310,0,570
83,UCL Elizabeth Garrett Anderson Institute for W...,10,115,0,125


In [3]:

# Save the faculty/department table as it stands before the join.
# NOTE(review): the file name says "with_total", yet `structure` carries no
# totals at this point -- confirm whether `merged` was meant to be saved.
structure.to_csv(path_or_buf='structure_with_total.csv', index=False)

# Left join on the department name: every row of `structure` is kept, and a
# department with no match in `department` gets NaN in the joined columns.
merged = structure.merge(department, how='left', on='Department')

merged

Unnamed: 0,Faculty,Department,Male,Female,Other,Total
0,Faculty of Mathematical and Physical Sciences,Chemistry,440.0,455.0,0.0,895.0
1,Faculty of Mathematical and Physical Sciences,Earth Sciences,135.0,140.0,0.0,275.0
2,Faculty of Mathematical and Physical Sciences,Mathematics,600.0,545.0,0.0,1145.0
3,Faculty of Mathematical and Physical Sciences,Physics and Astronomy,835.0,300.0,0.0,1135.0
4,Faculty of Mathematical and Physical Sciences,Science and Technology Studies,70.0,140.0,0.0,210.0
...,...,...,...,...,...,...
69,Faculty of Engineering Sciences,"Science, Technology, Engineering and Public Po...",45.0,45.0,0.0,90.0
70,Faculty of Engineering Sciences,Security and Crime Science,115.0,170.0,0.0,285.0
71,Faculty of Engineering Sciences,UCL School of Management,975.0,1145.0,0.0,2120.0
72,Faculty of Laws,Faculty of Laws,,,,


In [21]:
import squarify
import matplotlib.pyplot as plt
import plotly.express as px
# Palette of ten hex colours, passed to the treemap as its discrete colour
# sequence for the 'Faculty' column.
colors = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
]

# Treemap with faculties as the outer level and departments nested inside;
# tiles are sized by 'Total' and coloured by faculty.
fig = px.treemap(
    data_frame=merged,
    path=['Faculty', 'Department'],
    values='Total',
    color='Faculty',
    custom_data=['Total'],
    color_discrete_sequence=colors,
)

# Tile text: the label, then the total with thousands separators and no decimals.
tile_text = '%{label}<br>%{customdata[0]:,.0f}'
# Hover text: bold label, then the tile value with two decimals.
hover_text = '<b>%{label}</b><br>Total: %{value:.2f}<br>'
fig.update_traces(
    textfont={'size': 12},
    texttemplate=tile_text,
    hovertemplate=hover_text,
)

fig.show()

In [24]:
# Second treemap over the same Faculty -> Department hierarchy, this time with
# explicit branch values, "dice-slice" tile packing and fixed margins.
# NOTE(review): 'Faculty' is used as a discrete colour here, so
# color_continuous_scale looks redundant -- confirm before removing it.
fig = px.treemap(
    data_frame=merged,
    path=['Faculty', 'Department'],
    values='Total',
    color='Faculty',
    color_discrete_sequence=colors,
    branchvalues='total',
    hover_data={'Total': ':.2f'},
    color_continuous_scale='RdBu',
    custom_data=['Total'],
)

# Tile text: the label, then the total with thousands separators and no decimals.
tile_text = '%{label}<br>%{customdata[0]:,.0f}'
# Hover text: bold label, then the tile value with two decimals. Setting it
# here replaces the hover template that Plotly Express generated.
hover_text = '<b>%{label}</b><br>Total: %{value:.2f}<br>'
fig.update_traces(
    textfont={'size': 12},
    texttemplate=tile_text,
    hovertemplate=hover_text,
    tiling={'packing': "dice-slice"},
)

# Equal 50-pixel margins on every side of the figure.
fig.update_layout(margin={'t': 50, 'b': 50, 'l': 50, 'r': 50})
fig.show()