In [49]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from io import StringIO

# Download the 2022 Asian Games medal table from Wikipedia and reshape it into
# one row per (team, medal type), written to Q1_medal.csv.
url = 'https://en.wikipedia.org/wiki/2022_Asian_Games_medal_table'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# First <table> on the page that carries the "wikitable" class.
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise RuntimeError("Could not find a 'wikitable' table on the page; the page layout may have changed.")
table_str = str(table)
table_io = StringIO(table_str)  # hand read_html a file-like object rather than a raw HTML string

df = pd.read_html(table_io)[0]

df.columns = ['Rank', 'Nation', 'Gold', 'Silver', 'Bronze', 'Total']
# Strip literal asterisks from the nation names.
df['Nation'] = df['Nation'].str.replace(r'\*', '', regex=True)

# Keep only rows whose Rank parses as a number. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignments below
# do not raise SettingWithCopyWarning or write to a temporary view.
df = df[pd.to_numeric(df['Rank'], errors='coerce').notna()].copy()
df['Rank'] = df['Rank'].astype(int)

medal_order = ['Gold', 'Silver', 'Bronze']
for medal_column in medal_order:
    # Missing medal counts are treated as zero.
    df[medal_column] = df[medal_column].fillna(0).astype(int)

# Wide -> long: one row per (Rank, Team, Medal) with its Count.
q1df = (
    df[['Rank', 'Nation'] + medal_order]
    .melt(id_vars=['Rank', 'Nation'], var_name='Medal', value_name='Count')
    .rename(columns={'Nation': 'Team'})
)

# An ordered categorical makes the sort follow Gold < Silver < Bronze instead of
# alphabetical order.
q1df['Medal'] = pd.Categorical(q1df['Medal'], categories=medal_order, ordered=True)
q1df = q1df.sort_values(by=['Rank', 'Team', 'Medal']).reset_index(drop=True)

# Final dtypes; Medal goes back from categorical to plain strings.
q1df = q1df.astype({'Rank': int, 'Count': int, 'Team': str, 'Medal': str})

q1df.to_csv('Q1_medal.csv', index=False)

In [50]:
# Summarize q1df: index range, column names, non-null counts and dtypes.
q1df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rank    123 non-null    int32 
 1   Team    123 non-null    object
 2   Medal   123 non-null    object
 3   Count   123 non-null    int32 
dtypes: int32(2), object(2)
memory usage: 3.0+ KB


In [51]:
# First six rows of q1df.
q1df.head(6)

Unnamed: 0,Rank,Team,Medal,Count
0,1,China,Gold,201
1,1,China,Silver,111
2,1,China,Bronze,71
3,2,Japan,Gold,52
4,2,Japan,Silver,67
5,2,Japan,Bronze,69


In [52]:
# Last six rows of q1df.
q1df.tail(6)

Unnamed: 0,Rank,Team,Medal,Count
117,38,Palestine,Gold,0
118,38,Palestine,Silver,0
119,38,Palestine,Bronze,1
120,38,Syria,Gold,0
121,38,Syria,Silver,0
122,38,Syria,Bronze,1


In [53]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape the Real Python "fake jobs" demo page into a table of
# position / company / city / state and write it to Q2_jobs.csv.
url = 'https://realpython.github.io/fake-jobs/'

response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently producing an empty table.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Each job posting is rendered inside a <div class="card-content">.
jobs = soup.find_all('div', class_='card-content')

job_records = []
for job in jobs:
    position = job.find('h2', class_='title').text.strip()
    company = job.find('h3', class_='subtitle').text.strip()
    location = job.find('p', class_='location').text.strip()

    # Split on the LAST comma so a city name containing commas stays intact.
    city, comma, state = location.rpartition(',')
    if not comma:
        # No comma at all: keep the whole text as the city rather than crashing
        # on tuple unpacking.
        city, state = location, ''

    job_records.append({
        'position': position,
        'company': company,
        'city': city.strip(),
        'state': state.strip(),
    })

# Explicit columns keep the schema stable even if no postings were found.
q2df = pd.DataFrame(job_records, columns=['position', 'company', 'city', 'state'])

# Postings in state 'AA' whose title mentions "engineer" (case-insensitive).
q2sample = q2df[(q2df['state'] == 'AA') & (q2df['position'].str.contains('engineer', case=False))]

q2df.to_csv('Q2_jobs.csv', index=False)

In [54]:
# Summarize q2df: index range, column names, non-null counts and dtypes.
q2df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   position  100 non-null    object
 1   company   100 non-null    object
 2   city      100 non-null    object
 3   state     100 non-null    object
dtypes: object(4)
memory usage: 3.3+ KB


In [55]:
# First three rows of q2df.
q2df.head(3)

Unnamed: 0,position,company,city,state
0,Senior Python Developer,"Payne, Roberts and Davis",Stewartbury,AA
1,Energy engineer,Vasquez-Davidson,Christopherville,AA
2,Legal executive,"Jackson, Chambers and Levy",Port Ericaburgh,AA


In [56]:
# Last three rows of q2df.
q2df.tail(3)  

Unnamed: 0,position,company,city,state
97,Database administrator,Yates-Ferguson,Port Susan,AE
98,Furniture designer,Ortega-Lawrence,North Tiffany,AA
99,Ship broker,"Fuentes, Walls and Castro",Michelleville,AP


In [57]:
# Display the filtered subset built in the scraping cell (state 'AA', "engineer" in the position).
q2sample 

Unnamed: 0,position,company,city,state
1,Energy engineer,Vasquez-Davidson,Christopherville,AA
28,Structural engineer,Pierce-Long,Herbertside,AA
32,Broadcast engineer,"Morgan, Butler and Bennett",Loribury,AA
48,"Engineer, broadcasting (operations)",Taylor PLC,Gileston,AA


In [58]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape a peakbagger.com peak list: read name / region / elevation from the
# list table, then visit each peak's own page to read its decimal-degree
# latitude and longitude. The result is collected into q3df.
url = 'https://www.peakbagger.com/list.aspx?lid=5651'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table', {'class': 'gray'})

if table is None:
    # Nothing below can work without the table, so stop with a clear error
    # instead of printing a message and then failing on table.find_all.
    raise RuntimeError("Error: Could not find the table. Please check the HTML structure and adjust the selector.")
print("Table found successfully!")


names, regions, elevs, ids, lats, longs = [], [], [], [], [], []

# Skip the first <tr> (header row).
for i, row in enumerate(table.find_all('tr')[1:]):
    cols = row.find_all('td')

    # Only rows with at least four cells are treated as peak rows.
    if len(cols) >= 4:
        name = cols[1].text.strip()
        region = cols[2].text.strip()
        elev = int(cols[3].text.strip())
        peak_url = cols[1].find('a')['href']
        # The peak id is whatever follows "pid=" in the link.
        peak_id = peak_url.split('pid=')[1]
        peak_page_url = f'https://www.peakbagger.com/{peak_url}'

        # Default to missing coordinates; overwritten only on a successful parse.
        lat, long = None, None
        try:
            peak_response = requests.get(peak_page_url, timeout=30)
            peak_response.raise_for_status()
        except requests.RequestException as e:
            # One unreachable peak page should not abort the whole crawl.
            print(f"Request failed for {name} (URL: {peak_page_url}): {e}")
        else:
            peak_soup = BeautifulSoup(peak_response.content, 'html.parser')

            lat_long_row = peak_soup.find(string='Latitude/Longitude (WGS84)')
            if lat_long_row:
                lat_long_td = lat_long_row.find_next('td')
                lat_long_text = lat_long_td.get_text(strip=True)

                print(f"Extracted lat/long text for {name}: {lat_long_text}")

                try:
                    # The text starts with "<lat>, <long> (Dec Deg)...": latitude
                    # is before the first comma, longitude is the first
                    # whitespace-separated token after it.
                    lat_long_parts = lat_long_text.split(',')
                    lat = float(lat_long_parts[0].strip())
                    long = float(lat_long_parts[1].strip().split()[0])
                except (ValueError, IndexError) as e:
                    lat, long = None, None
                    print(f"Error parsing lat/long for {name} (URL: https://www.peakbagger.com/{peak_url}): {e}")
            else:
                print(f"Lat/long not found for {name} (URL: https://www.peakbagger.com/{peak_url})")

        names.append(name)
        regions.append(region)
        elevs.append(elev)
        ids.append(peak_id)
        lats.append(lat)
        longs.append(long)

    if (i + 1) % 10 == 0:
        print(f"Extracted data for {i + 1} mountains so far...")

    # Pause between rows to avoid hammering the server.
    time.sleep(1)

q3df = pd.DataFrame({
    'name': names,
    'region': regions,
    'elev': elevs,
    'id': ids,
    'lat': lats,
    'long': longs
})

# The same peak can appear more than once in the list; keep its first row.
# The original index labels are kept, so they may have gaps.
q3df = q3df.drop_duplicates(subset='id')

def categorize_elevation(elev):
    """Return the elevation band label for ``elev``.

    Bands are half-open intervals: [0, 1000) -> 'Cat 1', [1000, 2000) -> 'Cat 2',
    [2000, 3000) -> 'Cat 3'. Any value outside those three bands (3000 and
    above, but also negative values) is labelled 'Cat 4'.
    """
    bands = (
        (0, 1000, 'Cat 1'),
        (1000, 2000, 'Cat 2'),
        (2000, 3000, 'Cat 3'),
    )
    for lower, upper, label in bands:
        if lower <= elev < upper:
            return label
    # Fallback for everything that matched no band.
    return 'Cat 4'

# Label every peak with its elevation band, then export the table without the index.
elevation_labels = q3df['elev'].map(categorize_elevation)
q3df['elev_cat'] = elevation_labels

q3df.to_csv('Q3_mountains.csv', index=False)

Table found successfully!
Extracted lat/long text for Fuji-san: 35.360638, 138.727347 (Dec Deg)35° 21' 38'' N, 138° 43' 38'' E (DMS)293513 E, 3915408 N, Zone 54 (UTM)
Extracted lat/long text for Fuji-san: 35.360638, 138.727347 (Dec Deg)35° 21' 38'' N, 138° 43' 38'' E (DMS)293513 E, 3915408 N, Zone 54 (UTM)
Extracted lat/long text for Kita-dake: 35.674537, 138.238833 (Dec Deg)35° 40' 28'' N, 138° 14' 20'' E (DMS)250093 E, 3951364 N, Zone 54 (UTM)
Extracted lat/long text for Hotaka-dake: 36.289203, 137.647986 (Dec Deg)36° 17' 21'' N, 137° 38' 53'' E (DMS)737808 E, 4019280 N, Zone 53 (UTM)
Extracted lat/long text for Aino-dake: 35.646037, 138.228292 (Dec Deg)35° 38' 46'' N, 138° 13' 42'' E (DMS)249049 E, 3948229 N, Zone 54 (UTM)
Extracted lat/long text for Yariga-take: 36.34198, 137.647625 (Dec Deg)36° 20' 31'' N, 137° 38' 51'' E (DMS)737615 E, 4025135 N, Zone 53 (UTM)
Extracted lat/long text for Warusawa-dake: 35.500736, 138.182417 (Dec Deg)35° 30' 3'' N, 138° 10' 57'' E (DMS)244433 E, 3

In [59]:
# Summarize q3df: index range, column names, non-null counts and dtypes.
q3df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 100
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      100 non-null    object 
 1   region    100 non-null    object 
 2   elev      100 non-null    int64  
 3   id        100 non-null    object 
 4   lat       100 non-null    float64
 5   long      100 non-null    float64
 6   elev_cat  100 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 6.2+ KB


In [60]:
# First three rows of q3df.
q3df.head(3)

Unnamed: 0,name,region,elev,id,lat,long,elev_cat
0,Fuji-san,Kanto,3776,10882,35.360638,138.727347,Cat 4
2,Kita-dake,Chubu,3192,10866,35.674537,138.238833,Cat 4
3,Hotaka-dake,Chubu,3190,10840,36.289203,137.647986,Cat 4


In [61]:
# Last three rows of q3df.
q3df.tail(3)

Unnamed: 0,name,region,elev,id,lat,long,elev_cat
98,Ibuki-yama,Western Japan,1377,10879,35.417856,136.406373,Cat 2
99,Kaimon-dake,Western Japan,924,10937,31.179944,130.528397,Cat 1
100,Tsukuba-san,Kanto,877,10844,36.225403,140.106705,Cat 1


In [62]:
# q3sample was displayed without being defined in any cell, so a fresh
# "Restart & Run All" failed here with NameError. Build it explicitly:
# number of peaks per region (rows) and elevation category (columns).
q3sample = pd.crosstab(q3df['region'], q3df['elev_cat'])
q3sample

elev_cat,Cat 1,Cat 2,Cat 3,Cat 4
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chubu,0,4,29,12
Hokkaido,0,5,4,0
Kanto,1,3,8,1
Tohoku,0,11,9,0
Western Japan,1,12,0,0


In [63]:
import requests
from bs4 import BeautifulSoup

# Exploratory check: fetch the faculty page with plain requests and print the
# first few <div> elements to see what markup the raw HTTP response contains.
url = "https://masters.hkubs.hku.hk/articles/masterofscienceinbusinessanalytics/ourfaculty"
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')
divs = soup.find_all('div')

# Visual divider printed after each dumped element.
separator = "\n" + "="*80 + "\n"

# zip with range(5) stops after five elements, or earlier if fewer exist.
for index, div in zip(range(5), divs):
    print(f"Div {index}:", div.prettify(), separator, sep="\n")

Div 0:
<div class="loadingPage__Animation-ukg194-1 gDolnq">
 <div aria-label="animation" role="button" style="width:100%;height:100%;overflow:hidden;margin:0 auto;outline:none" tabindex="0" title="">
 </div>
</div>



Div 1:
<div aria-label="animation" role="button" style="width:100%;height:100%;overflow:hidden;margin:0 auto;outline:none" tabindex="0" title="">
</div>



Div 2:
<div class="iHyclk">
 <section class="sidebar__Overlay-sc-1e3pgs5-9 hdgGlQ">
 </section>
 <section class="sidebar__Wrapper-sc-1e3pgs5-10 dVAfLE">
  <div style="position:relative;overflow:hidden;width:100%;height:100%">
   <div style="position:absolute;top:0;left:0;right:0;bottom:0;overflow:hidden;-webkit-overflow-scrolling:touch;margin-right:0;margin-bottom:0">
    <div class="sidebar__Navigation-sc-1e3pgs5-11 ezUfru false">
     <div class="adlogo" style="margin-bottom: 6%;">
      <a href="/">
       <img src="/uploads/image/202207/06a15021779f99588cb18658b9c37d71.png" style="width: 200px;"/>
      </a>
      

In [64]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import requests

# Render the faculty page in a real Chrome browser (via Selenium), extract each
# professor's name, degree line and portrait URL, download the portraits into
# ./images, and write the table to Q4_teachers.csv.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

url = 'https://masters.hkubs.hku.hk/articles/masterofscienceinbusinessanalytics/ourfaculty'
try:
    driver.get(url)

    # Fixed wait to give the page time to finish rendering before reading it.
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if loading or parsing raised; otherwise a
    # Chrome process is left running.
    driver.quit()

professors = []

for profile in soup.find_all('div', class_='col-md-4'):
    link = profile.find('a', class_='boximg')
    if link is None:
        # Not a professor card (no profile link inside this column): skip it
        # instead of failing on link.find below.
        continue

    name_tag = link.find('b', class_='name')
    phd_tag = link.find('p', class_='desinfo')
    image_tag = link.find('img')
    name = name_tag.text.strip() if name_tag else "Unknown"
    phd_university = phd_tag.text.strip() if phd_tag else "Unknown"
    image_url = image_tag['src'] if image_tag else ""

    professors.append({
        'name': name,
        'phd_university': phd_university,
        'image_url': image_url
    })

q4df = pd.DataFrame(professors)

os.makedirs('images', exist_ok=True)

for i, row in q4df.iterrows():
    image_url = row['image_url']
    # File name is the professor's name with spaces replaced by underscores.
    name = row['name'].replace(' ', '_') + '.jpg'
    image_path = os.path.join('images', name)

    if image_url:
        try:
            # image_url is site-relative, so prefix the host.
            img_response = requests.get('https://masters.hkubs.hku.hk' + image_url, timeout=30)
            # Check the status before writing so an HTTP error page is never
            # saved to disk as if it were a .jpg.
            img_response.raise_for_status()
            with open(image_path, 'wb') as img_file:
                img_file.write(img_response.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and continue with the next image.
            print(f"Failed to download image for {row['name']}: {e}")

q4df.to_csv('Q4_teachers.csv', index=False)

                                                name  \
0                                  Prof. Boris Babic   
1                                  Prof. Zhanrui CAI   
2                           Prof. Michael C. L. CHAU   
3  Prof. Hailiang CHEN, Assistant Dean (Taught Po...   
4                                    Prof. Chao DING   

                                      phd_university  \
0             JD., Harvard Law School, United States   
1  Ph.D., The Pennsylvania State University, Unit...   
2           Ph.D., Arizona University, United States   
3            Ph.D., Purdue University, United States   
4        Ph.D., University of Florida, United States   

                                           image_url  
0  /uploads/image/202409/af79edec8240dc022014af59...  
1  /uploads/image/202408/45326d58ed53b6ad2afb7adf...  
2  /uploads/image/202205/ac5d364416b5bb9a1714038c...  
3  /uploads/image/202205/7bf2456efbfe30faf679e868...  
4  /uploads/image/202205/caadde230670b19f303e4d28..

In [65]:
# First rows of q4df (head() defaults to five).
q4df.head()

Unnamed: 0,name,phd_university,image_url
0,Prof. Boris Babic,"JD., Harvard Law School, United States",/uploads/image/202409/af79edec8240dc022014af59...
1,Prof. Zhanrui CAI,"Ph.D., The Pennsylvania State University, Unit...",/uploads/image/202408/45326d58ed53b6ad2afb7adf...
2,Prof. Michael C. L. CHAU,"Ph.D., Arizona University, United States",/uploads/image/202205/ac5d364416b5bb9a1714038c...
3,"Prof. Hailiang CHEN, Assistant Dean (Taught Po...","Ph.D., Purdue University, United States",/uploads/image/202205/7bf2456efbfe30faf679e868...
4,Prof. Chao DING,"Ph.D., University of Florida, United States",/uploads/image/202205/caadde230670b19f303e4d28...


In [66]:
# Summarize q4df: index range, column names, non-null counts and dtypes.
q4df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            28 non-null     object
 1   phd_university  28 non-null     object
 2   image_url       28 non-null     object
dtypes: object(3)
memory usage: 804.0+ bytes
