### Sample program for crawling  

#### Import libraries  

In [1]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

#### Parameters  

In [2]:
# Address of the page to crawl; 127.0.0.1 is the local machine, so a web server
# must be serving sample.html on port 8887 before the next cells are run.
url = 'http://127.0.0.1:8887/sample.html'
# Name of the CSV file that the final cell writes the scraped table to.
csv_out = 'iniad_rooms.csv'

#### Crawling the web page

In [3]:
# Download the page. The session is used as a context manager so its pooled
# connections are released as soon as the request has completed.
with requests.Session() as s:
    # Without a timeout requests waits indefinitely for an unresponsive server;
    # 10 seconds is generous for a page served from the local machine.
    r = s.get(url, timeout=10)

print(r.status_code)
print(r.encoding)

# Stop here on a 4xx/5xx answer: the following cells would otherwise parse an
# error page and fail later with a much less helpful message.
r.raise_for_status()

200
utf-8


In [4]:
# Show the response body. r.text is the body decoded to str; r.content would
# give the same body as raw, undecoded bytes.
print(r.text)

<!DOCTYPE html>
<html lang="ja">
  <head>
    <meta charset="UTF-8">
content-type
    <title>Sample Page for Crawling & Scraping</title>
    <script src="js/sample.js"></script>
  </head>
  <body>
    <p>Rooms for conference in <a href="https://www.iniad.org/">INIAD</a></p>

    <h3>Hall</h3>
    <ul id="hall">
      <li class="capacity">Capacity: 400</li>
      <li class="floor">1F</li>
    </ul>

    <h3>Lecture room</h3>
    <ul id="lecture_room">
      <li class="capacity">Capacity: 200</li>
      <li class="floor">2F</li>
    </ul>

    <h3>Large rooms</h3>
    <ul id="large_rooms">
      <li class="capacity">Capacity: 80</li>
      <li class="floor">3,4F</li>
    </ul>

    <label for="num">Num:</label>
    <input id="num" type="text">
    <button onclick="calc()">Select</button><br>
    Please use <span id="result">???</span>
  </body>
</html>


#### Scraping (parse web contents)  

In [5]:
# Parse the downloaded HTML text with the parser from Python's standard library.
bs = BeautifulSoup(markup=r.text, features='html.parser')

#### Get element information by tag

In [6]:
# find() returns only the first element that matches the given tag name.
print("FIND:")
print("Tag: p")
p_obj = bs.find('p')

# The element itself prints as its full markup.
print("Whole:")
print(p_obj)

# .text is the element's text with the markup removed; .contents is the list
# of its direct children. Each label is printed before its value is read.
for label, attr_name in (("Text:", "text"), ("Contents:", "contents")):
    print(label)
    print(getattr(p_obj, attr_name))

FIND:
Tag: p
Whole:
<p>Rooms for conference in <a href="https://www.iniad.org/">INIAD</a></p>
Text:
Rooms for conference in INIAD
Contents:
['Rooms for conference in ', <a href="https://www.iniad.org/">INIAD</a>]


In [7]:
# A child element can be reached by using its tag name as an attribute.
print("Child elements:")
a_obj = p_obj.a

# Each entry pairs a label with the way to obtain the matching value from the
# tag; the value is only looked up after its label has been printed.
for label, extract in (
    ("Whole:", lambda tag: tag),
    ("Text:", lambda tag: tag.text),
    ("Contents:", lambda tag: tag.contents),
    ("Attrs:", lambda tag: tag.attrs),          # all attributes as a dict
    ("Get attrs:", lambda tag: tag.get("href")),  # a single attribute value
):
    print(label)
    print(extract(a_obj))

Child elements:
Whole:
<a href="https://www.iniad.org/">INIAD</a>
Text:
INIAD
Contents:
['INIAD']
Attrs:
{'href': 'https://www.iniad.org/'}
Get attrs:
https://www.iniad.org/


In [8]:
# find_all() returns every matching element, not just the first one.
print("FIND_ALL:")
h3_all = bs.find_all('h3')
for heading in h3_all:
    # sep="\n" puts the label and its value on separate lines.
    print("Whole:", heading, sep="\n")
    print("Text:", heading.text, sep="\n")
    print("Contents:", heading.contents, sep="\n")
    print('======')

FIND_ALL:
Whole:
<h3>Hall</h3>
Text:
Hall
Contents:
['Hall']
Whole:
<h3>Lecture room</h3>
Text:
Lecture room
Contents:
['Lecture room']
Whole:
<h3>Large rooms</h3>
Text:
Large rooms
Contents:
['Large rooms']


#### Get element information by ID

In [9]:
# select() takes a CSS selector ('#hall' matches id="hall") and always returns
# a list, even when only one element matches.
id_hall = bs.select('#hall')
print(id_hall)

# Take the first match out of the list and collect the <li> items inside it.
id_hall_li = id_hall[0].find_all(name='li')
print(id_hall_li)

[<ul id="hall">
<li class="capacity">Capacity: 400</li>
<li class="floor">1F</li>
</ul>]
[<li class="capacity">Capacity: 400</li>, <li class="floor">1F</li>]


#### Get element information by class

In [10]:
# '.capacity' is the CSS selector for every element with class="capacity".
class_capacity = bs.select('.capacity')
print(class_capacity)

# Pick a single element from the result list (index 1 is the second match)
# and show only its text.
second_capacity = class_capacity[1]
print(second_capacity.text)

[<li class="capacity">Capacity: 400</li>, <li class="capacity">Capacity: 200</li>, <li class="capacity">Capacity: 80</li>]
Capacity: 200


#### Output to CSV  

In [11]:
# Build the room table and save it as CSV.
# h3_all (room headings) and class_capacity (capacity items) were collected in
# earlier cells; the floor items are selected here. The three lists are paired
# purely by their order in the document.
class_floor = bs.select('.floor')

# Pairing by position is only meaningful when every room has exactly one
# capacity item and one floor item, so stop with a clear message instead of
# failing with an IndexError or silently dropping entries.
if not (len(h3_all) == len(class_capacity) == len(class_floor)):
    raise ValueError(
        'Cannot pair rooms with capacities and floors: found {} headings, '
        '{} capacity items and {} floor items'.format(
            len(h3_all), len(class_capacity), len(class_floor)
        )
    )

rooms = [heading.text for heading in h3_all]
# Remove the "Capacity:" label and keep only the digits; text that does not
# have that exact form is kept unchanged.
capacities = [
    re.sub(r'^Capacity:\s*(\d+)$', r'\1', item.text) for item in class_capacity
]
floors = [item.text for item in class_floor]

# Create the frame in one step from the finished columns rather than filling
# an empty frame column by column.
df = pd.DataFrame({'Room': rooms, 'Capacity': capacities, 'Floor': floors})
display(df)

# index=False leaves the row numbers out of the file.
df.to_csv(csv_out, index=False)

Unnamed: 0,Room,Capacity,Floor
0,Hall,400,1F
1,Lecture room,200,2F
2,Large rooms,80,"3,4F"
