## import

In [1]:
import re
from pathlib import Path
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

def convert_date_thai(original_date):
    """Convert a Thai long-form date string into an ISO 8601 date string.

    Example: "22 กันยายน  2565" -> "2022-09-22"

    Args:
        original_date: Text of the form
            "<day> <Thai month name> <Buddhist Era year>". The three fields
            may be separated, preceded and followed by any amount of
            whitespace (spaces, tabs, newlines, non-breaking spaces).

    Returns:
        The same date as a "YYYY-MM-DD" string in the Common Era.

    Raises:
        ValueError: if the text does not contain exactly three fields, or the
            year field is not an integer.
        KeyError: if the month field is not a known Thai month name.
    """
    # str.split() without arguments splits on runs of ANY whitespace and
    # ignores leading/trailing whitespace, so scraped text needs no cleanup.
    day, month, year = original_date.split()
    year = int(year) - 543  # Buddhist Era -> Common Era
    # July is listed twice: the correct spelling and a frequent misspelling.
    month = {'มกราคม':'01','กุมภาพันธ์':'02','มีนาคม':'03','เมษายน':'04','พฤษภาคม':'05','มิถุนายน':'06',
             'กรกฎาคม':'07', 'กรกฏาคม':'07', 'สิงหาคม':'08','กันยายน':'09','ตุลาคม':'10','พฤศจิกายน':'11','ธันวาคม':'12'}[month]
    return f'{year}-{month}-{day.zfill(2)}'  # 3 -> 03


## instantiate webdriver

In [2]:
## start a Chrome browser session controlled by Selenium
## (the `driver` object created here is reused by all later cells)
driver = webdriver.Chrome()

## open the page that holds the month/year selection form
url = 'http://www.raot.co.th/rubber2012/menu5.php'
driver.get(url)

## scrape the table of one month

In [3]:
## choose the month and year to scrape
## the page's two selectboxes have the ids "select" (month) and "year"
## the year is given in the Buddhist Era (2565 BE = 2022 CE)
month_to_scrape = 12
year_to_scrape = 2565

## set both selectbox values directly in the page through JavaScript
driver.execute_script(f"document.getElementById('select').value={month_to_scrape};")
driver.execute_script(f"document.getElementById('year').value={year_to_scrape};")

In [4]:
## click the GO button (the first <input type="submit"> found on the page);
## the browser then navigates to the result page
driver.execute_script(f"document.querySelector('input[type=submit]').click();")

## alternative using Selenium's own locator API
## (needs `from selenium.webdriver.common.by import By` first)
# driver.find_element(By.NAME, 'Submit').click()

In [5]:
## convert to BeautifulSoup
## driver.page_source is already a decoded Python str (the browser has handled
## the page's TIS-620 encoding), so it is passed as-is: re-encoding it would
## only add an encoding-detection step and fails on characters outside TIS-620
## the parser is named explicitly so the parse tree does not depend on which
## optional parsers (lxml, html5lib) happen to be installed
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [6]:
## the target table has no id of its own, so it is located by its position
## below the element with id "ewt_main_structure":
## #ewt_main_structure > tbody > tr:nth-child(2) > td > div > center > table:last-child > tbody
css_selector = "#ewt_main_structure > tbody > tr:nth-child(2) > td > div > center > table:last-child > tbody"
table_soup = soup.select_one(css_selector) # <tbody> of the target table (None if nothing matches)

In [7]:
## collect the rows of the target table
## the table contains inner tables, so take only the direct-child <tr> tags
## (recursive=False) and not the <tr> tags nested inside those inner tables
## note: the first two <tr> are header rows and are skipped when parsing
table_rows = table_soup.find_all('tr', recursive=False)
len(table_rows)

22

In [8]:
# show the first 2 rows of the table (the two header rows)
# the values live inside <td> tags, so those have to be iterated to get the data
table_rows[:2]

[<tr bgcolor="#F5CC6B">
 <td height="22" rowspan="2" width="17%"><div align="center"><font color="#000000" face="Tahoma, MS Sans Serif, AngsanaUPC" size="2">วัน/เดือน/ปี</font></div></td>
 <td colspan="2" height="22"><div align="center"><font color="#000000" face="Tahoma, MS Sans Serif, AngsanaUPC" size="2">ท้องถิ่น</font></div></td>
 <td colspan="4" height="22"><div align="center"><font color="#000000" face="Tahoma, MS Sans Serif, AngsanaUPC" size="2">ราคาประมูล 
                             ณ ตลาดกลางยางพารา อ.หาดใหญ่</font></div></td>
 </tr>,
 <tr>
 <td bgcolor="#F5CC6B" height="22" width="9%"><div align="center"><font color="#000000" face="Tahoma, MS Sans Serif, AngsanaUPC" size="2">ยางแผ่นดิบ</font></div></td>
 <td bgcolor="#F5CC6B" height="22" width="10%"><div align="center"><font color="#000000" face="Tahoma, MS Sans Serif, AngsanaUPC" size="2">น้ำยางสด<br/>(ณ โรงงาน)</font></div></td>
 <td bgcolor="#F5CC6B" height="22" width="10%"><div align="center"><font color="#000000" face=

## make DataFrame from table data

In [9]:
## column labels, in the same left-to-right order as the cells of a table row
## two labels occur twice (once in the local-price group, once in the
## auction-price group), so the columns are matched by position, not by name
header = ['วัน/เดือน/ปี','ยางแผ่นดิบ','น้ำยางสด','ยางแผ่นดิบ','ยางแผ่นรมควัน ชั้น3','น้ำยางสด','FOB']

## gather all parsed rows first and build the DataFrame once at the end;
## growing a DataFrame one row at a time re-allocates it on every insert
parsed_rows = []

## skip the first 2 rows (header) and the final row (monthly mean)
for row in table_rows[2:-1]:
    ## the first <td> of the row holds the Thai-formatted date
    date_text = row.find('td').text.strip()
    record = [convert_date_thai(date_text)]
    ## every price sits in its own inner table (6 per row)
    for inner_table in row.find_all('table'):
        cell_text = inner_table.find('td').text.strip()
        try:
            price = float(cell_text)
        except ValueError:
            ## blank or non-numeric cell -> missing value
            price = None
        record.append(price)
    parsed_rows.append(record)

df = pd.DataFrame(parsed_rows, columns=header)

In [10]:
## display the scraped table: one row per date, one column per price series
df

Unnamed: 0,วัน/เดือน/ปี,ยางแผ่นดิบ,น้ำยางสด,ยางแผ่นดิบ.1,ยางแผ่นรมควัน ชั้น3,น้ำยางสด.1,FOB
0,2022-12-01,42.65,44.1,,46.82,,56.1
1,2022-12-02,42.65,44.1,44.0,46.8,,56.1
2,2022-12-06,42.65,44.1,,46.9,,56.25
3,2022-12-07,43.0,44.4,,47.25,,56.6
4,2022-12-08,43.35,44.5,,47.35,,56.7
5,2022-12-09,43.6,44.7,44.75,47.6,,56.95
6,2022-12-13,43.6,44.9,44.8,47.7,,57.1
7,2022-12-14,43.65,45.3,,47.9,,57.35
8,2022-12-15,43.5,45.4,,48.55,,57.45
9,2022-12-16,43.75,45.5,,48.59,,57.45


## save the data

In [11]:
## write the single-month table to the current working directory
## (file name pattern: rubber_<year>_<month>.csv, e.g. rubber_2565_12.csv)
df.to_csv(f'rubber_{year_to_scrape}_{month_to_scrape}.csv', encoding='utf8', index=False)

## scrape with for loop

In [13]:
## column labels, in the same left-to-right order as the cells of a table row
## (two labels occur twice, so columns are matched by position, not by name)
header = ['วัน/เดือน/ปี','ยางแผ่นดิบ','น้ำยางสด','ยางแผ่นดิบ','ยางแผ่นรมควัน ชั้น3','น้ำยางสด','FOB']

## selector of the price table; it never changes, so define it once up front
css_selector = "#ewt_main_structure > tbody > tr:nth-child(2) > td > div > center > table:last-child > tbody"

## parsed rows of all months are gathered here and turned into a DataFrame
## once at the end (growing a DataFrame row by row re-allocates it each time)
scraped_rows = []

## get home
url = 'http://www.raot.co.th/rubber2012/menu5.php'
driver.get(url)

## iterate year (Buddhist Era: 2563-2565 BE = 2020-2022 CE) and month
for year in tqdm(range(2563, 2566)):
    for month in range(1, 13):
        ## fill in the form and submit it to get the target page
        driver.execute_script(f"document.getElementById('select').value={month};")
        driver.execute_script(f"document.getElementById('year').value={year};")
        driver.execute_script("document.querySelector('input[type=submit]').click();")
        sleep(1) # waiting time

        ## convert to BeautifulSoup
        ## page_source is already a decoded str; the parser is named explicitly
        ## so the parse tree does not depend on the installed optional parsers
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        ## get the table data (None when the page has no matching table)
        table_soup = soup.select_one(css_selector)

        if table_soup is None:
            ## report and skip this month instead of aborting the whole run
            tqdm.write(f'price table not found for {year}-{month:02d}; month skipped')
        else:
            ## only the direct-child rows; inner tables have their own <tr> tags
            table_rows = table_soup.find_all('tr', recursive=False)

            ## skip the first 2 rows (header) and the final row (monthly mean)
            for row in table_rows[2:-1]:
                ## the first <td> of the row holds the Thai-formatted date
                date_text = row.find('td').text.strip()
                record = [convert_date_thai(date_text)]
                ## every price sits in its own inner table (6 per row)
                for inner_table in row.find_all('table'):
                    cell_text = inner_table.find('td').text.strip()
                    try:
                        price = float(cell_text)
                    except ValueError:
                        ## blank or non-numeric cell -> missing value
                        price = None
                    record.append(price)
                scraped_rows.append(record)

        ## browser back to first page to select next month
        driver.back()
        sleep(1) # waiting time

df = pd.DataFrame(scraped_rows, columns=header)

100%|██████████| 3/3 [01:31<00:00, 30.61s/it]


In [14]:
# save to csv file
# to_csv() does not create missing folders, so make sure 'data/' exists first
output_path = Path('data/rubber_all.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, encoding='utf8', index=False)