# Extract openbenchmarking.org results

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Step 1: List the test-profile URLs of openbenchmarking.org

Each URL corresponds to exactly one test profile.

In [123]:
# Build the table of test profiles (name, description, type, url) from a locally
# saved copy of the openbenchmarking.org listing page, then save it as CSV.

# The context manager closes the file handle deterministically; the previous
# `open(...).read()` left it open until garbage collection.
with open("./data/pages/test_profile.html") as page_file:
    parser = BeautifulSoup(page_file.read(), 'html.parser')

listing = list(parser.find_all('div', class_='row'))

lines = []

for row in listing:

    # Every field defaults to None so that incomplete rows are removed by dropna() below.
    name = None
    url = None
    type_pts = None
    description = None

    # Only direct children are considered (recursive=False).
    content = row.findChildren('div', class_='col-sm-6', recursive=False)

    # First col-sm-6 div: the link whose text is the name and whose href is the url.
    if len(content) > 0:
        anchors = content[0].findChildren("a")
        if len(anchors) > 0:
            name, url = anchors[0].get_text(), anchors[0].get("href")

    # Second col-sm-6 div: the <h4> holding the type. Guarded separately so that a row
    # with a single col-sm-6 div (or no <h4>) is skipped instead of raising IndexError.
    if len(content) > 1:
        headings = content[1].findChildren("h4")
        if len(headings) > 0:
            type_pts = headings[0].get_text()

    content12 = row.findChildren('div', class_='col-sm-12', recursive=False)

    if len(content12) > 0:
        spans = content12[0].findChildren('span')
        if len(spans) > 0:
            # Commas and newlines are stripped from the description
            # (presumably to keep the CSV export simple -- TODO confirm).
            description = spans[0].get_text().replace(",","").replace("\n", "")

    lines.append((name, description, type_pts, url))

# Naming the columns in the constructor also works when `lines` is empty; assigning
# `df.columns` afterwards raised ValueError on a zero-column frame.
df = pd.DataFrame(lines, columns=["name", "description", "type", "url"]).dropna()
df.to_csv("./data/list_urls.csv")
df

Unnamed: 0,name,description,type,url
2,AI Benchmark Alpha,AI Benchmark Alpha is a Python library for eva...,System,https://openbenchmarking.org/test/pts/ai-bench...
3,Aircrack-ng,Aircrack-ng is a tool for assessing WiFi/WLAN ...,Processor,https://openbenchmarking.org/test/pts/aircrack-ng
4,Algebraic Multi-Grid Benchmark,AMG is a parallel algebraic multigrid solver f...,Processor,https://openbenchmarking.org/test/pts/amg
5,AOBench,AOBench is a lightweight ambient occlusion ren...,Processor,https://openbenchmarking.org/test/pts/aobench
6,AOM AV1,This is a test of the AOMedia AV1 encoder (lib...,Processor,https://openbenchmarking.org/test/pts/aom-av1
...,...,...,...,...
466,Xsbench,XSBench is a mini-app representing a key compu...,System,https://openbenchmarking.org/test/pts/xsbench
467,Xsbench OpenCL,Xsbench benchmark in OpenCL via GPUOpen.,System,https://openbenchmarking.org/test/pts/xsbench-cl
468,Y-Cruncher,Y-Cruncher is a multi-threaded Pi benchmark.,Processor,https://openbenchmarking.org/test/pts/y-cruncher
469,YafaRay,YafaRay is an open-source physically based mon...,Processor,https://openbenchmarking.org/test/pts/yafaray


# Step 2 : Extract the content of test profiles

Input:
- the URL of a test-profile webpage

Output:
- a dataset of the performance results reported for each system under test

In [152]:
class ExtractorTP():
    """Scrape the results table of an openbenchmarking.org test-profile page.

    input : the url of an openbenchmarking.org webpage
    output : a dataframe containing the average performances for a configurable system
    """

    # Columns of the dataframe built by extract_data: the href of the link found in
    # the first cell, followed by the text of each cell of the row.
    COLUMNS = ["idproc", "descproc", "percentile", "nbproc", "perf"]

    def __init__(self, url, timeout=30):
        """Download and parse the page.

        Parameters
        ----------
        url : str
            Address of the page to scrape.
        timeout : float, optional
            Seconds to wait for the server; without a timeout `requests.get`
            can block indefinitely.

        Raises
        ------
        requests.exceptions.RequestException
            On a network failure, a timeout, or an HTTP error status
            (instead of silently parsing an error page).
        """
        self.url = url
        response = requests.get(self.url, timeout=timeout)
        response.raise_for_status()
        self.parser = BeautifulSoup(response.text, 'html.parser')
        # to remove the "Low-Tier"-like lines
        self.banwords = ["Low-Tier", "Median", "Mid-Tier"]

    def extract_data(self):
        """Return one dataframe row per usable 'div_table_row' element.

        Rows that extract_row rejects, or that do not yield exactly
        len(COLUMNS) values, are dropped. The surviving rows keep the
        positional index of the element they came from. A page without any
        usable row gives an empty dataframe that still has the expected columns.
        """
        row_text = self.parser.find_all('div', class_='div_table_row')
        lines = [self.extract_row(rt) for rt in row_text]
        width = len(self.COLUMNS)
        # Unusable rows become all-None placeholders so that dropna() removes them
        # without shifting the index of the remaining rows.
        lines = [line if len(line) == width else [None] * width for line in lines]
        return pd.DataFrame(lines, columns=self.COLUMNS).dropna()

    def extract_row(self, row_text):
        """Return [href, text of cell 0, text of cell 1, ...] for one table row.

        An empty list is returned when the row has no 'div_table_cell' child,
        when the text of its first cell is one of the banwords, or when its
        first cell contains no link.
        """
        cells = row_text.find_all('div', class_='div_table_cell')
        if not cells:
            return []
        if cells[0].get_text() in self.banwords:
            return []
        links = cells[0].findChildren("a")
        if not links:
            return []
        content_line = [links[0].get("href")]
        content_line.extend([c.get_text() for c in cells])
        return content_line

### Test with x264

In [153]:
# Download the x264 test-profile page (network request made by the constructor)
# and display the extracted table. Note: this rebinds the notebook-level `df`.
ext = ExtractorTP('https://openbenchmarking.org/test/pts/x264')
df = ext.extract_data()
df

Unnamed: 0,idproc,descproc,percentile,nbproc,perf
0,/s/AMD+EPYC+7763+64-Core,AMD EPYC 7763 64-Core,100th,6,236 +/- 3
1,/s/AMD+EPYC+75F3+32-Core,AMD EPYC 75F3 32-Core,100th,8,229 +/- 11
2,/s/AMD+EPYC+7713+64-Core,AMD EPYC 7713 64-Core,99th,6,225 +/- 2
3,/s/2+x+AMD+EPYC+75F3+32-Core,2 x AMD EPYC 75F3 32-Core,99th,13,223 +/- 28
4,/s/2+x+AMD+EPYC+7713+64-Core,2 x AMD EPYC 7713 64-Core,99th,7,222 +/- 3
...,...,...,...,...,...
176,/s/Intel+Pentium+Dual+E2220,Intel Pentium Dual E2220,6th,3,5
177,/s/ARMv8+Cortex-A55+4-Core,ARMv8 Cortex-A55 4-Core,5th,3,4
178,/s/Intel+Pentium+Dual+T2310,Intel Pentium Dual T2310,4th,7,3
179,/s/ARMv7+Cortex-A53+4-Core,ARMv7 Cortex-A53 4-Core,4th,3,2


### Test with x265

In [154]:
# Same check on the x265 test-profile page; `ext` and `df` are rebound again,
# so the x264 results are no longer reachable through these names.
ext = ExtractorTP('https://openbenchmarking.org/test/pts/x265')
df = ext.extract_data()
df

Unnamed: 0,idproc,descproc,percentile,nbproc,perf
0,/s/AMD+EPYC+75F3+32-Core,AMD EPYC 75F3 32-Core,100th,9,31.6 +/- 0.3
1,/s/Intel+Xeon+Platinum+8380,Intel Xeon Platinum 8380,100th,6,31.2
2,/s/AMD+EPYC+7643+48-Core,AMD EPYC 7643 48-Core,100th,3,30.1 +/- 0.3
3,/s/AMD+EPYC+7763+64-Core,AMD EPYC 7763 64-Core,99th,14,30.0 +/- 0.7
4,/s/AMD+EPYC+74F3+24-Core,AMD EPYC 74F3 24-Core,98th,6,29.3 +/- 0.1
...,...,...,...,...,...
154,/s/Intel+Core+i3-4130,Intel Core i3-4130,7th,3,2.8
155,/s/AMD+A10-7850K+APU,AMD A10-7850K APU,7th,6,2.6 +/- 0.1
156,/s/AMD+Ryzen+3+3200U,AMD Ryzen 3 3200U,6th,3,2.6
157,/s/POWER9+4-Core,POWER9 4-Core,4th,7,1.3


# Step 3 : Extract the description of processors

In [159]:
class ExtractorPROC():
    """Scrape the raw technical details shown on a processor page.

    input : the url of an openbenchmarking.org webpage listing the technical details of a processor
    output : a list with the text of every <pre> element of the page
    """

    def __init__(self, url, timeout=30):
        """Download and parse the page.

        Parameters
        ----------
        url : str
            Address of the processor page to scrape.
        timeout : float, optional
            Seconds to wait for the server; without a timeout `requests.get`
            can block indefinitely.

        Raises
        ------
        requests.exceptions.RequestException
            On a network failure, a timeout, or an HTTP error status
            (instead of silently parsing an error page).
        """
        self.url = url
        response = requests.get(self.url, timeout=timeout)
        response.raise_for_status()
        self.parser = BeautifulSoup(response.text, 'html.parser')

    def extract_data(self):
        """Return the text content of each <pre> element, in document order.

        The result is a plain list of strings (empty when the page has no
        <pre> element); no dataframe is built here.
        """
        return [block.get_text() for block in self.parser.find_all('pre')]

### Test with one processor

In [160]:
# Download one processor page (network request made by the constructor) and show
# the text of its <pre> blocks; `ext` is rebound to the processor extractor here.
ext = ExtractorPROC('https://openbenchmarking.org/s/AMD+EPYC+74F3+24-Core')
ext.extract_data()

['processor\t: 0\nvendor_id\t: AuthenticAMD\ncpu family\t: 25\nmodel\t\t: 1\nmodel name\t: AMD EPYC 74F3 24-Core Processor\nstepping\t: 1\nmicrocode\t: 0xa001119\ncpu MHz\t\t: 3200.000\ncache size\t: 512 KB\nphysical id\t: 0\nsiblings\t: 48\ncore id\t\t: 0\ncpu cores\t: 24\napicid\t\t: 0\ninitial apicid\t: 0\nfpu\t\t: yes\nfpu_exception\t: yes\ncpuid level\t: 16\nwp\t\t: yes\nflags\t\t: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx sma