In [None]:
from urllib.request import urlopen
from urllib.parse import urljoin
import time

from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
def get_specs(url):
    """Return the specs listed on a product page as a dictionary.

    The page at ``url`` is downloaded and parsed with BeautifulSoup's
    ``html.parser``. Only table rows (``<tr>``) located inside the element
    ``<div id="specs">`` are considered, and only rows that contain both a
    ``<th scope="row">`` header and a ``<td>`` cell contribute an entry.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    dict
        Maps the text of each row header to the text of the row's first
        ``<td>`` with newline characters removed. Empty when the page has no
        ``<div id="specs">`` element or no usable rows.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    """
    # Use the response as a context manager so the connection is always
    # released, even when reading fails.
    with urlopen(url) as response:
        html_doc = response.read()

    soup = BeautifulSoup(html_doc, "html.parser")
    specs_div = soup.find("div", {"id": "specs"})
    if specs_div is None:
        # Page without a specs section: nothing to extract.
        return {}

    specs = {}
    for row in specs_div.find_all("tr"):
        header = row.find("th", {"scope": "row"})
        cell = row.find("td")
        # Skip section-title rows (no row header) and malformed rows (no cell).
        if header is None or cell is None:
            continue
        specs[header.text] = cell.text.replace('\n', '')
    return specs

def get_laptop_urls_in_page(page_url):
    """Collect the spec-page URL of every product block on a listing page.

    The page at ``page_url`` is downloaded and parsed with BeautifulSoup's
    ``html.parser``. Every ``<div class="product">`` that has an ``id``
    attribute and contains an ``<a class="white">`` link with an ``href``
    contributes one entry; blocks missing any of these are skipped.

    Parameters
    ----------
    page_url : str
        Address of the listing page to fetch.

    Returns
    -------
    dict
        Maps the ``id`` attribute of each product block to a URL built by
        joining the last path segment of the link's ``href`` onto the
        hard-coded laptop root URL.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    """
    root_url = "http://www.comparez-malin.fr/informatique/pc-portable/"

    # Use the response as a context manager so the connection is always
    # released, even when reading fails.
    with urlopen(page_url) as response:
        html_doc = response.read()

    soup = BeautifulSoup(html_doc, "html.parser")
    specs_urls = {}
    for block in soup.find_all("div", {"class": "product"}):
        link = block.find("a", {"class": "white"})
        if link is None:
            # find() returns None when the link is absent; subscripting it
            # would raise TypeError and abort the whole page.
            continue
        key = block.get("id")
        href = link.get("href")
        if key is None or href is None:
            continue
        # Keep only the file name of the link and anchor it on the root URL.
        specs_urls[key] = urljoin(root_url, href.split('/')[-1])
    return specs_urls

In [None]:
# Time n consecutive downloads of a single product page and gather the
# extracted specs in a DataFrame (one row per download).
url = "http://www.comparez-malin.fr/informatique/pc-portable/asus-zenbook-ux305ca-fc057t.html"
n = 5
# Collect the rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0, and pre-seeding the frame with index=range(n) left n
# empty rows in front of the real ones.
records = []
tic = time.time()
for i in tqdm(range(n)):
    specs = get_specs(url)
    records.append(specs)
df = pd.DataFrame(records)
print("Elapsed time: {0:.2f}".format(time.time()-tic))

In [None]:
# Walk the paginated laptop listing (pages 1..n) and merge the product URLs
# found on each page into a single dictionary keyed by product id.
root_url = "http://www.comparez-malin.fr/informatique/pc-portable/{}"
n = 265
specs_urls = {}
for i in tqdm(range(n)):
    # Page numbers on the site start at 1, the loop index at 0.
    page_url = root_url.format(i + 1)
    urls_in_page = get_laptop_urls_in_page(page_url)
    specs_urls.update(urls_in_page)

In [None]:
# Store the collected URLs on disk: the dictionary becomes a one-column
# ('url') DataFrame indexed by its keys, which is then written as CSV.
s = pd.Series(data=specs_urls, name='url')
df = s.to_frame()
df.to_csv(path_or_buf='data/specs_urls.csv')