# Laptop specifications scraper
Scrape comparez-malin.fr for laptop specifications.

In [None]:
import os
import time
from urllib.request import urlopen
from urllib.parse import urljoin

from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd

## Utility functions

In [None]:
def get_specs(url):
    """Return the specifications listed on a laptop page as a dictionary.

    Downloads ``url``, locates the ``<div id="specs">`` element and collects
    one entry per table row that ``extract_spec_`` recognises.

    Args:
        url: Address of the laptop page to download.

    Returns:
        dict mapping each specification name to its text value. Empty when
        the page has no ``specs`` block or no recognisable row.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, "html.parser")
    specs_block = soup.find("div", {"id": "specs"})
    if specs_block is None:
        # No specs table on this page: report "no specs" rather than
        # failing with AttributeError on None.
        return {}

    specs = {}
    for row in specs_block.find_all("tr"):
        key, value = extract_spec_(row)
        if key:
            specs[key] = value
    return specs


def extract_spec_(spec):
    """Extract a ``(name, value)`` pair from one table row.

    The name is the text of the row's ``<th scope="row">`` cell and the value
    is the text of its first ``<td>`` cell with newlines removed.

    Args:
        spec: Parsed table row exposing a BeautifulSoup-style ``find``.

    Returns:
        ``(name, value)`` as strings, or ``(None, None)`` when the row lacks
        either the header cell or the value cell.
    """
    header = spec.find("th", {"scope": "row"})
    if header is None:
        return None, None
    cell = spec.find("td")
    if cell is None:
        # Header without a value cell: skip the row instead of raising
        # AttributeError on None.
        return None, None
    return header.text, cell.text.replace('\n', '')


def extract_spec(spec):
    """Extract a ``(name, value)`` pair from a two-cell table row.

    Rows that do not contain exactly two ``<td>`` cells yield
    ``(None, None)``. In the value, newlines are removed and each
    non-breaking space is replaced by two regular spaces.
    """
    cells = spec.find_all("td")
    if len(cells) != 2:
        return None, None

    name_cell, value_cell = cells
    name = name_cell.text
    cleaned = value_cell.text.replace("\n", "").replace("\xa0", "  ")
    return name, cleaned


def get_laptop_urls_in_page(page_url):
    """Collect the laptop page URLs listed on one results page.

    Every ``<div class="product">`` block contributes one entry keyed by the
    block's ``id`` attribute. The target is taken from the block's
    ``<a class="white">`` link; only the last path segment of that link is
    kept and re-attached to the laptop section root URL.

    Args:
        page_url: Address of the listing page to download.

    Returns:
        dict mapping product block ids to absolute laptop page URLs. Blocks
        without an id, without a usable link, or whose link contains
        ``"tablette"`` are left out.
    """
    root_url = "http://www.comparez-malin.fr/informatique/pc-portable/"
    # Close the HTTP response as soon as the body has been read.
    with urlopen(page_url) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, "html.parser")

    specs_urls = {}
    for block in soup.find_all("div", {"class": "product"}):
        key = block.get("id")
        link = block.find("a", {"class": "white"})
        # A missing link used to surface as TypeError (None["href"]), which
        # the former `except KeyError` did not cover.
        href = link.get("href") if link is not None else None
        if key is None or href is None:
            continue
        if "tablette" not in href:
            specs_urls[key] = urljoin(root_url, href.split('/')[-1])
    return specs_urls


def add_columns(df, columns):
    """Return a copy of ``df`` extended with empty columns.

    Args:
        df: DataFrame to extend; it is not modified.
        columns: Iterable of column names to add. Names already present in
            ``df`` are ignored.

    Returns:
        A new DataFrame with the original columns first, followed by the
        missing ones filled with NaN. New columns are appended in sorted
        order so the layout is the same on every run, even when ``columns``
        is a set.
    """
    # Remove columns that are already there; sort by string form so the
    # result does not depend on set iteration order.
    new_columns = sorted(set(columns) - set(df.columns), key=str)
    if not new_columns:
        return df.copy()

    # reindex keeps the row order and index untouched.
    extended = df.reindex(columns=[*df.columns, *new_columns])
    # Object dtype lets the new columns receive text values later on.
    return extended.astype({name: object for name in new_columns})


def get_laptops_urls(overwrite=False, n_pages=265):
    """Get links to each laptop page in a dataframe.

    The result is cached in ``data/specs_urls.csv``. When that file exists
    and ``overwrite`` is false it is loaded as is; otherwise the listing
    pages are scraped again and the file is rewritten.

    Args:
        overwrite: Scrape again even if the CSV cache already exists.
        n_pages: Number of listing pages to visit, numbered from 1.

    Returns:
        DataFrame indexed by product id with a single ``url`` column.
    """
    csv_path = 'data/specs_urls.csv'
    if os.path.exists(csv_path) and not overwrite:
        return pd.read_csv(csv_path, index_col=0)

    root_url = "http://www.comparez-malin.fr/informatique/pc-portable/{}"
    specs_urls = {}
    for page_number in tqdm(range(1, n_pages + 1)):
        page_url = root_url.format(page_number)
        specs_urls.update(get_laptop_urls_in_page(page_url))

    # Convert urls to dataframe
    df = pd.Series(specs_urls, name='url').to_frame()
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df.to_csv(csv_path)
    return df


def get_all_laptops_specs(df_laptops_urls, overwrite=False):
    """Get specs for all laptops urls.

    The result is cached in ``data/all_specs.csv``. When that file exists
    and ``overwrite`` is false it is loaded as is; otherwise every laptop
    page is downloaded and the file is (re)written.

    Args:
        df_laptops_urls: DataFrame with a ``url`` column, one row per laptop.
            It is not modified.
        overwrite: Scrape again even if the CSV cache already exists.

    Returns:
        DataFrame with the rows of ``df_laptops_urls`` and one column per
        specification name encountered. URLs of pages that yield no specs
        are printed and their rows are left unfilled.
    """
    csv_path = 'data/all_specs.csv'
    if os.path.exists(csv_path) and not overwrite:
        return pd.read_csv(csv_path, index_col=0)

    # Work on a copy so the caller's frame is never written to.
    df = df_laptops_urls.copy()
    columns = set(df.columns)

    # iterrows() keeps walking the frame that existed when the loop started,
    # even though `df` is rebound below whenever columns are added.
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Only rows with nothing but the url filled in need a download.
        if not row.drop("url").isnull().all():
            continue
        url = row["url"]
        specs = get_specs(url)
        if not specs:
            # Nothing to store for this laptop: report it and move on.
            print(url)
            continue
        specs["url"] = url
        missing = set(specs) - columns
        if missing:
            df = add_columns(df, missing)
            columns = set(df.columns)
        # A Series aligns on column names; specs absent from this laptop
        # stay NaN.
        df.loc[index] = pd.Series(specs)

    # Persist the result so the cached branch above can be used next time.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df.to_csv(csv_path)
    return df

In [None]:
def test_get_specs():
    """Check that ``get_specs`` returns exactly the expected keys.

    Downloads one known laptop page and compares the retrieved specification
    names with the reference list stored one per line in
    ``data/test_get_specs_keys.txt``.

    Raises:
        AssertionError: If an expected key is missing or an unexpected key
            is returned.
    """
    url = "http://www.comparez-malin.fr/informatique/pc-portable/asus-zenbook-3-ux390ua-gs039r.html"
    retrieved_specs_keys = set(get_specs(url))
    filename = "data/test_get_specs_keys.txt"
    # Decode explicitly so the comparison does not depend on the platform's
    # default encoding. NOTE(review): assumes the file is UTF-8 - confirm.
    with open(filename, "r", encoding="utf-8") as fp:
        true_specs_keys = {line.replace("\n", "") for line in fp}
    not_retrieved = true_specs_keys - retrieved_specs_keys
    assert len(not_retrieved) == 0, "Not retrieved: {}".format(not_retrieved)
    not_asked = retrieved_specs_keys - true_specs_keys
    assert len(not_asked) == 0, "Not asked: {}".format(not_asked)
test_get_specs()

Get links to each laptop page in a dataframe

In [None]:
# Load the laptop URL table; the listing pages are scraped only when
# data/specs_urls.csv does not exist yet.
df_laptops_urls = get_laptops_urls()

Get specs for all laptops

In [None]:
# overwrite=True skips any existing data/all_specs.csv and downloads the
# specs of every laptop again.
df = get_all_laptops_specs(df_laptops_urls, overwrite=True)