# Laptop specifications scraper
Scrape comparez-malin.fr for laptop specifications

In [None]:
import os
import time
import json
from urllib.request import urlopen
from urllib.parse import urljoin
from urllib.error import HTTPError

from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
import qgrid
import numpy as np

## Utility functions

In [None]:
def save_and_reload_df(func):
    """
    Decorator that caches the dataframe returned by ``func`` in a CSV file.

    The cache file is ``data/<name of func>.csv``. When the file is missing,
    or when the wrapper is called with ``overwrite=True``, ``func`` is run
    and its result is written to that file. Otherwise the dataframe is read
    back from the file (first CSV column used as the index) and ``func`` is
    not called.

    The ``overwrite`` keyword is consumed by the wrapper and is never
    forwarded to ``func``; every other argument is passed through unchanged.
    """
    import functools  # local import so the block does not rely on a file-level one

    csv_path = "data/{}.csv".format(func.__name__)

    @functools.wraps(func)  # keep the wrapped function's __name__ and docstring
    def func_wrapper(*args, overwrite=False, **kwargs):
        if overwrite or not os.path.exists(csv_path):
            df = func(*args, **kwargs)
            # The cache directory may not exist yet on a fresh checkout.
            os.makedirs(os.path.dirname(csv_path), exist_ok=True)
            df.to_csv(csv_path)
        else:
            print("Reading dataframe from {}".format(csv_path))
            df = pd.read_csv(csv_path, index_col=0)
        return df
    return func_wrapper


def get_specs(url):
    """Return the specifications found on a laptop page as a dictionary.

    The page at ``url`` is downloaded and parsed, and every ``<tr>`` inside
    the element ``<div id="specs">`` is handed to ``extract_spec``. Rows for
    which ``extract_spec`` returns a non-empty key are stored as
    ``specs[key] = value``.

    Returns an empty dictionary when the page has no ``<div id="specs">``
    element, so callers can test for "no specs" without catching exceptions.
    Errors raised by ``urlopen`` are not caught here.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
        html_doc = response.read()

    specs_block = BeautifulSoup(html_doc, "html.parser").find("div", {"id": "specs"})
    if specs_block is None:
        return {}

    specs = {}
    for row in specs_block.find_all("tr"):
        key, value = extract_spec(row)
        if key:
            specs[key] = value
    return specs


def extract_spec(spec):
    """Extract a ``(key, value)`` pair from one row of the specs table.

    ``spec`` is an element exposing ``find`` (the caller passes ``<tr>``
    elements). The key is the text of the ``<th scope="row">`` cell and the
    value is the text of the first ``<td>`` cell. In both, newlines are
    replaced by spaces and surrounding whitespace is stripped; non-breaking
    spaces in the value are turned into regular spaces.

    Returns ``(None, None)`` when the row has no ``<th scope="row">`` cell or
    no ``<td>`` cell.
    """
    header = spec.find("th", {"scope": "row"})
    if not header:
        return None, None

    cell = spec.find("td")
    if cell is None:
        # A header without a value cell carries no usable spec.
        return None, None

    key = header.text.replace("\n", " ").strip()
    value = cell.text.replace("\n", " ").strip()
    value = value.replace(u'\xa0', u' ')
    return key, value


def get_laptop_urls_in_page(page_url):
    """Return the laptop page URLs listed on one listing page.

    The page at ``page_url`` is downloaded and every ``<div class="product">``
    block is inspected. For each block that has an ``id`` attribute and
    contains an ``<a class="white">`` link with an ``href``, the last path
    component of that ``href`` is joined onto the laptop root URL. Links
    whose ``href`` contains ``"tablette"`` are left out.

    Returns a dictionary mapping the block ``id`` to the resulting URL.
    Blocks missing the ``id``, the link or the ``href`` are skipped.
    """
    root_url = "http://www.comparez-malin.fr/informatique/pc-portable/"
    # Close the HTTP response as soon as the body has been read.
    with urlopen(page_url) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, "html.parser")

    specs_urls = {}
    for block in soup.find_all("div", {"class": "product"}):
        link = block.find("a", {"class": "white"})
        if link is None:
            # Subscripting None would raise TypeError, not KeyError.
            continue
        try:
            key = block["id"]
            url = link["href"]
        except KeyError:
            continue
        if "tablette" in url:
            continue
        specs_urls[key] = urljoin(root_url, url.split('/')[-1])
    return specs_urls


def add_columns(df, columns):
    """Return a copy of ``df`` extended with the given (empty) columns.

    ``columns`` is any iterable of column labels. Labels already present in
    ``df`` and repeated labels are ignored; the remaining ones are appended
    in the order in which they were given. ``df`` itself is not modified.
    """
    # Build an ordered list rather than a set: a set has no defined order
    # (so the resulting column order would vary) and pandas may refuse a
    # set as column labels.
    seen = set(df.columns)
    new_columns = []
    for column in columns:
        if column not in seen:
            seen.add(column)
            new_columns.append(column)

    df_columns = pd.DataFrame(columns=new_columns)
    return df.join(df_columns, how='outer')

@save_and_reload_df
def get_laptops_urls(n_pages=265):
    """Get links to each laptop page in a dataframe.

    Listing pages ``1`` to ``n_pages`` of the laptop category are visited
    with ``get_laptop_urls_in_page`` and the results are merged.

    Parameters
    ----------
    n_pages : int
        Number of listing pages to visit (default 265).

    Returns
    -------
    pandas.DataFrame
        One row per laptop, indexed by the product block id, with a single
        ``url`` column.

    Saving to CSV is done by the ``save_and_reload_df`` decorator, whose
    cache file name depends only on the function name: a cached result is
    reused whatever ``n_pages`` is, unless ``overwrite=True`` is passed.
    """
    root_url = "http://www.comparez-malin.fr/informatique/pc-portable/{}"
    specs_urls = {}
    for page_number in tqdm(range(1, n_pages + 1)):
        page_url = root_url.format(page_number)
        specs_urls.update(get_laptop_urls_in_page(page_url))

    # Convert urls to dataframe. No explicit to_csv here: the decorator
    # writes the file, and ``csv_path`` is not defined in this scope.
    return pd.Series(specs_urls, name='url').to_frame()

@save_and_reload_df
def get_all_laptops_specs(df_laptops_urls, overwrite=False):
    """Get specs for all laptops urls.

    For every row of ``df_laptops_urls`` whose columns other than ``url``
    are all null, the page at ``row["url"]`` is scraped with ``get_specs``
    and the row is filled with the result. Spec names not yet present as
    columns are added on the fly with ``add_columns``. URLs for which no
    spec is found are printed and their row is left untouched.

    Parameters
    ----------
    df_laptops_urls : pandas.DataFrame
        Frame with a ``url`` column. It is not modified.
    overwrite : bool
        Unused here: the ``save_and_reload_df`` wrapper consumes the
        ``overwrite`` keyword and does not forward it.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with one column per spec name encountered.
        The result is also written to ``data/all_specs.csv``.
    """
    # Work on a copy so row assignments never leak into the caller's frame.
    df = df_laptops_urls.copy()
    columns = set(df.columns)

    for index, row in tqdm(df_laptops_urls.iterrows(), total=df_laptops_urls.shape[0]):
        # Skip rows that already carry spec values.
        if not row.drop("url").isnull().all():
            continue
        url = row["url"]
        specs = get_specs(url)
        if not specs:
            print(url)
            continue
        specs["url"] = url
        missing = [key for key in specs if key not in columns]
        if missing:
            df = add_columns(df, missing)
            columns = set(df.columns)
        # A Series is aligned on the column labels when assigned to a row.
        df.loc[index] = pd.Series(specs)
    # Extra copy under a fixed name, in addition to the decorator's cache file.
    df.to_csv('data/all_specs.csv')
    return df

def get_cpu_benchmark(cpu_name):
    """Return the benchmark score shown on cpubenchmark.net for a CPU.

    The CPU page is requested with ``cpu_name`` as the ``cpu`` query
    parameter. The score is read from the first ``<span>`` inside the first
    ``<td style="text-align: center">`` cell and converted to ``int``.

    Returns ``None`` when ``cpu_name`` is not a non-empty string (e.g. a
    missing value), when the server answers with an HTTP error, or when the
    expected element is absent or does not hold an integer.
    """
    from urllib.parse import quote_plus  # local import: not in the file-level imports

    if not isinstance(cpu_name, str) or not cpu_name.strip():
        return None

    root_url = "http://www.cpubenchmark.net/cpu.php?cpu={}"
    # quote_plus turns spaces into "+" and also escapes characters that are
    # not valid in a URL query.
    url = root_url.format(quote_plus(cpu_name))
    try:
        with urlopen(url) as response:
            html_doc = response.read()
    except HTTPError:
        return None

    soup = BeautifulSoup(html_doc, "html.parser")
    # Square with perf and single thread rating
    score_cell = soup.find("td", {"style": "text-align: center"})
    score_span = score_cell.find("span") if score_cell is not None else None
    if score_span is None:
        return None
    try:
        return int(score_span.text)
    except ValueError:
        return None

@save_and_reload_df
def get_cpu_dataframe():
    """Build a dataframe of CPU names and their benchmark scores.

    The distinct values of the ``processeur`` column of the module-level
    ``df`` are looked up one by one with ``get_cpu_benchmark``; each result
    is stored in the ``cpu_benchmark`` column of the returned frame.
    """
    cpu_frame = pd.DataFrame(df["processeur"].unique(), columns=["processeur"])
    n_cpus = cpu_frame.shape[0]
    for position, cpu_row in tqdm(cpu_frame.iterrows(), total=n_cpus):
        score = get_cpu_benchmark(cpu_row["processeur"])
        cpu_frame.loc[position, "cpu_benchmark"] = score
    return cpu_frame

In [None]:
def test_get_specs():
    """Check ``get_specs`` against a reference list of spec names.

    The spec names scraped from one known laptop page must match exactly
    the names stored, one per line, in ``data/test_get_specs_keys.txt``.
    """
    url = "http://www.comparez-malin.fr/informatique/pc-portable/asus-zenbook-3-ux390ua-gs039r.html"
    found_keys = set(get_specs(url))

    filename = "data/test_get_specs_keys.txt"
    with open(filename, "r") as fp:
        expected_keys = {line.replace("\n", " ").strip() for line in fp}

    # Names listed in the reference file but absent from the scraped page.
    not_retrieved = expected_keys - found_keys
    assert len(not_retrieved) == 0, "Not retrieved: {}".format(not_retrieved)
    # Names scraped from the page but absent from the reference file.
    not_asked = found_keys - expected_keys
    assert len(not_asked) == 0, "Not asked: {}".format(not_asked)


test_get_specs()

Get links to each laptop page in a dataframe

In [None]:
# Table of laptop page URLs: scraped on the first run, then reloaded from
# the CSV cache written by the save_and_reload_df decorator.
df_laptops_urls = get_laptops_urls()

Get specs for all laptops

In [None]:
# Keep a reproducible random subset of at most 100 laptops. The size is
# capped at the number of available rows because sample() raises when asked
# for more rows than the frame holds.
df_laptops_urls = df_laptops_urls.sample(
    n=min(100, len(df_laptops_urls)), random_state=0
)

In [None]:
# Scrape (or reload from cache) the specs of the selected laptops.
df = get_all_laptops_specs(df_laptops_urls, overwrite=False)
# Normalise column names: lower case, underscores instead of spaces, no dots.
# regex=False makes both replacements literal; as a regular expression "."
# would match every character and wipe the names out.
df.columns = (
    df.columns.str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "", regex=False)
)

Get CPU benchmarks

In [None]:
# Benchmark table, one row per distinct CPU name.
df_cpu = get_cpu_dataframe()
# Left-merge the benchmarks onto the specs. The index is moved into an
# "index" column before the merge and restored afterwards, because merge
# does not keep the left frame's index.
_specs_with_index = df.reset_index()
_specs_with_cpu = _specs_with_index.merge(df_cpu, on="processeur", how="left")
df = _specs_with_cpu.set_index("index")

Processing

In [None]:
def process_prix_public(price):
    """Convert a price such as ``"1 299,00 €"`` to a float.

    All whitespace (including non-breaking and narrow non-breaking spaces
    used as thousands separators) and the euro sign are removed, and the
    decimal comma is replaced by a dot.

    Non-string input is tolerated so that missing values and already
    converted prices do not crash the column processing: ``None`` gives
    ``nan`` and any other non-string value is passed to ``float``.

    Raises ``ValueError`` when the cleaned text is not a number.
    """
    if price is None:
        return float("nan")
    if not isinstance(price, str):
        return float(price)

    # str.split() with no argument splits on any Unicode whitespace.
    cleaned = "".join(price.split())
    cleaned = cleaned.strip("€")
    cleaned = cleaned.replace(",", ".")
    return float(cleaned)

In [None]:
# Converter to apply to each raw text column, keyed by column name.
col_methods = {"prix_public": process_prix_public}

for col in col_methods:
    method = col_methods[col]
    df[col] = df[col].apply(lambda x: method(x))

# "fréquence" is split on whitespace; tokens 0, 2 and 4 are stored as floats
# in the cores / min_freq / max_freq columns.
_freq_tokens = df["fréquence"].str.split(expand=True)
df[["cores", "min_freq", "max_freq"]] = _freq_tokens[[0, 2, 4]].astype(float)

# Only the first whitespace-separated token of "pdt_max" is kept, as an int.
_pdt_first_token = df["pdt_max"].str.split(expand=True)[0]
df["pdt_max"] = _pdt_first_token.astype(int)

In [None]:
def process_disque_dur(string):
    """Parse a storage description into ``(hdd_size, hdd_speed, sshd, ssd_size)``.

    Parentheses are dropped and non-breaking spaces become regular spaces.
    Then:

    * ``sshd`` is True when the text contains ``"cache SSD"``;
    * if the text contains ``"tr/min"``, the part before its first
      occurrence gives ``hdd_size`` (first token) and ``hdd_speed`` (last
      token), both as ``int``;
    * if what follows contains ``"Go SSD"``, the token just before it gives
      ``ssd_size`` as ``int``.

    Values not found are reported as 0 (``False`` for ``sshd``). Non-string
    input, such as a missing value, gives ``(0, 0, False, 0)``.

    NOTE(review): sizes are returned as written, without unit conversion;
    confirm how capacities not expressed in "Go" should be handled.
    """
    if not isinstance(string, str):
        return 0, 0, False, 0

    string = string.replace("(", "").replace(")", "").replace(u'\xa0', u' ')
    sshd = "cache SSD" in string
    string = string.replace("cache SSD", "")

    hdd_size = 0
    hdd_speed = 0
    if "tr/min" in string:
        # partition splits on the first occurrence only, so a description
        # mentioning "tr/min" several times does not raise.
        hdd_string, _, string = string.partition("tr/min")
        tokens = hdd_string.split()
        if tokens:
            hdd_size = int(tokens[0])
            hdd_speed = int(tokens[-1])

    ssd_size = 0
    if "Go SSD" in string:
        tokens = string.split("Go SSD")[0].split()
        if tokens:
            ssd_size = int(tokens[-1])
    return hdd_size, hdd_speed, sshd, ssd_size

# Parse the "disque_dur" text of every laptop and store the four parsed
# values in their own columns, in the order returned by process_disque_dur.
_disk_columns = ("hdd_size", "hdd_speed", "sshd", "ssd_size")
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    parsed = process_disque_dur(row["disque_dur"])
    for column_name, parsed_value in zip(_disk_columns, parsed):
        df.loc[index, column_name] = parsed_value

Scoring

In [None]:
# Empty frame sharing the index of df.
# NOTE(review): the scoring cell below builds its own `df_score` (singular);
# `df_scores` is not referenced anywhere else in the visible code. Confirm
# whether this cell is still needed.
df_scores = pd.DataFrame(index=df.index)

In [None]:
# One scoring rule per column: each takes a single cell value and returns
# that column's contribution to the laptop score.
scoring_methods = {
    "cpu_benchmark": lambda value: value/4000 * 0.8,
    "min_freq": lambda value: value/2.5 * 0.2,
    "max_freq": lambda value: value/3.3 * 0.4,
    "pdt_max": lambda value: value/38 * -0.25,
    "hdd_speed": lambda value: (value==7200) * 0.2,
    "sshd": lambda value: value * 0.4,
    "ssd_size": lambda value: ((value>0) + np.sqrt(value/128)) * 0.6,
    "prix_public": lambda value: (value/800) ** 0.6
}

df_score = pd.DataFrame(index=df.index, columns=["score"])
# Fill one column of partial scores per rule.
for col in tqdm(scoring_methods):
    df_score[col] = df[col].apply(scoring_methods[col])

# Total score: sum of every partial score except the price term, divided by
# the price term, then scaled so that the best laptop scores 1.
_partial_scores = df_score.drop(["score", "prix_public"], axis=1)
df_score["score"] = _partial_scores.sum(axis=1)
df_score["score"] = df_score["score"] / df_score["prix_public"]
df_score["score"] = df_score["score"] / df_score["score"].max()

# Attach the score to the specs and list the best laptops first.
df["score"] = df_score["score"]
df = df.sort_values(by="score", ascending=False)

In [None]:
#qgrid.set_defaults(grid_options={'forceFitColumns': False})#, 'defaultColumnWidth': 200})
# Columns displayed in the interactive grid, in this order.
cols_to_show = [
    "url",
    "score",
    "cpu_benchmark",
    "min_freq",
    "max_freq",
    "pdt_max",
    "hdd_speed",
    "sshd",
    "ssd_size",
    "prix_public",
]
qgrid.show_grid(df[cols_to_show])