# Laptop specifications scraper
Scrape comparez-malin.fr for laptop specifications

In [10]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
import qgrid
import numpy as np
from IPython.core.interactiveshell import InteractiveShell

from laptop_scoring import scrap
from laptop_scoring.utils import scale, ease, display_laptop


# Display the value of every expression statement in a cell,
# not only the last one (IPython's default is "last_expr")
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get all specs needed

In [11]:
# Dataframe holding the link to each laptop page.
# NOTE(review): overwrite=False presumably reuses the CSV cached under data/
# (the cell output prints "Reading dataframe from ...") - confirm in `scrap`.
df_laptops_urls = scrap.get_laptops_urls(overwrite=False)

# Specifications of every laptop
# (uncomment the next line to work on a fixed random subset of 100 laptops)
#df_laptops_urls = df_laptops_urls.sample(n=100, random_state=0)
df = scrap.get_all_laptops_specs(df_laptops_urls, overwrite=False)

# Normalise column names: lower case, "_" instead of spaces, no dots or parentheses
column_names = df.columns.str.lower().str.replace(" ", "_")
for unwanted in (".", "(", ")"):
    column_names = column_names.str.replace(unwanted, "")
df.columns = column_names

# CPU benchmarks, left-joined on the processor name.
# The index is moved to a column during the merge and restored afterwards,
# because merge() would otherwise discard it.
df_cpu = scrap.get_cpu_dataframe(df["processeur"].unique(), overwrite=False)
df = df.reset_index()
df = df.merge(df_cpu, on="processeur", how="left")
df = df.set_index("index")

# GPU benchmarks, left-joined on the dedicated graphics chip name (same index handling)
df_gpu = scrap.get_gpu_dataframe(df["puce_graphique_dédiée"].unique(), overwrite=False)
df = df.reset_index()
df = df.merge(df_gpu, on="puce_graphique_dédiée", how="left")
df = df.set_index("index")

# Shorten CPU / GPU labels for easier reading (done after the merges, which
# join on the unmodified names)
df["processeur"] = df["processeur"].str.replace("Intel Core", "")
gpu_label = df["puce_graphique_dédiée"]
for vendor_word in ("Nvidia", "GeForce", "GTX"):
    gpu_label = gpu_label.str.replace(vendor_word, "")
df["puce_graphique_dédiée"] = gpu_label

Reading dataframe from data/get_laptops_urls.csv
Reading dataframe from data/get_all_laptops_specs.csv
Reading dataframe from data/get_cpu_dataframe.csv
Reading dataframe from data/get_gpu_dataframe.csv


## Processing & Cleaning

In [12]:
# Parse the raw text columns of df into numeric columns.
# NOTE: columns are converted in place, so this cell cannot be re-run on its
# own: re-execute the loading cell first, otherwise the `.str` accessors below
# are applied to columns that are already numeric.

# Prices: drop the euro sign and the spaces, use "." as decimal separator
for price_col in ("prix_public", "prix"):
    df[price_col] = (
        df[price_col]
        .str.strip("€")
        .str.replace(" ", "")
        .str.replace(",", ".")
        .astype(float)
    )

# Tokens 0, 2 and 4 of the whitespace-split "fréquence" text are stored as
# number of cores, minimum frequency and maximum frequency
df[["coeurs", "min_freq", "max_freq"]] = df["fréquence"].str.split(expand=True)[[0,2,4]].astype(float)
df["single_core_benchmark"] = df["cpu_benchmark"] / df["coeurs"]
# Keep only the leading number of these "<number> <unit>" texts
df["pdt_max"] = df["pdt_max"].str.split(expand=True)[0].astype(int)
df["mémoire_ram"] = df["mémoire_ram"].str.split(expand=True)[0].astype(int)

# Split disque_dur in sshd (bool), hdd_size, hdd_speed, ssd_size
df["sshd"] = df["disque_dur"].apply(lambda x: ("cache SSD" in x))
df["disque_dur"] = df["disque_dur"].str.replace("cache SSD", "").str.replace("(", "").str.replace(")", "")
# HDD size: first token of the text preceding "tr/min" (0 when there is no HDD)
df["hdd_size"] = df["disque_dur"].apply(lambda x: int(x.split("tr/min")[0].split()[0]) if ("tr/min" in x) else 0)
# HDD speed: last token of the text preceding "tr/min" (0 when there is no HDD)
df["hdd_speed"] = df["disque_dur"].apply(lambda x: int((x.split("tr/min")[0].strip().split()[-1])) if ("tr/min" in x) else 0)
# SSD size: last token before "Go SSD", searched after the HDD part if any (0 when there is no SSD)
df["ssd_size"] = df["disque_dur"].apply(lambda x: int(x.split("tr/min")[-1].split("Go SSD")[0].split()[-1].replace("SSD", "")) if ("Go SSD" in x) else 0)

# Screen: first token of the resolution, and the size preceding the '" ' marker
df["res_width"] = df["résolution"].str.split(expand=True)[0].astype(int)
df["taille"] = df["taille"].str.split('" ', expand=True)[0].astype(float)

# Dimensions are split on "x" only. (A previous whitespace-based parse of the
# same three columns was removed: it was overwritten right away by this one.)
df[["width", "depth", "height"]] = df["dimensions"].str.split("x", expand=True)
# Sometimes heights are written "17 - 18" so take the max
df["height"] = df["height"].str.replace(" mm", "").str.split("-", expand=True).fillna(0).astype(float).max(axis=1)
df[["width", "depth"]] = df[["width", "depth"]].astype(float)

# Weight: strip the unit ("kg" first, so that its "g" is not left behind)
df["poids"] = df["poids"].str.replace("kg", "").str.replace("g", "").str.strip().astype(float)
# Convert weights expressed in grams to kilograms
# (any value of 100 or more is assumed to be in grams)
df["poids"] = df["poids"].apply(lambda x: x if x<100 else x/1000)

# Missing values become 0 in every column, text columns included
df = df.fillna(0)

## Scoring

In [13]:
# Whether to use the official price (might not be available) or the current price
#prix = "prix_public"
prix = "prix"
# Remove laptops with no value in the selected price column (a missing price is
# 0 after the fillna(0) of the cleaning cell; for the current price this often
# means the laptop is unavailable). The scoring cell divides the total score by
# the price score, so a price of 0 would cause a division by zero whichever
# column is selected.
df = df[df[prix] != 0]

In [17]:
# Adjust the coefficients to your liking !
# List of tuples: (col_name, method, do_scale)
#   col_name: column of df to score
#   method:   function applied to each value of the column, returning its
#             contribution to the score
#   do_scale: whether the column is passed through scale() before method is applied
# The price entry is not added to the total: the total is divided by it.
# Text columns may contain 0 instead of a string (missing values were filled
# with 0 during cleaning), hence the isinstance() guards on the text-based rules.
scoring_methods = [
    (prix, lambda x: (x/800) ** 1, False),
    ("single_core_benchmark", lambda x: x * 0.8, True),
    ("coeurs", lambda x: x * 1.2, True),
    ("mémoire_ram", lambda x: x * 0.4, True),
    ("gpu_benchmark", lambda x: ease(x) * 0.4, True),
    ("min_freq", lambda x: x * 0.2, True),
    ("max_freq", lambda x: x * 0.4, True),
    ("pdt_max", lambda x: x * -0.25, True),
    ("hdd_speed", lambda x: (x==7200) * 0.2, False),
    ("sshd", lambda x: x * 0.6, False),
    ("ssd_size", lambda x: ((x>0) + ease(x))/2 * 0.4, True),
    ("poids", lambda x: x * -0.5, True),
    ("height", lambda x: -x * 1, True),
    ("taille", lambda x: (x>15.6 or x<13.3) * -2 + (x<14) * -1, False),
    ("res_width", lambda x: (x<1920) * -2 + (x>1920) * 0.3 , False),
    ("composition", lambda x: (isinstance(x, str) and (("Aluminium" in x) or ("Métal" in x))) * 1, False),
    ("marque", lambda x: (x=="Acer") * -0.5, False),  # Acer is not good quality
    ("référence", lambda x: (isinstance(x, str) and x.startswith("G")) * -0.5, False)  # Asus and MSI Gaming laptops are ugly
]

# One row per laptop; one column per scored criterion is added by the loop below
df_score = pd.DataFrame(index=df.index, columns=["score"])


# Generic methods that take one column as input
# and output its associated score
for (col_name, method, do_scale) in tqdm(scoring_methods):
    col = df[col_name]
    if do_scale:
        col = scale(col)
    df_score[col_name] = col.apply(method)

# Compute total score
df_score = df_score.fillna(0)
# Sum of every criterion except the price (and the placeholder "score" column) ...
df_score["total"] = df_score.drop(["score", prix], axis=1).sum(axis=1)
# ... divided by the price score, so that for an equal total a cheaper laptop ranks higher
df_score["total"] /= df_score[prix]
# Normalise so that the best laptop gets a score of 1
df_score["score"] = df_score["total"]/df_score["total"].max()
df["score"] = df_score["score"]
# Best laptops first; keep df_score rows in the same order as df
df = df.sort_values(by="score", ascending=False)
df_score = df_score.loc[df.index]

100%|██████████| 18/18 [00:00<00:00, 68.38it/s]


## Display results

In [18]:
# Turn off qgrid's forceFitColumns grid option
# (a 'defaultColumnWidth': 200 entry can be added to grid_options as well)
qgrid.set_defaults(grid_options={"forceFitColumns": False})

# Columns shown in the interactive grid
cols_to_show = [
    "url",
    "marque",
    "référence",
    "score",
    "taille",
    "res_width",
    "processeur",
    "puce_graphique_dédiée",
    "mémoire_ram",
    "ssd_size",
    "prix",
]
# Other columns that can be appended to the list above:
# height single_core_benchmark coeurs cpu_benchmark gpu_benchmark min_freq max_freq pdt_max hdd_speed sshd prix_public
# (inspect df_score.head() to see the per-criterion scores)
qgrid.show_grid(df[cols_to_show])

In [19]:
# Show the web page of each of the n best-ranked laptops
# (df is already sorted by decreasing score)
n = 3
iframe = '<iframe width="100%" height="350" src="{}"></iframe>'
separator = "-" * 50
for index, row in df.head(n).iterrows():
    print(separator)
    # Brand followed by model reference
    print(row["marque"], row["référence"])
    print(row["url"])
    # Per-criterion scores of this laptop, if needed: print(df_score.loc[index])
    display_laptop(row["url"])

--------------------------------------------------
Lenovo Yoga 710-14
http://www.comparez-malin.fr/informatique/pc-portable/lenovo-yoga-710-14.html


--------------------------------------------------
Asus UX501VW-FI041T
http://www.comparez-malin.fr/informatique/pc-portable/asus-zenbook-pro-ux501vw-fi041t.html


--------------------------------------------------
MSI 6RF-020XFR
http://www.comparez-malin.fr/informatique/pc-portable/msi-gs63vr-6rf-020xfr.html
