# Laptop specifications scraper
Scrape comparez-malin.fr for laptop specifications

In [1]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
import qgrid
import numpy as np
from IPython.core.interactiveshell import InteractiveShell

from laptop_scoring import scrap
from laptop_scoring.processing import process_and_clean
from laptop_scoring.utils import scale, ease, display_laptop, print_score


# Echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

## Get all specs needed

In [2]:
def _normalise_column_name(label):
    """Return `label` lower-cased, with spaces turned into underscores and
    the characters ".", "(" and ")" removed.

    Plain ``str`` methods are used so that every character is matched
    literally, regardless of how the installed pandas version interprets
    ``.str.replace`` patterns (regular expression vs. literal string).
    """
    label = label.lower().replace(" ", "_")
    for unwanted in ".()":
        label = label.replace(unwanted, "")
    return label


def _left_merge_keep_index(left, right, key):
    """Left-join `right` onto `left` on column `key`, then restore as index
    the labels that ``reset_index()`` saved in the "index" column.
    """
    return left.reset_index().merge(right, on=key, how="left").set_index("index")


# Get links to each laptop page in a dataframe
df_laptops_urls = scrap.get_laptops_urls(overwrite=False)

# Get specs for all laptops
#df_laptops_urls = df_laptops_urls.sample(n=100, random_state=0)
df = scrap.get_all_laptops_specs(df_laptops_urls, overwrite=False)
df.columns = [_normalise_column_name(label) for label in df.columns]

# Get CPU and GPU benchmarks and attach them to each laptop row
df_cpu = scrap.get_cpu_dataframe(df["processeur"].unique(), overwrite=False)
df = _left_merge_keep_index(df, df_cpu, "processeur")

df_gpu = scrap.get_gpu_dataframe(df["puce_graphique_dédiée"].unique(), overwrite=False)
df = _left_merge_keep_index(df, df_gpu, "puce_graphique_dédiée")

# Clean rows for easier reading: strip vendor/brand words from the chip names.
# This happens after the merges, which join on the unmodified names.
df["processeur"] = df["processeur"].str.replace("Intel Core", "")
df["puce_graphique_dédiée"] = (
    df["puce_graphique_dédiée"]
    .str.replace("Nvidia", "")
    .str.replace("GeForce", "")
    .str.replace("GTX", "")
)

# Processing and cleaning
df = process_and_clean(df)

Reading dataframe from data/get_laptops_urls.csv
Reading dataframe from data/get_all_laptops_specs.csv
Reading dataframe from data/get_cpu_dataframe.csv
Reading dataframe from data/get_gpu_dataframe.csv


## Filtering

In [3]:
# Narrow the catalogue down to the machines that fit your needs.
# The conditions are applied one after another, in the order written.
df = (
    df
    .loc[lambda d: d["gpu_benchmark"] > 500]
    .loc[lambda d: d["cpu_benchmark"] > 3000]
    .loc[lambda d: d["taille"].between(13.3, 15.6)]  # both bounds included
    .loc[lambda d: d["res_width"] >= 1920]
    .loc[lambda d: d["marque"] != "MSI"]
    .loc[lambda d: d["url"].apply(lambda address: "rog" not in address)]
)

## Scoring

In [4]:
# Whether to use the official price ("prix_public", might not be available)
# or the current price ("prix")
#prix = "prix_public"
prix = "prix"

# Remove laptops without a usable price in the selected column (for the
# current price this is often because the laptop is unavailable).
# The total score is later divided by a term computed from this price, so a
# zero or missing price would yield infinite/NaN totals and break the
# normalisation of every score. The comparison is False for NaN, so missing
# prices are dropped together with zero ones.
df = df[df[prix] > 0]

In [5]:
# Adjust the coefficients to your liking!
# List of tuples: (col_name, method, do_scale)
#   col_name: column of df that the criterion reads
#   method:   function applied to each value of that column; returns the
#             value's contribution to the score
#   do_scale: when True, the column goes through scale() before method is applied
# Positive contributions raise the total, negative ones lower it. The `prix`
# entry is the exception: it is excluded from the summed total and used as
# the divisor of that total instead.
scoring_methods = [
    # Price term (divisor of the total): 4th power of price/700, on the raw price
    (prix, lambda x: (x/700) ** 4, False),
    ("single_core_benchmark", lambda x: x * 0.6, True),
    ("coeurs", lambda x: x * 1.2, True),
    ("mémoire_ram", lambda x: x * 0.4, True),
    # GPU benchmark goes through ease() with a "log" method before weighting
    ("gpu_benchmark", lambda x: ease(x, method="log", asymetric=True) * 1.2, True),
    ("min_freq", lambda x: x * 0.2, True),
    ("max_freq", lambda x: x * 0.4, True),
    # Negative weight: a larger scaled pdt_max lowers the score
    ("pdt_max", lambda x: x * -0.25, True),
    # Flat 0.2 bonus when the raw value is exactly 7200
    ("hdd_speed", lambda x: (x==7200) * 0.2, False),
    # Raw value times 0.6 — presumably a 0/1 flag; TODO confirm
    ("sshd", lambda x: x * 0.6, False),
    # Mean of an "x > 0" indicator and ease(x), weighted 0.6.
    # NOTE(review): x is the scaled value here, so whether `x>0` still means
    # "has an SSD" depends on what scale() does — confirm
    ("ssd_size", lambda x: ((x>0) + ease(x))/2 * 0.6, True),
    # Negative weight: a larger scaled poids lowers the score
    ("poids", lambda x: x * -0.5, True),
    ("height", lambda x: -ease(x) * 0.5, True),
    # -3 outside the 13.3–15.6 range, plus a further -2 below 14 (raw value)
    ("taille", lambda x: (x>15.6 or x<13.3) * -3 + (x<14) * -2, False),
    ("screen_to_body", lambda x: x * 0.5, True),
    # -2 below 1920, +0.3 above 1920, nothing at exactly 1920
    ("res_width", lambda x: (x<1920) * -2 + (x>1920) * 0.3 , False),
    # +0.7 when the text mentions "Aluminium" or "Métal", -0.5 when it
    # mentions "Plastique"; both can apply to the same laptop
    ("composition",
     lambda x: (
            (("Aluminium" in x) or ("Métal" in x)) * 0.7
           + ("Plastique" in x) * -0.5
        ) , False),
    # Cumulative bonuses, one per keyword found in the type_c text
    ("type_c",
     lambda x: (
            ("Gen 1" in x) * 0.1
          + ("Gen 2" in x) * 0.2
          + ("Thunderbolt" in x) * 0.2
          + ("Charging" in x) * 0.2
        ), False),
    # Brand penalties: -0.5 for Acer, -2 for MSI
    ("marque", lambda x: (x=="Acer") * -0.5 + (x=="MSI") * -2, False),  # Acer is not good quality and MSI are ugly
    # -2 when the product URL contains "rog"
    ("url", lambda x: ("rog" in x) * -2, False)  # Asus ROG are ugly
]

# One row per laptop, sharing df's index. The "score" column is created
# empty here and filled in once the totals are known.
df_score = pd.DataFrame(index=df.index, columns=["score"])

# Every criterion reads a single column of df and maps it, value by value,
# to that column's contribution to the score.
for criterion, scorer, needs_scaling in tqdm(scoring_methods):
    values = scale(df[criterion]) if needs_scaling else df[criterion]
    df_score[criterion] = values.apply(scorer)

# Total = sum of all contributions except the price term, divided by the
# price term; the final score is the total relative to the best total.
df_score = df_score.fillna(0)
contributions = df_score.drop(["score", prix], axis=1)
df_score["total"] = contributions.sum(axis=1) / df_score[prix]
df_score["score"] = df_score["total"] / df_score["total"].max()

# Rank the laptops, best first, and keep df_score in the same row order
df["score"] = df_score["score"]
df = df.sort_values(by="score", ascending=False)
df_score = df_score.loc[df.index]

100%|██████████| 20/20 [00:00<00:00, 623.16it/s]


## Display results

In [6]:
#qgrid.set_defaults(grid_options={'forceFitColumns': False})#, 'defaultColumnWidth': 200})
# Columns shown in the interactive grid, in display order
cols_to_show = [
    "url",
    "marque",
    "référence",
    "score",
    "type_c",
    "taille",
    "screen_to_body",
    "res_width",
    "processeur",
    "puce_graphique_dédiée",
    "gpu_benchmark",
    "mémoire_ram",
    "ssd_size",
    "prix",
]
# Other columns that can be added to the list:
#height single_core_benchmark coeurs cpu_benchmark gpu_benchmark min_freq max_freq pdt_max hdd_speed sshd prix_public
qgrid.show_grid(df[cols_to_show])
#df_score.head()

In [7]:
# Display the web pages of the first n rows of df
n = 5
# head() yields at most n rows, so n = 0 displays nothing (a counter with a
# trailing break would always process at least one row); max() keeps a
# negative n from being read by head() as "all but the last |n| rows".
for index, row in df.head(max(n, 0)).iterrows():
    print("-" * 50)
    print(row["url"])
    print("{} {}".format(row["marque"], row["référence"]))
    # print_score(df_score.loc[index])
    display_laptop(row["url"])

--------------------------------------------------
http://www.comparez-malin.fr/informatique/pc-portable/lenovo-yoga-710-14.html
Lenovo Yoga 710-14


--------------------------------------------------
http://www.comparez-malin.fr/informatique/pc-portable/asus-r510iu-dm025t.html
Asus R510IU-DM025T


--------------------------------------------------
http://www.comparez-malin.fr/informatique/pc-portable/hp-pavilion-15-bc012nf.html
HP Y0V60EA#ABF


--------------------------------------------------
http://www.comparez-malin.fr/informatique/pc-portable/lenovo-yoga-710-14isk-80ty0016fr.html
Lenovo 80TY0016FR


--------------------------------------------------
http://www.comparez-malin.fr/informatique/pc-portable/asus-zenbook-ux410uq-gv039t.html
Asus UX410UQ-GV039T
