## DOM feature test
This document serves as a sanity check for the implementation of the DOM feature extractor. It acts as a literate unit test for the feature extraction functions we wrote.

### Constants
We will begin by defining the constants.

In [1]:
# Constants
# Location of the raw CSV produced by the first extraction; the path is
# relative, so it is resolved against the notebook's working directory.
FIRST_RAW_FILENAME = '../data/raw/first-ecommerce.csv'

### Implementation

In [2]:
%matplotlib inline

import sys, os

# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# add the library path
sys.path.append(os.path.join(os.getcwd(), "../src"))
from features import extract_features_from_html, extract_features_from_df

# Plot styling (purely personal preference): 'ticks' style, 'Set2' palette,
# less chartjunk.
# sns.set() also resets the plotting context to its defaults (font_scale=1),
# so it has to run BEFORE set_context(); in the opposite order the font
# scaling requested below would be thrown away.
sns.set(style='ticks', palette='Set2')
# Larger fonts and thicker lines. The rc key is 'lines.linewidth' (plural);
# seaborn silently drops context keys it does not recognise, so a misspelt
# key such as 'line.linewidth' has no effect.
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

In [3]:
# Read the raw extraction CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=FIRST_RAW_FILENAME)
df.head(n=5)

Unnamed: 0,html,url
0,"<!DOCTYPE html><html lang=""ro"" class=""""><head>...",https://www.emag.ro/resigilate/placi_video/c?r...
1,"<!DOCTYPE html><html xml:lang=""ro"" lang=""ro"" c...",https://www.emag.ro/resigilate/ventilatoare-pc...
2,"<!DOCTYPE html><html xmlns:og=""http://ogp.me/n...",https://www.olx.ro/auto-masini-moto-ambarcatiu...
3,"<!DOCTYPE html><html lang=""ro"" class=""""><head>...",https://www.emag.ro/resigilate
4,"<!DOCTYPE html><html xml:lang=""ro"" lang=""ro"" c...","https://www.emag.ro/label/pret,intre-200-si-50..."


In [4]:
# Smoke-test the single-page extractor on the HTML stored in the first row.
# NOTE(review): the two trailing 2s presumably select how many descendant and
# ancestor levels get summarised -- confirm against the `features` module.
first_page_html = df['html'][0]
feats = extract_features_from_html(first_page_html, 2, 2)
# Peek at five rows from the middle of the frame instead of the very top.
feats.iloc[100:105]

Unnamed: 0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes,descendant1_no_nodes,descendant1_no_children_avg,...,ancestor1_classes,ancestor2_depth,ancestor2_sibling_pos,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes,path
100,10,0,i,3,False,0,False,"[em, em-toys, megamenu-list-department__icon]",0,0.0,...,[gtm_31vgamc],8,11,li,2,False,1,False,"[megamenu-list-department, js-megamenu-list-de...",/html/body/div[2]/div[1]/div/div[1]/ul/li[12]/a/i
101,10,1,span,1,False,0,True,[megamenu-list-department__department-name],0,0.0,...,[gtm_31vgamc],8,11,li,2,False,1,False,"[megamenu-list-department, js-megamenu-list-de...",/html/body/div[2]/div[1]/div/div[1]/ul/li[12]/...
102,8,12,li,2,False,1,False,"[megamenu-list-department, js-megamenu-list-de...",1,2.0,...,[megamenu-list],6,0,div,1,False,1,False,[megamenu-list-container],/html/body/div[2]/div[1]/div/div[1]/ul/li[13]
103,9,0,a,1,False,2,False,[gtm_31vgamc],2,0.0,...,"[megamenu-list-department, js-megamenu-list-de...",7,0,ul,1,False,13,False,[megamenu-list],/html/body/div[2]/div[1]/div/div[1]/ul/li[13]/a
104,10,0,i,3,False,0,False,"[em, em-supermarket, megamenu-list-department_...",0,0.0,...,[gtm_31vgamc],8,12,li,2,False,1,False,"[megamenu-list-department, js-megamenu-list-de...",/html/body/div[2]/div[1]/div/div[1]/ul/li[13]/a/i


In [5]:
# Ancestor features: keep only the columns whose name contains 'ancestor',
# then look at the last five rows.
ancestor_feats = feats.filter(like='ancestor', axis=1)
ancestor_feats.tail(n=5)

Unnamed: 0,ancestor1_depth,ancestor1_sibling_pos,ancestor1_tag,ancestor1_no_classes,ancestor1_has_id,ancestor1_no_children,ancestor1_has_text,ancestor1_classes,ancestor2_depth,ancestor2_sibling_pos,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes
3527,2,1,body,0,False,128,False,[],1,0,html,0,False,2,False,[]
3528,2,1,body,0,False,128,False,[],1,0,html,0,False,2,False,[]
3529,2,1,body,0,False,128,False,[],1,0,html,0,False,2,False,[]
3530,2,1,body,0,False,128,False,[],1,0,html,0,False,2,False,[]
3531,3,127,div,0,False,1,False,[],2,1,body,0,False,128,False,[]


In [6]:
# Descendant features: keep only the columns whose name contains 'descendant',
# then look at the first five rows.
descendant_feats = feats.filter(like='descendant', axis=1)
descendant_feats.head(n=5)

Unnamed: 0,descendant1_no_nodes,descendant1_no_children_avg,descendant1_has_id_avg,descendant1_no_classes_avg,descendant1_has_text_avg,descendant1_classes,descendant1_tags,descendant2_no_nodes,descendant2_no_children_avg,descendant2_has_id_avg,descendant2_no_classes_avg,descendant2_has_text_avg,descendant2_classes,descendant2_tags
0,2,83.0,0.0,0.0,0.0,[],"[head, body]",166,0.042169,0.042169,0.662651,0.108434,"[sr-only, main-container-outer, em-vars, em-va...","[meta, title, meta, meta, meta, link, meta, me..."
1,38,0.0,0.0,0.0,0.105263,[],"[meta, title, meta, meta, meta, link, meta, me...",0,0.0,0.0,0.0,0.0,[],[]
2,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]
3,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]
4,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]


In [7]:
# Node-level ("normal") features: whatever is left once every descendant and
# ancestor column has been dropped from the full feature frame.
non_node_feats = [*descendant_feats.columns, *ancestor_feats.columns]
node_feats = feats.drop(columns=non_node_feats)
# Same middle-of-the-frame window as used for the full feature preview.
node_feats.iloc[100:105]

Unnamed: 0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes,path
100,10,0,i,3,False,0,False,"[em, em-toys, megamenu-list-department__icon]",/html/body/div[2]/div[1]/div/div[1]/ul/li[12]/a/i
101,10,1,span,1,False,0,True,[megamenu-list-department__department-name],/html/body/div[2]/div[1]/div/div[1]/ul/li[12]/...
102,8,12,li,2,False,1,False,"[megamenu-list-department, js-megamenu-list-de...",/html/body/div[2]/div[1]/div/div[1]/ul/li[13]
103,9,0,a,1,False,2,False,[gtm_31vgamc],/html/body/div[2]/div[1]/div/div[1]/ul/li[13]/a
104,10,0,i,3,False,0,False,"[em, em-supermarket, megamenu-list-department_...",/html/body/div[2]/div[1]/div/div[1]/ul/li[13]/a/i


In [8]:
# Run the batch extractor over a positional slice of the raw frame
# (rows 1 through 19), using the same two depth arguments as the
# single-page call.
pages_subset = df.iloc[1:20]
extract_features_from_df(pages_subset, 2, 2)

Unnamed: 0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes,descendant1_no_nodes,descendant1_no_children_avg,...,ancestor2_depth,ancestor2_sibling_pos,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes,path,url
0,1,0,html,1,False,2,False,[lang_ro],2,39.0,...,0,0,,0,False,0,False,[],/html,https://www.emag.ro/resigilate/ventilatoare-pc...
1,2,0,head,0,False,51,False,[],51,0.0,...,0,0,,0,False,0,False,[],/html/head,https://www.emag.ro/resigilate/ventilatoare-pc...
2,3,0,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/meta[1],https://www.emag.ro/resigilate/ventilatoare-pc...
3,3,1,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/meta[2],https://www.emag.ro/resigilate/ventilatoare-pc...
4,3,2,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/meta[3],https://www.emag.ro/resigilate/ventilatoare-pc...
5,3,3,html_comment,0,False,0,True,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/comment()[1],https://www.emag.ro/resigilate/ventilatoare-pc...
6,3,4,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/meta[4],https://www.emag.ro/resigilate/ventilatoare-pc...
7,3,5,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/meta[5],https://www.emag.ro/resigilate/ventilatoare-pc...
8,3,6,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/meta[6],https://www.emag.ro/resigilate/ventilatoare-pc...
9,3,7,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],/html/head/meta[7],https://www.emag.ro/resigilate/ventilatoare-pc...
