# Lib import

In [84]:
#load libs
from google.colab import files
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.warnings.filterwarnings('ignore')

# Extract a lot of HTML files

In [85]:
# One-off setup step, kept commented out: uncomment to extract html_pages.zip
# into the dataset folder (-u only creates or updates files).
# !unzip -u "/content/drive/MyDrive/Colab Notebooks/ressources/keywi/dataset/html_pages.zip" -d "/content/drive/MyDrive/Colab Notebooks/ressources/keywi/dataset/"

# Import filenames and extract hashes

In [86]:
import cv2
import glob
# Every entry of the dataset folder (HTML pages plus anything else stored there).
path = "/content/drive/MyDrive/Colab Notebooks/ressources/keywi/dataset/*"

# glob.glob() returns entries in arbitrary, filesystem-dependent order; sorting
# keeps the row order (and any random sample drawn from it) stable across runs.
filenames = sorted(glob.glob(path))

# One row per path found on disk.
filenames_df = pd.DataFrame(filenames, columns=['raw_filename'])

Remove zip file from the list

In [87]:
# Drop the archive itself. Only a trailing ".zip" is matched, so a path that merely
# contains the letters "zip" somewhere is kept. `.copy()` detaches the result from
# the original frame so that columns can be added to it safely afterwards.
filenames_df = filenames_df[~filenames_df.raw_filename.str.endswith('.zip')].copy()

Extract the hash

In [88]:
# Basename = everything after the last "/", so the extraction does not depend on
# the parent folder being called "dataset".
basenames = filenames_df.raw_filename.str.rsplit('/', n=1).str[-1]

# Strip only a trailing ".html"; the dot is escaped so it is matched literally.
# assign() returns a new frame instead of writing a column into a filtered view.
filenames_df = filenames_df.assign(
    hash=basenames.str.replace(r'\.html$', '', regex=True)
)

In [89]:
# Show the filename table to check that the hash column was extracted.
filenames_df

Unnamed: 0,raw_filename,hash
1,/content/drive/MyDrive/Colab Notebooks/ressour...,003ff5c19b37a2b79bb169b95d2e30de
2,/content/drive/MyDrive/Colab Notebooks/ressour...,0043a18428f34c7aebba1146780dc77b
3,/content/drive/MyDrive/Colab Notebooks/ressour...,00569945c8433ea0e859e78d3973690c
4,/content/drive/MyDrive/Colab Notebooks/ressour...,0057fa9ca754fb8e46a3b74f07942bdc
5,/content/drive/MyDrive/Colab Notebooks/ressour...,00837ecb034b5f4570d1e073a597f6cf
...,...,...
3227,/content/drive/MyDrive/Colab Notebooks/ressour...,ff85349e89a648edb9cd359c2378328c
3228,/content/drive/MyDrive/Colab Notebooks/ressour...,ff8f4abb330a219836b3da6fb9f66742
3229,/content/drive/MyDrive/Colab Notebooks/ressour...,ffdd799c052b7994619c51a1abe20316
3230,/content/drive/MyDrive/Colab Notebooks/ressour...,ffef4a742bffa072fe251922c9a46b30


# Import labels into df

In [90]:
# Location of the label file on the mounted Drive.
labels_csv_path = '/content/drive/MyDrive/Colab Notebooks/ressources/keywi/labels.csv'

# Load the labels into a DataFrame using pandas' default CSV parsing.
labels = pd.read_csv(labels_csv_path)

In [91]:
# Name the two columns 'hash' and 'label'; 'hash' is the key used to merge with filenames_df.
# NOTE(review): presumably the CSV header differs from these names (e.g. stray whitespace) — confirm.
labels.columns = ['hash', 'label']

# Merge partial dataset to labels

In [92]:
# Attach a label to every file whose hash appears in the label table.
# Exact duplicate (hash, label) rows are dropped first: with a repeated key the
# merge would emit the same sample several times.
# NOTE(review): the recorded outputs show more merged rows than files, which
# suggests repeated hashes in `labels` — confirm whether some pages carry several
# different labels.
dataset = pd.merge(
    left=filenames_df,
    right=labels.drop_duplicates(),
    on='hash',
    how='inner',  # pandas' default, spelled out: unlabeled files are discarded
)

In [93]:
# Show the merged table (raw_filename, hash, label).
dataset

Unnamed: 0,raw_filename,hash,label
0,/content/drive/MyDrive/Colab Notebooks/ressour...,003ff5c19b37a2b79bb169b95d2e30de,landing
1,/content/drive/MyDrive/Colab Notebooks/ressour...,0043a18428f34c7aebba1146780dc77b,article
2,/content/drive/MyDrive/Colab Notebooks/ressour...,00569945c8433ea0e859e78d3973690c,landing
3,/content/drive/MyDrive/Colab Notebooks/ressour...,0057fa9ca754fb8e46a3b74f07942bdc,landing
4,/content/drive/MyDrive/Colab Notebooks/ressour...,00837ecb034b5f4570d1e073a597f6cf,company_information
...,...,...,...
3237,/content/drive/MyDrive/Colab Notebooks/ressour...,ff85349e89a648edb9cd359c2378328c,landing
3238,/content/drive/MyDrive/Colab Notebooks/ressour...,ff8f4abb330a219836b3da6fb9f66742,landing
3239,/content/drive/MyDrive/Colab Notebooks/ressour...,ffdd799c052b7994619c51a1abe20316,article
3240,/content/drive/MyDrive/Colab Notebooks/ressour...,ffef4a742bffa072fe251922c9a46b30,company_information


In [94]:
# List the column names of the merged frame.
dataset.columns

Index(['raw_filename', 'hash', 'label'], dtype='object')

In [95]:
# Count rows per label to see how the classes are distributed.
dataset['label'].value_counts()

 company_information    1075
 landing                1012
 social                  630
 article                 521
 commercial                4
Name: label, dtype: int64

Taking only the lightest pages shows that the commercial ones tend to be really heavy, and the articles are twice as likely to be heavier than a company information or landing page.

# Focus on what we have

Let's try to predict whether a page is a landing page or not.


In [96]:
# Binary target: True when the label mentions "landing".
# regex=False: plain substring search (same matches, the pattern has no metacharacters).
# na=False: a missing label counts as "not landing" instead of producing NaN, which
# would turn the column into object dtype and match neither `== True` nor `== False`.
dataset = dataset.assign(
    is_landing=dataset.label.str.contains('landing', regex=False, na=False)
)

In [97]:
# Count landing (True) vs. non-landing (False) rows.
dataset.is_landing.value_counts()

False    2230
True     1012
Name: is_landing, dtype: int64

Restore balance by dropping half of False samples:

In [98]:
# Fixed seed so the down-sampled negatives (and the exported CSV) are reproducible.
RANDOM_STATE = 42

# All rows that are not landing pages.
negatives = dataset[dataset.is_landing == False]

# Match the number of landing pages instead of hard-coding it, and never request
# more rows than exist (sample() raises when n exceeds the population).
n_landing = int((dataset.is_landing == True).sum())
false_dataset = negatives.sample(
    n=min(n_landing, len(negatives)),
    random_state=RANDOM_STATE,
)

We can see that the "not-a-landing page" rows keep their original labels, in roughly the same proportions as in the initial dataset, thanks to the random draw made by the `sample` function. (The initial dataset is biased from the start, though, so this doesn't matter that much.)

In [99]:
# Label breakdown of the sampled non-landing rows.
false_dataset.label.value_counts()

 company_information    485
 social                 293
 article                232
 commercial               2
Name: label, dtype: int64

In [100]:
# Stack the sampled non-landing rows on top of every landing row.
balanced_dataset = pd.concat(
    (
        false_dataset,
        dataset.loc[dataset.is_landing == True],
    )
)

In [101]:
# Check the class counts after re-balancing.
balanced_dataset.is_landing.value_counts()

False    1012
True     1012
Name: is_landing, dtype: int64

# Export for next notebook

In [102]:
# Destination read by the follow-up notebook.
landing_csv_path = '/content/drive/MyDrive/Colab Notebooks/ressources/keywi/landing_dataset.csv'

# Write the balanced frame with pandas' defaults (the index is written as the first column).
balanced_dataset.to_csv(landing_csv_path)