# Lib and labels import

In [72]:
#load libs
from google.colab import files
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.warnings.filterwarnings('ignore')

In [73]:
filename = '/content/drive/MyDrive/Colab Notebooks/ressources/keywi/labels.csv'
df = pd.read_csv(filename, sep=',', index_col=0)

In [74]:
df

Unnamed: 0_level_0,label
hash,Unnamed: 1_level_1
762da493542c9d185c7403be3f981994,landing
3102c582cd015d0588735e8de86c2ad0,landing
4e9d74b8d4e652b9f25f6cd5ed0852ca,landing
34ac0c476606be7a786f909388d7f4aa,landing
0bb84fb0c6a9b709a0d377c2a9edbbd7,landing
...,...
74785a6c9d31dcee7b8b6f9bbc43668b,company_information
2ec48f400548071fc1016c207d9904bd,company_information
cf31419363312fe2d51074df9f1f5555,company_information
df10b7f8dbe4b652ae7ca52fc8e18827,company_information


We observe balanced labels:

In [75]:
df.value_counts()

 label              
 social                 1997
 landing                1975
 company_information    1950
 commercial             1936
 article                1877
dtype: int64

When sampling the dataset, the labels are still equally balanced:

In [76]:
df.sample(frac=0.1).value_counts()

 label              
 company_information    208
 commercial             207
 landing                195
 social                 194
 article                170
dtype: int64

# Import single html (Beautiful Soup)


In [2]:
from bs4 import BeautifulSoup

In [3]:
with open("/content/drive/MyDrive/Colab Notebooks/ressources/keywi/000c8dbce2c87572ff60e83280a8bed2.html") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [4]:
text_list = []
for text in soup.find_all('p'):
  text_list.append(text.get_text().strip().lower())

In [5]:
lines = ''.join(text_list)

In [6]:
soup.title.string

'Advertising agency - Wikipedia'

# Noun extraction from HTML:

In [None]:
import nltk
nltk.download("book")

In [110]:
tokenized = nltk.word_tokenize(lines)
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if((pos[:2] == 'NN') or (pos[:2] == 'VB'))]
print (nouns)

['advertising', 'agency', 'referred', 'agency', 'ad', 'agency', 'is', 'business', 'dedicated', 'creating', 'planning', 'handling', 'advertising', 'forms', 'promotion', 'marketing', 'clients', 'ad', 'agency', 'is', 'client', 'be', 'department', 'agency', 'provides', 'point', 'view', 'effort', 'selling', 'client', 'products', 'services', 'firm', 'agency', 'handle', 'marketing', 'branding', 'strategies', 'promotions', 'clients', 'include', 'sales', 'ad', 'agency', 'clients', 'include', 'businesses', 'corporations', 'organizations', 'agencies', 'agencies', 'be', 'hired', 'produce', 'television', 'advertisements', 'radio', 'advertisements', 'advertising', 'advertising', 'marketing', 'advertising', 'part', 'advertising', 'campaign.the', 'acknowledged', 'advertising', 'agency', 'was', 'taylor', 'agency', 'started', 'james', "'jem", 'street', 'london', 'evolved', 'bull', 'holmes', 'recruitment', 'advertising', 'agency', 'went', 'business', '1980s', ']', '[', ']', 'george', 'reynell', 'officer'

# TF-IDF extraction from text (SKlearn)

We should parse several files to test it, so we start with the default example

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

print(X.shape)


(4, 9)


In [84]:
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [87]:
pd.DataFrame(X)


Unnamed: 0,0
0,"(0, 1)\t0.46979138557992045\n (0, 2)\t0.580..."
1,"(0, 5)\t0.5386476208856763\n (0, 1)\t0.6876..."
2,"(0, 4)\t0.511848512707169\n (0, 7)\t0.51184..."
3,"(0, 1)\t0.46979138557992045\n (0, 2)\t0.580..."


## Testing on the sentences of the HTML input file:

In [127]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_sentences)
vectorizer.get_feature_names_out()

array(['1960s', '1980s', 'accelerated', 'account', 'acknowledged',
       'acting', 'ad', 'administration', 'adopted', 'ads',
       'advertisement', 'advertisements', 'advertisers', 'advertising',
       'agate', 'age', 'agencies', 'agency', 'agents', 'ambrotypes',
       'america', 'are', 'artists', 'associated', 'attention',
       'audiences', 'australia', 'ayer', 'barker', 'barkers', 'be',
       'became', 'been', 'began', 'bond', 'bonner', 'brady', 'brand',
       'branding', 'breakdowns', 'breaking', 'building', 'builds', 'bull',
       'business', 'businesses', 'called', 'campaign', 'capture',
       'carlton', 'case', 'caused', 'century', 'charles', 'chemistry',
       'cities', 'city', 'client', 'clients', 'clutter', 'collaboration',
       'comes', 'commemorated', 'commissions', 'communication',
       'companies', 'company', 'consumers', 'content', 'contribute',
       'corporations', 'created', 'creating', 'creativity', 'credited',
       'daguerreotypes', 'days', 'dedicat

Too much stuff, lets try only the verbs:

In [None]:
import nltk
nltk.download("book")

In [124]:
def get_verbs_and_nouns(input_string):
  tokenized = nltk.word_tokenize(input_string)
  verb_nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if((pos[:2] == 'NN') or (pos[:2] == 'VB'))]
  return verb_nouns

new_sentences = []
for sentence in text_list:
  new_sentences.append(' '.join(get_verbs_and_nouns(sentence)))

In [125]:
new_sentences

['',
 'advertising agency referred agency ad agency is business dedicated creating planning handling advertising forms promotion marketing clients ad agency is client be department agency provides point view effort selling client products services firm agency handle marketing branding strategies promotions clients include sales',
 'ad agency clients include businesses corporations organizations agencies agencies be hired produce television advertisements radio advertisements advertising advertising marketing advertising part advertising campaign',
 "acknowledged advertising agency was taylor agency started james 'jem street london evolved bull holmes recruitment advertising agency went business 1980s ] [ ] george reynell officer london gazette set advertising agencies london ] remained family business 'reynell son is part tmp worldwide agency ireland brand tmp reynell ] agency traded was founded charles barker firm established traded 'barkers went administration",
 'volney b. palmer op

# Testing more stuff

In [91]:
lines

'an advertising agency, often referred to as a creative agency or an ad agency, is a business dedicated to creating, planning, and handling advertising and sometimes other forms of promotion and marketing for its clients. an ad agency is generally independent of the client; it may be an internal department or agency that provides an outside point of view to the effort of selling the client\'s products or services, or an outside firm. an agency can also handle overall marketing and branding strategies promotions for its clients, which may include sales as well.typical ad agency clients include businesses and corporations, non-profit organizations and private agencies. agencies may be hired to produce television advertisements, radio advertisements, online advertising, out-of-home advertising, mobile marketing, and ar advertising, as part of an advertising campaign.the first acknowledged advertising agency was william taylor in 1786. another early agency, started by james \'jem\' white i

In [101]:
#remove stop words

from nltk.corpus import stopwords
from nltk import word_tokenize

stop = set(stopwords.words('english'))

# lines
text_wihtout_stop_words = ' '.join([i for i in word_tokenize(lines.lower()) if i not in stop])

In [108]:
text_wihtout_stop_words

"advertising agency , often referred creative agency ad agency , business dedicated creating , planning , handling advertising sometimes forms promotion marketing clients . ad agency generally independent client ; may internal department agency provides outside point view effort selling client 's products services , outside firm . agency also handle overall marketing branding strategies promotions clients , may include sales well.typical ad agency clients include businesses corporations , non-profit organizations private agencies . agencies may hired produce television advertisements , radio advertisements , online advertising , out-of-home advertising , mobile marketing , ar advertising , part advertising campaign.the first acknowledged advertising agency william taylor 1786. another early agency , started james 'jem ' white 1800 fleet street , london , eventually evolved white bull holmes , recruitment advertising agency , went business late 1980s . [ 1 ] [ 2 ] 1812 george reynell , 

In [102]:
from nltk.stem import WordNetLemmatizer 


# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()
word_list = nltk.word_tokenize(text_wihtout_stop_words)
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
print(lemmatized_output)

advertising agency , often referred creative agency ad agency , business dedicated creating , planning , handling advertising sometimes form promotion marketing client . ad agency generally independent client ; may internal department agency provides outside point view effort selling client 's product service , outside firm . agency also handle overall marketing branding strategy promotion client , may include sale well.typical ad agency client include business corporation , non-profit organization private agency . agency may hired produce television advertisement , radio advertisement , online advertising , out-of-home advertising , mobile marketing , ar advertising , part advertising campaign.the first acknowledged advertising agency william taylor 1786. another early agency , started james 'jem ' white 1800 fleet street , london , eventually evolved white bull holmes , recruitment advertising agency , went business late 1980s . [ 1 ] [ 2 ] 1812 george reynell , officer london gazett

In [109]:
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])
doc = nlp(text_wihtout_stop_words)
print(" ".join([token.lemma_ for token in doc]))

advertising agency , often refer creative agency ad agency , business dedicate creating , planning , handle advertising sometimes form promotion marketing client . ad agency generally independent client ; may internal department agency provide outside point view effort sell client 's product service , outside firm . agency also handle overall marketing brand strategy promotion client , may include sale well.typical ad agency client include business corporation , non - profit organization private agency . agency may hire produce television advertisement , radio advertisement , online advertising , out - of - home advertising , mobile marketing , ar advertising , part advertising campaign.the first acknowledge advertising agency william taylor 1786 . another early agency , start james ' jem ' white 1800 fleet street , london , eventually evolve white bull holmes , recruitment advertising agency , go business late 1980 . [ 1 ] [ 2 ] 1812 george reynell , officer london gazette , set anoth

# Conclusion

* we can extract nouns from an html file
* we can compute their relative frequencies

# Next steps

* Select part of the dataset to look at specific words => do all landing pages contain all the same words ?

* How to select part of this dataset ? Only one class or multiple ?