In [1]:
#Import Packages

from arango import ArangoClient
from IPython.display import clear_output
import getpass
import pandas as pd
import re
import nltk
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

#Web Scraping Packages
import requests
from requests.utils import requote_uri
from fake_useragent import UserAgent
from lxml import html
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

In [2]:
# Database Connection
# Prompts for credentials, opens the 'Flemish_Business_Websites' database on the
# local ArangoDB server and exposes the handles used by every later cell:
#   FBW                - database handle
#   BusinessCollection - the 'Businesses' collection

username = input('Please enter your username to connect DB: ')
password = getpass.getpass('Please enter your password to connect DB: ')
try:
    client = ArangoClient(hosts='http://localhost:8529')
    # verify=True makes the driver contact the server right away, so wrong
    # credentials or an unreachable host fail here instead of in a later cell.
    FBW = client.db('Flemish_Business_Websites', username=username, password=password, verify=True)
    BusinessCollection = FBW.collection('Businesses')
    clear_output(wait=True)
    print('Successfully connected to the ArangoDB')
except Exception:
    # The builtin ConnectionError previously caught here is not what the driver
    # raises, so the warning could never be shown. Report and re-raise: later
    # cells cannot run without FBW / BusinessCollection anyway.
    clear_output(wait=True)
    print('Warning: Please check your credentials and try to connect again!')
    raise
finally:
    # Drop the plaintext password from the kernel namespace on every path.
    del password

Successfully connected to the ArangoDB


In [8]:
#Required Functions

#Get Visible Text
def visible_texts(soup):
    """Return the text of ``soup`` that is not inside style/script/head/title.

    Every string of the parsed tree whose direct parent is not one of the
    excluded tags is kept; the kept strings are joined with single spaces and
    any run of three or more whitespace characters is collapsed to one space.
    """
    excluded_parents = ('style', 'script', 'head', 'title')
    kept_strings = []
    for fragment in soup.strings:
        if fragment.parent.name not in excluded_parents:
            kept_strings.append(fragment)
    joined = ' '.join(kept_strings)
    return re.sub(r'\s{3,}', ' ', joined)

#Clean Text + Get if any other frames
def clean_pop_cookie_frame(raw_text):
    """Extract visible text from raw HTML, minus cookie/popup <div>s.

    Steps:
      1. Parse ``raw_text`` and remove every <div> whose ``id`` or ``class``
         matches ``cook`` or ``popup``.
      2. Take the visible text, parse that text as HTML once more and take the
         visible text again (presumably to drop markup that survived as plain
         text - TODO confirm).
      3. For every <frame> element, download its ``src`` (10 s timeout) and
         append that page's visible text, extracted the same way.

    Frame retrieval is best-effort: a frame that cannot be fetched or parsed
    is skipped, and text already collected from other frames is kept.

    Parameters
    ----------
    raw_text : str or bytes
        HTML source of the page.

    Returns
    -------
    str
        Visible page text followed by the text of reachable frames, stripped.
    """
    soup = BeautifulSoup(raw_text, 'html.parser')
    banner_pattern = re.compile(r'(cook)|(popup)')
    for tag in soup.find_all('div', id=banner_pattern):
        tag.decompose()
    for tag in soup.find_all('div', class_=banner_pattern):
        tag.decompose()
    body_text = visible_texts(BeautifulSoup(visible_texts(soup), 'html.parser'))

    frame_text = ''
    for frame in soup.find_all('frame'):
        try:
            frame_request = requests.get(frame['src'], timeout=10)
            frame_soup = BeautifulSoup(frame_request.content, 'html.parser')
            frame_text = frame_text + ' ' + visible_texts(BeautifulSoup(visible_texts(frame_soup), 'html.parser'))
        except Exception:
            # Skip only this frame (missing/relative src, network error, ...).
            # The accumulated text must not be reset here, otherwise one bad
            # frame throws away the text of every frame processed before it.
            continue
    return (body_text + frame_text).strip()

def lower_punct_number_clean(text, lower_bound_letter_length):
    """Keep only ASCII-letter words of a minimum length, lower-cased.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (so ``farm-raised`` becomes ``farmraised``). The text is then split on
    whitespace, words shorter than ``lower_bound_letter_length`` are dropped,
    and the rest are joined with single spaces and lower-cased.

    Newlines, tabs and other whitespace act as word separators. They used to
    be deleted together with punctuation, which glued neighbouring words
    together (``"Home\\nAbout"`` -> ``"homeabout"``).

    NOTE(review): non-ASCII letters (e.g. accented Dutch/French characters)
    are still removed, as before.

    Parameters
    ----------
    text : str
        Input text.
    lower_bound_letter_length : int
        Minimum number of letters a word needs to be kept.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).
    """
    letters_and_spaces = re.sub(r'[^A-Za-z\s]+', '', text)
    kept_words = [
        word for word in letters_and_spaces.split()
        if len(word) >= lower_bound_letter_length
    ]
    return ' '.join(kept_words).lower()

def language_detector(text):
    """Return the language code ``detect`` reports for ``text``, or None.

    Detection is best-effort: if ``detect`` raises (for example on text it
    cannot extract any features from), ``None`` is returned instead of
    propagating the error. KeyboardInterrupt / SystemExit are no longer
    swallowed, so a long batch run can still be interrupted.

    NOTE(review): langdetect documents its results as non-deterministic unless
    ``DetectorFactory.seed`` is set - confirm whether a fixed seed is wanted
    for reproducible "Language" values.
    """
    try:
        return detect(text)
    except Exception:
        return None

english_stopwords = stopwords.words('english')
dutch_stopwords = stopwords.words('dutch')
# Set copies for constant-time membership tests; the lists above are kept
# because other cells may reference them by name.
_english_stopword_set = frozenset(english_stopwords)
_dutch_or_english_stopword_set = _english_stopword_set | frozenset(dutch_stopwords)


def remove_stopwords(text, lang):
    """Remove stopwords from whitespace-separated ``text``.

    Parameters
    ----------
    text : str
        Text whose words are separated by whitespace.
    lang : str or None
        Language code of the text. ``'nl'`` removes Dutch and English
        stopwords, ``'en'`` removes English stopwords only.

    Returns
    -------
    str or None
        The remaining words joined by single spaces, or ``None`` when ``lang``
        is neither ``'nl'`` nor ``'en'``.
    """
    if lang == 'nl':
        blocked = _dutch_or_english_stopword_set
    elif lang == 'en':
        blocked = _english_stopword_set
    else:
        return None
    return ' '.join(word for word in text.split() if word not in blocked)

english_stemmer = SnowballStemmer(language='english')
dutch_stemmer = SnowballStemmer(language='dutch')


def stem_text(text, lang):
    """Stem every whitespace-separated word of ``text``.

    Parameters
    ----------
    text : str or None
        Text to stem; ``None`` is passed through as ``None``.
    lang : str or None
        ``'nl'`` selects the Dutch Snowball stemmer, ``'en'`` the English one.

    Returns
    -------
    str or None
        The stemmed words joined by single spaces, or ``None`` when ``text``
        is ``None`` or ``lang`` is neither ``'nl'`` nor ``'en'``.
    """
    if text is None:
        return None
    if lang == 'nl':
        stemmer = dutch_stemmer
    elif lang == 'en':
        stemmer = english_stemmer
    else:
        return None
    return ' '.join(stemmer.stem(word) for word in text.split())
    
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    ``None`` is passed through as ``None`` so the result of a step that
    produced no text can be forwarded unchanged. The check uses identity
    (``is None``) rather than ``==`` so it cannot be fooled by objects with a
    custom ``__eq__``.
    """
    if text is None:
        return None
    return len(text.split())

In [9]:
#Trial for one website: clean the stored "HTML_R1_NOV20" snapshot of document "426019644"
clean_pop_cookie_frame(BusinessCollection["426019644"]["HTML_R1_NOV20"])

"Skip to content Search Menu Home About Products People Sustainability Investors Share and bond Reports Presentations Corporate governance Stock exchange filings Financial calendar Resources IR contact Contact News Search 01 02 03 04 05 Mowi ranked most sustainable protein producer Mowi has been ranked as the world’s most sustainable protein producer for the second year in a row by the FAIRR Initiative. Read more 01 02 03 04 05 Operational EBIT of EUR 80 million in the third quarter for Mowi The results of the third quarter are published. Read more 01 02 03 04 05 Mowi’s soy suppliers commit to 100% deforestation free supply chain Read more here 01 02 03 04 05 Mowi collaborates with X, Alphabet’s innovation engine, to make salmon farming more sustainable Mowi works with Tidal, X’s ocean health and sustainability team, to test and research new technologies. Read more here 01 02 03 04 05 We are the world’s largest supplier of farm-raised salmon Mowi harnesses nature to produce nutritious,

In [19]:
# Derive "Clean_Text" for every business from its HTML snapshot, preferring
# "HTML_R1_NOV20" and falling back to "HTML_R2_NOV20".
# count=True makes cursor.count() return the result size, so tqdm gets a real
# total (without it the total is None and no progress percentage is shown).
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
for document in tqdm(cursor, total=cursor.count()):
    # .get() tolerates documents that carry only one of the two snapshot
    # fields; indexing "HTML_R1_NOV20" directly raised KeyError for them.
    raw_html = document.get("HTML_R1_NOV20")
    if raw_html is None:
        raw_html = document.get("HTML_R2_NOV20")
    if raw_html is not None:
        document["Clean_Text"] = clean_pop_cookie_frame(raw_html)
        # Write back only documents that actually changed.
        BusinessCollection.update(document)

3179it [03:50, 13.80it/s] 


In [8]:
#Trial for one website: remove punctuation/digits, lowercase, and drop words shorter than 3 letters
lower_punct_number_clean(BusinessCollection["426019644"]["Clean_Text"], 3)

'skip content search menu home about products people sustainability investors share and bond reports presentations corporate governance stock exchange filings financial calendar resources contact contact news search mowi ranked most sustainable protein producer mowi has been ranked the worlds most sustainable protein producer for the second year row the fairr initiative read more operational ebit eur million the third quarter for mowi the results the third quarter are published read more mowis soy suppliers commit deforestation free supply chain read more here mowi collaborates with alphabets innovation engine make salmon farming more sustainable mowi works with tidal ocean health and sustainability team test and research new technologies read more here are the worlds largest supplier farmraised salmon mowi harnesses nature produce nutritious tasty and supreme quality food from the ocean result our ongoing innovation and sustainable development are the worlds largest supplier farmraise

In [9]:
#Trial language detector - needed for stopword treatment
#(remove_stopwords and stem_text pick their word list / stemmer from the detected language code)
language_detector(lower_punct_number_clean(BusinessCollection["426019644"]["Clean_Text"], 3))

'en'

In [66]:
# Normalise "Clean_Text" in place: letters only, lower-case, words of at least
# 3 letters. The stored value is overwritten with its normalised form.
# count=True makes cursor.count() return the result size so tqdm shows a total.
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
for document in tqdm(cursor, total=cursor.count()):
    if "Clean_Text" in document:
        document["Clean_Text"] = lower_punct_number_clean(document["Clean_Text"], 3)
        BusinessCollection.update(document)

3179it [27:49,  1.90it/s]


In [94]:
# Store the detected language of "Clean_Text" in the "Language" field
# (None when language_detector could not decide).
# count=True makes cursorx.count() return the result size so tqdm shows a total.
cursorx = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
for document in tqdm(cursorx, total=cursorx.count()):
    if "Clean_Text" in document:
        document["Language"] = language_detector(document["Clean_Text"])
        BusinessCollection.update(document)

3179it [03:11, 16.60it/s] 


In [96]:
# Tally how many documents were assigned each language code.
# count=True makes cursor.count() return the result size so tqdm shows a total.
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
language_list = [
    document["Language"]
    for document in tqdm(cursor, total=cursor.count())
    if "Language" in document
]
clear_output(wait=True)
print(pd.Series(language_list).value_counts())

nl    1973
en     882
fr      25
af       9
de       8
da       2
it       2
ro       2
es       1
hr       1
lt       1
pl       1
sv       1
ca       1
sk       1
pt       1
sl       1
cy       1
dtype: int64


In [11]:
#Filtering stopwords for one website, using the language code stored on the same document
remove_stopwords(BusinessCollection["426019644"]["Clean_Text"], BusinessCollection["426019644"]["Language"])

'skip content search menu home products people sustainability investors share bond reports presentations corporate governance stock exchange filings financial calendar resources contact contact news search mowi ranked sustainable protein producer mowi ranked worlds sustainable protein producer second year row fairr initiative read operational ebit eur million third quarter mowi results third quarter published read mowis soy suppliers commit deforestation free supply chain read mowi collaborates alphabets innovation engine make salmon farming sustainable mowi works tidal ocean health sustainability team test research new technologies read worlds largest supplier farmraised salmon mowi harnesses nature produce nutritious tasty supreme quality food ocean result ongoing innovation sustainable development worlds largest supplier farmraised salmon satisfying one fifth global demand products consistently delivering delicious nutritious healthy seafood thanks worldleading innovative sustainabl

In [66]:
#Stemming after removing stopwords (one website; each BusinessCollection[...] lookup reads the document again)
stem_text(remove_stopwords(BusinessCollection["426019644"]["Clean_Text"], BusinessCollection["426019644"]["Language"]), BusinessCollection["426019644"]["Language"])

'skip content search menu home product peopl sustain investor share bond report present corpor govern stock exchang file financi calendar resourc contact contact news search mowi rank sustain protein produc mowi rank world sustain protein produc second year row fairr initi read oper ebit eur million third quarter mowi result third quarter publish read mowi soy supplier commit deforest free suppli chain read mowi collabor alphabet innov engin make salmon farm sustain mowi work tidal ocean health sustain team test research new technolog read world largest supplier farmrais salmon mowi har natur produc nutriti tasti suprem qualiti food ocean result ongo innov sustain develop world largest supplier farmrais salmon satisfi one fifth global demand product consist deliv delici nutriti healthi seafood thank worldlead innov sustain approach offer wide select healthi delici respons produc seafood brand product join blue revolut mowi peopl core busi employe world largest salmon farm compani alway

In [78]:
#Count number of words of the stopword-filtered, stemmed text for one website
#(the word count is what the batch loop uses to drop short documents; that loop keeps texts with >= 20 words)
count_words(stem_text(remove_stopwords(BusinessCollection["426019644"]["Clean_Text"], BusinessCollection["426019644"]["Language"]), BusinessCollection["426019644"]["Language"]))

322

In [19]:
# Build "Final_Text_IR04": the stopword-filtered "Clean_Text" of Dutch/English
# documents that still contain at least 20 words (no stemming in this variant).
# count=True makes cursor.count() return the result size so tqdm shows a total.
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
for document in tqdm(cursor, total=cursor.count()):
    if ("Clean_Text" in document) and ("Language" in document):
        # remove_stopwords returns None for languages other than 'nl' / 'en'.
        temp_text = remove_stopwords(document["Clean_Text"], document["Language"])
        if (temp_text is not None) and (count_words(temp_text) >= 20):
            document["Final_Text_IR04"] = temp_text
            # Only documents that received the field need to be written back.
            BusinessCollection.update(document)

3179it [00:51, 61.23it/s] 


In [22]:
# Show the stored "Final_Text_IR01" of one website.
# NOTE(review): no visible cell writes "Final_Text_IR01" (the loop above writes
# "Final_Text_IR04"); presumably produced by an earlier run of that loop with
# stemming enabled - confirm, otherwise this fails on a fresh database.
BusinessCollection["426019644"]["Final_Text_IR01"]

'skip content search menu home product peopl sustain investor share bond report present corpor govern stock exchang file financi calendar resourc contact contact news search mowi rank sustain protein produc mowi rank world sustain protein produc second year row fairr initi read oper ebit eur million third quarter mowi result third quarter publish read mowi soy supplier commit deforest free suppli chain read mowi collabor alphabet innov engin make salmon farm sustain mowi work tidal ocean health sustain team test research new technolog read world largest supplier farmrais salmon mowi har natur produc nutriti tasti suprem qualiti food ocean result ongo innov sustain develop world largest supplier farmrais salmon satisfi one fifth global demand product consist deliv delici nutriti healthi seafood thank worldlead innov sustain approach offer wide select healthi delici respons produc seafood brand product join blue revolut mowi peopl core busi employe world largest salmon farm compani alway

In [89]:
# Count the documents that carry a "Final_Text_IR01" field.
# count=True makes cursor.count() return the result size so tqdm shows a total.
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
count_n = sum(
    1 for document in tqdm(cursor, total=cursor.count())
    if "Final_Text_IR01" in document
)
clear_output(wait=True)
print("Total Websites in the Final Set: ", count_n)

Total Websites in the Final Set:  2778


In [23]:
# Count the documents that carry a "Final_Text_IR04" field.
# count=True makes cursor.count() return the result size so tqdm shows a total.
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
count_n = sum(
    1 for document in tqdm(cursor, total=cursor.count())
    if "Final_Text_IR04" in document
)
clear_output(wait=True)
print("Total Websites in the Final Set: ", count_n)

Total Websites in the Final Set:  2698


### Sensitivity Analysis Data Creation

In [4]:
# Derive "Clean_Text_SA" (the un-normalised visible text used as the starting
# point of the sensitivity analysis) from the HTML snapshot, preferring
# "HTML_R1_NOV20" and falling back to "HTML_R2_NOV20".
# count=True makes cursor.count() return the result size so tqdm shows a total.
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
for document in tqdm(cursor, total=cursor.count()):
    # .get() tolerates documents that carry only one of the two snapshot
    # fields; indexing "HTML_R1_NOV20" directly raised KeyError for them.
    raw_html = document.get("HTML_R1_NOV20")
    if raw_html is None:
        raw_html = document.get("HTML_R2_NOV20")
    if raw_html is not None:
        document["Clean_Text_SA"] = clean_pop_cookie_frame(raw_html)
        # Write back only documents that actually changed.
        BusinessCollection.update(document)

3179it [03:29, 15.17it/s]


In [69]:
#Sensitivity Parameters

# Name of the document field the sensitivity loop writes its result to.
processing_id = "SA16"
# When True, the loop stems the text with stem_text before storing it.
stem_bool = False
# Minimum word length (in letters) passed to lower_punct_number_clean.
remove_sequence_length = 2
# When True, the loop removes stopwords with remove_stopwords.
stopwords_bool = False
# Minimum number of words the loop compares count_words(...) against.
minimum_document_length = 20

In [70]:
# Build one sensitivity-analysis variant of the text and store it under the
# field named by `processing_id`, driven by the parameters defined above
# (remove_sequence_length, stopwords_bool, minimum_document_length, stem_bool).
# count=True makes cursor.count() return the result size so tqdm shows a total.
cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', count=True, ttl=5000)
for document in tqdm(cursor, total=cursor.count()):
    if ("Clean_Text_SA" not in document) or ("Language" not in document):
        continue
    language = document["Language"]
    candidate_text = lower_punct_number_clean(document["Clean_Text_SA"], remove_sequence_length)
    if stopwords_bool:
        # Returns None for languages other than 'nl' / 'en'.
        candidate_text = remove_stopwords(candidate_text, language)
    # The field used to be assigned before this check, so documents that were
    # too short (or had no usable text) were stored anyway and
    # minimum_document_length had no effect. Skip them instead.
    if (candidate_text is None) or (count_words(candidate_text) < minimum_document_length):
        continue
    if stem_bool:
        # Also returns None for languages other than 'nl' / 'en'.
        candidate_text = stem_text(candidate_text, language)
        if candidate_text is None:
            continue
    document[processing_id] = candidate_text
    # NOTE: a value stored under the same processing_id by an earlier run is
    # not removed when the document no longer passes the filters.
    BusinessCollection.update(document)

3179it [00:41, 76.78it/s] 


In [17]:
# Show the "SA01" variant stored for one website.
# NOTE(review): presumably written by the sensitivity loop run with
# processing_id = "SA01"; that run is not part of the visible cells - confirm.
BusinessCollection["421138663"]["SA01"]

'servic aanvrag offert fir woodlin urban lin pur lin classic lin interior woodlin urban lin pur lin classic lin referenties showrom professional contact fir woodlin urban lin pur lin classic lin interior woodlin urban lin pur lin classic lin referenties showrom professional contact servic aanvrag fir interior concept person virtueel bezoek mak afsprak bel fir interior concept warm thuis begint gezell interieur puntjes afgewerkt ingebouwd maatwerkkast mat gemaakt multimediawand ecologisch haard complet renovaties keuken slap suitebadkamer uniek person sfer daarvor zorgt vuyst fir interior concept fir interior wij ontwerp sam fir biedt houthaard gashaard gegot mooi design prachtig vormgev mat gemaakt interieur interior strev voortdur perfect balan tuss functionaliteit design gloednieuw atelier bereid plaatsing thuis grondig rek verder onz hog afwerkingsgrad proactiev onderhoudsservic eig hersteldienst fir interior referenties bekijk all referenties vuyst begrep meten wij will won volled 

In [62]:
# Show the "SA12" variant stored for one website.
# NOTE(review): presumably written by the sensitivity loop run with
# processing_id = "SA12"; that run is not part of the visible cells - confirm.
BusinessCollection["421138663"]["SA12"]

'service aanvraag offerte fire woodline urban line pure line classic line interior woodline urban line pure line classic line referenties showrooms professional over ons contact fire woodline urban line pure line classic line interior woodline urban line pure line classic line referenties showrooms professional over ons contact service aanvraag fire interior concepts een persoonlijk virtueel bezoek maak een afspraak bel fire interior concepts een warme thuis begint met een gezellig interieur dat tot puntjes afgewerkt van ingebouwde maatwerkkasten maat gemaakte multimediawanden ecologische haarden tot complete renovaties van keukens slaap suitebadkamers alles unieke persoonlijke sfeer daarvoor zorgt vuyst fire interior concepts fire interior wat doen wij ontwerpen samen met fire biedt houthaarden gashaarden aan gegoten mooie designs voor prachtige vormgeving van maat gemaakte interieur moet bij interior zijn streven voortdurend naar een perfecte balans tussen functionaliteit design ons 

In [71]:
# Show the "SA16" variant stored for one website ("SA16" is the processing_id
# set in the sensitivity parameter cell).
BusinessCollection["421138663"]["SA16"]

'service aanvraag offerte fire woodline urban line pure line classic line interior woodline urban line pure line classic line referenties showrooms professional over ons contact fire woodline urban line pure line classic line interior woodline urban line pure line classic line referenties showrooms professional over ons contact service aanvraag fire interior concepts een persoonlijk virtueel bezoek maak een afspraak of bel fire interior concepts een warme thuis begint met een gezellig interieur dat tot in de puntjes is afgewerkt van ingebouwde maatwerkkasten op maat gemaakte multimediawanden en ecologische haarden tot complete renovaties van keukens of slaap en en suitebadkamers alles in uw unieke en persoonlijke sfeer daarvoor zorgt de vuyst fire interior concepts fire interior wat doen we wij ontwerpen samen met fire biedt houthaarden en gashaarden aan gegoten in mooie designs voor de prachtige vormgeving van uw op maat gemaakte interieur moet bij interior zijn we streven voortdurend

### Transformer Dataset Raw Output

In [72]:
def database_to_transformers(revision):
    """Collect (text, innovation) pairs from the Businesses collection.

    Every document that has both the field named by ``revision`` and an
    "Innovation" field contributes one row, in cursor order.

    Parameters
    ----------
    revision : str
        Name of the document field that holds the text to export.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (the ``revision`` field) and ``innovation``
        (the "Innovation" field).
    """
    cursor = FBW.aql.execute('FOR doc IN Businesses RETURN doc', ttl=5000)
    pairs = [
        (document[revision], document["Innovation"])
        for document in cursor
        if revision in document and "Innovation" in document
    ]
    text = [pair[0] for pair in pairs]
    innovation = [pair[1] for pair in pairs]
    return pd.DataFrame({'text': text, 'innovation': innovation})

# Export the "Clean_Text_SA" text together with the "Innovation" field to CSV.
transformer_dataset = database_to_transformers(revision='Clean_Text_SA')
transformer_dataset.to_csv(path_or_buf="transformer_dataset_raw.csv")