In [1]:
#Import Required Packages

#Web Scraping Packages
import requests
from requests.utils import requote_uri
from fake_useragent import UserAgent
from lxml import html
from selenium import webdriver
from bs4 import BeautifulSoup

#Language Detection
import langdetect
from textblob import TextBlob

#Miscellaneous Packages
import pandas as pd
import random
import time
import re
from tqdm import tqdm

In [2]:
#Functions

def language_switcher(URL, lang_code):
    success_boolean = False
    try: 
        page = requests.get(URL)
    except: 
        return success_boolean, ''
    soup = BeautifulSoup(page.text, 'html.parser')
    returned_list = soup.find_all(hreflang=re.compile(lang_code), href=True)
    if (len(returned_list) == 0):
        returned_list = soup.find_all(href=True)
        for item in returned_list:
            lower_string = str(item.text).lower()
            if(any(['nl' == word for word in lower_string.split()])):
                success_boolean = True
                new_page = item['href']
                if('http' not in item['href']):
                    new_page = URL + item['href'].strip('.')
                if language_detector(new_page)[1] == 'nl':
                    return success_boolean, new_page 
        for item in returned_list:
            lower_string = str(item['href']).lower()
            if(lower_string.find('nl') != -1):
                success_boolean = True
                new_page = item['href']
                if('http' not in item['href']):
                    new_page = URL + item['href'].strip('.')
                if language_detector(new_page)[1] == 'nl':
                    return success_boolean, new_page         
        return success_boolean, ''    
    elif (len(returned_list) == 1):
        success_boolean = True
        new_page = returned_list[0]['href']
        if('http' not in returned_list[0]['href']):
            new_page = URL + returned_list[0]['href'].strip('.')
        if language_detector(new_page)[1] == 'nl':
            return success_boolean, new_page 
    elif (len(returned_list) > 1):
        success_boolean = True    
        for item in returned_list:
            new_page = item['href']
            if(item['href'].find('be') != -1):
                if('http' not in item['href']):
                    new_page = URL + item['href'].strip('.')
                if language_detector(new_page)[1] == 'nl':
                    return success_boolean, new_page
        new_page = returned_list[0]['href']
        if('http' not in returned_list[0]['href']):
            new_page = URL + returned_list[0]['href'].strip('.')
        if language_detector(new_page)[1] == 'nl':
            return success_boolean, new_page      
    else:
        return success_boolean, ''
    
def visible_texts(soup):
    re_spaces = re.compile(r'\s{3,}')
    text = ' '.join([s for s in soup.strings if s.parent.name not in ('style', 'script', 'head', 'title')])
    return re_spaces.sub(' ', text)

def language_detector(URL):
    try: 
        page = requests.get(URL, timeout=10)
        soup = BeautifulSoup(page.content, 'html.parser')
        for tag in soup.find_all('div', id=re.compile(r'(cook)|(popup)')):
            tag.decompose()
        for tag in soup.find_all('div', class_=re.compile(r'(cook)|(popup)')):
            tag.decompose()
        body_text = visible_texts(BeautifulSoup(visible_texts(soup), 'html.parser'))
        if len(soup.find_all('frame')) > 0:
            frame_text = ''
            for f in soup.find_all('frame'):
                frame_request = requests.get(f['src'])
                frame_soup =  BeautifulSoup(frame_request.content, 'html.parser')
                frame_text = frame_text + ' ' + visible_texts(BeautifulSoup(visible_texts(frame_soup), 'html.parser'))
            body_text = body_text + frame_text
        print(body_text)
        return len(body_text.split()), TextBlob(body_text).detect_language()
    except:
        return 0, 'unknown'

def crawl_contact_page(URL, Base_URL, request_page):
    new_pages = []
    soup_crawl = BeautifulSoup(request_page.text, 'html.parser')
    returned_list = soup_crawl.find_all(href=True)        
    for item in returned_list:
        lower_href_text = ''.join(str(item.text).lower().strip())
        if('cont' in lower_href_text):
            if('www' in item['href']):
                new_pages.append(item['href'])
            else:
                new_page = Base_URL + item['href'].strip('.')
                new_pages.append(new_page)
    return list(set(new_pages))

def crawl_location_page(URL, Base_URL, request_page):
    new_pages = []
    soup_crawl = BeautifulSoup(request_page.text, 'html.parser')
    returned_list = soup_crawl.find_all(href=True)        
    for item in returned_list:
        lower_href_text = ''.join(str(item.text).lower().strip())
        if(('vest' in lower_href_text) | ('loc' in lower_href_text)):
            if('www' in item['href']):
                new_pages.append(item['href'])
            else:
                new_page = Base_URL + item['href'].strip('.')
                new_pages.append(new_page)
    return list(set(new_pages))

def validate_zip(URL, Base_URL, zip_1, zip_2):
    page = requests.get(URL)
    contact_pages = crawl_contact_page(URL, Base_URL, page)
    location_pages = crawl_location_page(URL, Base_URL, page)
    total_pages = contact_pages + location_pages
    print(total_pages)
    soup = BeautifulSoup(page.text, 'lxml')
    [s.decompose() for s in soup('script')]
    all_text = ' '.join(re.sub(r'\n', ' ', soup.get_text()).split())
    numeric_text = re.findall(r'\d+', all_text)
    if (any([str(zip_1) == number for number in numeric_text]) | 
        any([str(zip_2) == number for number in numeric_text])):
        return True
    elif (len(total_pages) != 0):
        for new_page in total_pages:
            time.sleep(3)
            page = requests.get(new_page)
            soup = BeautifulSoup(page.text, 'lxml')
            [s.decompose() for s in soup('script')]
            all_text = ' '.join(re.sub(r'\n', ' ', soup.get_text()).split())
            numeric_text = re.findall(r'\d+', all_text)
            if (any([str(zip_1) == number for number in numeric_text]) | 
                any([str(zip_2) == number for number in numeric_text])):
                return True
    return False

def validate_street(URL, Base_URL, street_raw):
    page = requests.get(URL)
    contact_pages = crawl_contact_page(URL, Base_URL, page)
    location_pages = crawl_location_page(URL, Base_URL, page)
    total_pages = contact_pages + location_pages
    print(total_pages)
    soup = BeautifulSoup(page.text, 'lxml')
    [s.decompose() for s in soup('script')]
    all_text = ' '.join(re.sub(r'\n', ' ', soup.get_text()).split())
    street_raw_temp = re.sub(r'\d+', '', street_raw).strip()
    final_street = re.sub('[\(\[].*?[\)\]]', '', street_raw_temp) 
    if(final_street in all_text):
        return True
    elif (len(total_pages) != 0):
        for new_page in total_pages:
            time.sleep(3)
            page = requests.get(new_page)
            soup = BeautifulSoup(page.text, 'lxml')
            [s.decompose() for s in soup('script')]
            all_text = ' '.join(re.sub(r'\n', ' ', soup.get_text()).split())
            if(final_street in all_text):
                return True
    return False

def extract_url_from_email(Email):
    try:
        return (re.findall(r'@([A-Za-z.]+)', Email)[0]).strip()
    except: 
        return ''
    
def fix_http(URL):
    if URL != '':
        if('http' in URL) & (URL[-1:] == '/'):
            return URL
        elif ('http' in URL) & (URL[-1:] != '/'):
            return URL + '/'
        elif ('http' not in URL) & (URL[-1:] == '/'):
            return 'http://' + URL
        else:
            return 'http://' + URL + '/'
    
#Input is 4 columns; cur_email,cur_web,email,web columns
def assign_primary_URL(cur_web, cur_email, web, email):
    if not (pd.isnull(cur_web)):
        return fix_http(cur_web)
    elif not (pd.isnull(cur_email)):
        return fix_http(extract_url_from_email(cur_email))
    elif not (pd.isnull(web)):
        return fix_http(web)
    elif not (pd.isnull(email)):
        return fix_http(extract_url_from_email(email))
    else: 
        return ''    
def get_status_code(URL):
    try:
        return requests.get(URL, timeout=10).status_code
    except:
        return 0
    
def get_NL_URL(URL, status_code):
    try: 
        if status_code == 200:
            if language_detector(URL)[1] != 'nl':
                success_code, new_url = language_switcher(URL, 'nl')
                if success_code & (new_url != ''):
                    return new_url
        return URL
    except:
        return URL

In [79]:
get_NL_URL("http://www.biagroup.com/", 200)

'https://www.biagroup.com/bel-vl/'

In [9]:
requests.get('http://www.farys.be/', timeout=10).status_code

200

In [None]:
FILE = 'C:\\Users\\nusret\\Desktop\\Thesis\\Data\\inno5_address_toStatVla_cis19.xlsx'
CIS_Survey_Dataframe = pd.read_excel(FILE, sheet_name='inno5_address_toStatVla_cis19')
CIS_Survey_Dataframe.head()

In [None]:
# Get a best guess URL
pd.options.mode.chained_assignment = None  
CIS_Survey_Dataframe['best_guess'] = CIS_Survey_Dataframe.apply(lambda x: assign_primary_URL(x.cur_web, x.cur_email, x.web, x.email), axis=1)
CIS_Survey_Dataframe[100:130]

In [15]:
from tqdm import tqdm
from tqdm import tqdm_notebook

tqdm(disable=True, total=0)
if len(tqdm._instances) > 0:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
    clear_output(wait=True)

CIS_Survey_Dataframe['status_code'] = 0  

for index, row in tqdm(CIS_Survey_Dataframe.iterrows(), total=CIS_Survey_Dataframe.shape[0]):
    CIS_Survey_Dataframe.loc[index,'status_code'] = get_status_code(row['best_guess'])

100%|██████████████████████████████████████| 3179/3179 [53:33<00:00,  1.01s/it]


In [None]:
CIS_Survey_Dataframe[0:50]

In [17]:
CIS_Survey_Dataframe['status_code'].value_counts()

200    2390
0       685
403      72
404      15
500       7
503       6
406       3
999       1
Name: status_code, dtype: int64

In [18]:
CIS_Survey_Dataframe.to_pickle('CIS_Status_Code.pkl')

In [3]:
CIS_Survey_Dataframe = pd.read_pickle('CIS_Status_Code.pkl')

In [4]:
CIS_200 = CIS_Survey_Dataframe[CIS_Survey_Dataframe['status_code'] == 200]
CIS_non_200 = CIS_Survey_Dataframe[CIS_Survey_Dataframe['status_code'] != 200]

In [None]:
pd.options.mode.chained_assignment = None  
tqdm(disable=True, total=0)
if len(tqdm._instances) > 0:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
    clear_output(wait=True)

CIS_200['NL_URL'] = 0  

for index, row in tqdm(CIS_200.iterrows(), total=CIS_200.shape[0]):
    CIS_200.loc[index,'NL_URL'] = get_NL_URL(row['best_guess'], row['status_code'])

In [11]:
CIS_200.to_pickle('CIS_200NL.pkl')

In [None]:
CIS_200[1600:1650]

In [13]:
pd.options.mode.chained_assignment = None  
tqdm(disable=True, total=0)
if len(tqdm._instances) > 0:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
    clear_output(wait=True)

CIS_200['language'] = 0  

for index, row in tqdm(CIS_200.iterrows(), total=CIS_200.shape[0]):
    CIS_200.loc[index,'language'] = language_detector(row['NL_URL'])[1]
    

 88%|█████████████████████████████████▍    | 2103/2390 [49:21<06:22,  1.33s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
100%|██████████████████████████████████████| 2390/2390 [55:15<00:00,  1.39s/it]


In [14]:
CIS_200.language.value_counts()

nl         1646
en          597
unknown      81
fr           40
de           12
lb            5
co            2
hu            1
pl            1
zh-TW         1
pt            1
af            1
ja            1
sv            1
Name: language, dtype: int64

In [15]:
CIS_200.to_pickle('CIS_200NL.pkl')

In [3]:
CIS_Survey_Dataframe_NL = pd.read_pickle('CIS_200NL.pkl')
CIS_Survey_Dataframe_NON200 = pd.read_pickle('CIS_non_200.pkl')


In [5]:
CIS_Survey_Dataframe_NON200.status_code.value_counts()

0      685
403     72
404     15
500      7
503      6
406      3
999      1
Name: status_code, dtype: int64

In [None]:
manual_websites = pd.read_csv("C:\\Users\\nusret\\Desktop\\Thesis\\Data\\Manual Found Merge\\Scrape Merge_CSV.csv", sep =';')
CIS_Survey_Dataframe_Merged = pd.merge(CIS_Survey_Dataframe_NON200,
                                       manual_websites,
                                       left_on = 'btw',
                                       right_on = 'btw',
                                       how = 'left')
CIS_Survey_Dataframe_Merged

In [None]:
for index, row in CIS_Survey_Dataframe_Merged.iterrows():
    if(CIS_Survey_Dataframe_Merged.loc[index,'best_guess'] == ""):
            CIS_Survey_Dataframe_Merged.loc[index,'best_guess'] = fix_http(str(CIS_Survey_Dataframe_Merged.loc[index,'Found Website']))
CIS_Survey_Dataframe_Merged = CIS_Survey_Dataframe_Merged.iloc[:, :-2]
CIS_Survey_Dataframe_Merged

In [6]:
from tqdm import tqdm
from tqdm import tqdm_notebook

tqdm(disable=True, total=0)
if len(tqdm._instances) > 0:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
    clear_output(wait=True)

CIS_Survey_Dataframe_Merged['status_code'] = 0  

for index, row in tqdm(CIS_Survey_Dataframe_Merged.iterrows(), total=CIS_Survey_Dataframe_Merged.shape[0]):
    CIS_Survey_Dataframe_Merged.loc[index,'status_code'] = get_status_code(row['best_guess'])

100%|████████████████████████████████████████| 789/789 [22:12<00:00,  1.69s/it]


In [15]:
CIS_Survey_Dataframe_Merged.to_pickle('Merged_Final.pkl')

In [8]:
CIS_Survey_Dataframe_Merged.status_code.value_counts()

0      340
200    330
403     80
404     22
503      6
500      6
406      4
999      1
Name: status_code, dtype: int64

In [46]:
CIS_Survey_Dataframe_NL.status_code.value_counts()

200    2390
Name: status_code, dtype: int64

In [9]:
pd.options.mode.chained_assignment = None  
tqdm(disable=True, total=0)
if len(tqdm._instances) > 0:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
    clear_output(wait=True)

CIS_Survey_Dataframe_Merged['NL_URL'] = 0  

for index, row in tqdm(CIS_Survey_Dataframe_Merged.iterrows(), total=CIS_Survey_Dataframe_Merged.shape[0]):
    CIS_Survey_Dataframe_Merged.loc[index,'NL_URL'] = get_NL_URL(row['best_guess'], row['status_code'])

 84%|█████████████████████████████████▍      | 659/789 [08:04<03:04,  1.42s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
100%|████████████████████████████████████████| 789/789 [09:38<00:00,  1.36it/s]


In [13]:
pd.options.mode.chained_assignment = None  
tqdm(disable=True, total=0)
if len(tqdm._instances) > 0:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
    clear_output(wait=True)

CIS_Survey_Dataframe_Merged['language'] = 0  

for index, row in tqdm(CIS_Survey_Dataframe_Merged.iterrows(), total=CIS_Survey_Dataframe_Merged.shape[0]):
    CIS_Survey_Dataframe_Merged.loc[index,'language'] = language_detector(row['NL_URL'])[1]

 45%|█████████████████▉                      | 353/789 [11:00<24:28,  3.37s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
100%|████████████████████████████████████████| 789/789 [22:35<00:00,  1.72s/it]


In [14]:
CIS_Survey_Dataframe_Merged.language.value_counts()

unknown    350
nl         268
en         166
fr           3
id           1
de           1
Name: language, dtype: int64

In [11]:
CIS_Survey_Dataframe_Merged = pd.read_pickle('data/Merged_Final.pkl')
CIS_Survey_Dataframe_NL200 = pd.read_pickle('data/CIS_200NL.pkl')

In [None]:
CIS_Survey_Dataframe_NL200

In [13]:
CIS_Survey_Dataframe_Merged.columns

Index(['btw', 'cur_firm', 'groupStata19', 'gpnameStata19', 'inno5',
       'cur_street1', 'cur_street2', 'finalzip', 'nuts', 'regionRsp',
       'zipActivPost19', 'nutsActivPost19', 'regionactivPost19', 'cur_email',
       'cur_web', 'street1', 'street2', 'email', 'web', 'best_guess',
       'status_code', 'NL_URL', 'language'],
      dtype='object')

In [14]:
frames = [CIS_Survey_Dataframe_NL200, CIS_Survey_Dataframe_Merged]
CIS_Survey_Dataframe_Final = pd.concat(frames).reset_index()

In [15]:
CIS_Survey_Dataframe_Final
Excel_CIS_Manual_Check = CIS_Survey_Dataframe_Final[['btw', 'index', 'cur_firm', 'best_guess', 'status_code', 'NL_URL', 'language']]

In [16]:
Excel_CIS_Manual_Check.to_excel("Manual_Check_SheetBTW.xlsx")

In [21]:
language_detector("http://arcelormittal.com/")

 


(0, 'unknown')

In [None]:
ua = UserAgent()
header = {'User-Agent':str(ua.random)}
page = requests.get("https://tokheim.com/", timeout=10, headers=header)
page.text

In [10]:
ua = UserAgent()
header = {'User-Agent':str(ua.random)}
page = requests.get("https://www.digisystem.com/be/nl/", timeout=10, headers=header)
#print(page.text)
soup = BeautifulSoup(page.content, 'html.parser')
visible_texts(soup)

' Contacteer ons menu Language Global - English Benelux - Français Benelux - Nederlands Canada - English Canada - Français Deutschland - Deutsch España - Español France - Français India - English Italia – Italiano Singapore - English Türkiye - Türkçe UK - English USA - English 日本 - 日本語 中国 - 简体中文 대한민국 - 한국어 Vakgebieden Producten Oplossingen In de praktijk Duurzaamheid Over DIGI DIGI Benelux, gevestigd te Antwerpen, is een dynamisch en groeiend bedrijf dat weegschalen-, prijs- en etiketteersystemen produceert. Als marktleider leveren ze o.a. aan de retail-, industrie- en voedingsmiddelenbranche. DIGI biedt toonaangevende oplossingen voor de voedingsindustrie met een breed gamma aan producten. DIGI biedt uitgebreide oplossingen die de productiviteit verhogen in de logistieke sector. Ontdek onze weegschalen en verpakkingsoplossingen specifiek voor de logistiek! DIGI biedt innovatieve oplossingen die de activiteiten stroomlijnen en de winst vergroten in de horeca. DIGI ondersteunt al zijn p