In [None]:
#!pip install azure-storage-blob
#!python -m nltk.downloader stopwords
#!python -m nltk.downloader punkt
#!pip install fake_useragent
#!pip install selenium
#!pip install webdriver_manager
#!apt install chromium-chromedriver
#!cp /usr/lib/chromium-browser/chromedriver /usr/bin

### <span style="color:#FF00FF">Import libraries</span>

In [None]:
import html
import json
import os
import re
import string
import time
import unicodedata
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
from bs4 import BeautifulSoup
from bs4.element import Comment
from fake_useragent import UserAgent
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def configure_driver():
    """Create a headless Chrome WebDriver that presents a random user agent.

    A user-agent string is drawn from ``fake_useragent.UserAgent`` and printed
    so the run can be traced back to it.

    Returns:
        The ``webdriver.Chrome`` instance, started with the ``--headless``,
        ``--no-sandbox`` and ``--disable-dev-shm-usage`` switches.
    """
    user_agent = UserAgent().random
    print(user_agent)

    options = Options()
    # A Chrome switch has to be written as ``name=value`` with no spaces
    # around "="; the previous 'user-agent = ...' form was not a well-formed
    # switch, so the random user agent was not applied.
    options.add_argument(f'--user-agent={user_agent}')
    for switch in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        options.add_argument(switch)

    # NOTE(review): passing the driver path positionally is only accepted by
    # older Selenium releases -- confirm against the installed version.
    return webdriver.Chrome('chromedriver', options=options)

### <span style="color:#FF00FF">Define Parameters</span>

In [None]:
# HTTP headers for plain `requests` calls (not referenced by the visible cells).
headers = {
    'app_client': 'consumer_web',
    'content-type': 'application/json',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'accept': '*/*',
}

# Search URL template; the '$' placeholder is replaced with the query text.
url = 'https://www.google.com/search?q=$'

# Query-string parameters (not referenced by the visible cells).
params = {
    "q": "",
    "hl": "en",
    "gl": "us",
}

# Phrases appended to each company name to widen the search.
sub_search = [
    'about us',
    'company profile',
    'leadership team',
    'holding patterns',
    'diverse certificate',
    'public company',
]

# Blob Storage Connection
company_list = "dnbcompanieslist.csv"
# Legacy alias: the original variable name starts with a non-ASCII "ç" and is
# still referenced by later cells, so it is kept pointing at the same value.
çompany_list = company_list
# Prefer a connection string supplied through the environment so the secret
# never has to be typed into the notebook; the placeholder literal is kept as
# the fallback so existing behaviour is unchanged when the variable is unset.
connect_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "END_POINT_API_KEY")
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client("googlesearch")

### <span style="color:#FF00FF">Custom Google Search</span>

In [None]:
# This is a class to do googlesearch. 
class googlesearch:
    """Collect Google result links for a query and the visible text of each page.

    Page loads go through the module-level ``details_driver`` and the search
    URL is built from the module-level ``url`` template, so both must exist
    before any method that loads a page is called.
    """

    def __init__(self):
        pass

    def google_search(self, query):
        """Return the non-Google, non-PDF links found on the result page for ``query``.

        Args:
            query: Free-text search string. Runs of non-word characters are
                collapsed to single spaces before the URL is built.

        Returns:
            list[str]: link targets in page order (duplicates possible).
        """
        clean_query = re.sub(r'\W+', ' ', query).strip()
        # URL-encode the query so spaces and non-ASCII letters form a valid URL.
        search_url = url.replace('$', quote_plus(clean_query))

        details_driver.get(search_url)
        # Same explicit parser as get_pagecontent, so results do not depend on
        # which optional parser libraries happen to be installed.
        soup = BeautifulSoup(details_driver.page_source, 'html.parser')

        links = []
        for anchor in soup.find_all("a", href=re.compile("(htt.*://.*)")):
            # Keep only the part before a ":" that is directly followed by "http".
            target = re.split(":(?=http)", anchor["href"])[0]
            if 'google' not in target and not target.endswith('pdf'):
                links.append(target)
        return links

    def get_links(self, query):
        """Search for ``query`` alone and combined with every ``sub_search`` phrase.

        Args:
            query: Base search string (e.g. a company name).

        Returns:
            list[str]: unique links, in order of first appearance.
        """
        queries = [query] + [query + " " + suffix for suffix in sub_search]
        links = []
        for search_term in queries:
            try:
                links.extend(self.google_search(search_term))
            except Exception as e:
                # Best effort: one failed search must not abort the others,
                # but the failure is reported instead of silently dropped.
                print('Exception raised', e)
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(links))

    def tag_visible(self, element):
        """Return False for comments and for text inside non-rendered tags."""
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def text_from_html(self, body):
        """Join the stripped visible text nodes of ``body`` with single spaces.

        Args:
            body: Parsed document exposing ``find_all`` (a ``BeautifulSoup``
                object in this notebook).

        Returns:
            str: the visible text, or an empty string if there is none.
        """
        texts = body.find_all(string=True)
        visible_texts = filter(self.tag_visible, texts)
        return u" ".join(t.strip() for t in visible_texts)

    def get_pagecontent(self, query):
        """Fetch every link found for ``query`` and extract its visible text.

        Args:
            query: Base search string passed on to ``get_links``.

        Returns:
            list[list]: one ``[source, query, link, text]`` row per page that
            loaded successfully; ``source`` is always ``'google'``.
        """
        context = []
        time.sleep(2)  # short pause before the batch of searches
        links = self.get_links(query)
        source = 'google'
        for link in tqdm(links):
            try:
                details_driver.get(link)
                print(link)
                soup = BeautifulSoup(details_driver.page_source, 'html.parser')
                text = self.text_from_html(soup)
                context.append([source, query, link, text])
            except Exception as e:
                # Skip pages that fail to load or parse and carry on.
                print('Exception raised', e)
        return context

### <span style="color:#FF00FF">Get Company List</span>

In [None]:
# Download the company-list blob to a local file of the same name.
bc = container_client.get_blob_client(blob=çompany_list)
data = bc.download_blob()
with open(çompany_list, "wb") as f:
    data.readinto(f)

# The file is read back with the cp1252 encoding.
companies = pd.read_csv(çompany_list, encoding='cp1252')

# First 10 company names
company_names = companies["dunsName"].iloc[:10]


### <span style="color:#FF00FF">Web Scraping</span>

In [None]:
details_driver = configure_driver()
# Give up on any page that takes longer than 20 seconds to load.
details_driver.set_page_load_timeout(20)

columns = ['source','search','link','content']

search = googlesearch()

# NOTE(review): this replaces the names loaded from the company list with a
# single hard-coded company -- confirm whether that is still intended.
company_names = ['Wong Potatoes, Inc']

# Collect plain rows and build the frame once: DataFrame.append was removed in
# pandas 2.0 and growing a frame inside a loop is quadratic anyway.
records = []
try:
    for company in tqdm(company_names):
        records.extend(search.get_pagecontent(company))
finally:
    # Always shut the browser down so no headless Chrome process is leaked;
    # rows gathered before an interruption remain available in `records`.
    details_driver.quit()

df = pd.DataFrame(records, columns=columns)

In [None]:
#df_bak = df.copy()

### <span style="color:#800000">Data Cleaning</span>

In [None]:
# This is a class to get clean text. 
class preprocessing:
    """Text clean-up pipeline: strip markup, normalise, tokenise, drop stop words.

    ``data_preprocessing`` is the entry point; the other methods are the
    individual steps and can be called on their own.
    """

    # Patterns and tables are built once instead of on every call.
    _TAG_RE = re.compile(r'<.*?>')
    _WHITESPACE_RE = re.compile(r'\s+')
    _NON_WORD_RE = re.compile(r'[\W_]+')
    _NEWLINES_RE = re.compile(r'\n+')
    _PUNCT_TABLE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self):
        # English stop words, loaded lazily on first use and then reused.
        self._stop_words = None

    def remove_html_tags(self, text):
        """Remove every ``<...>`` span (shortest match, within one line)."""
        return self._TAG_RE.sub('', text)

    def escaping_html_char(self, doc):
        """Convert HTML character references such as ``&lt;`` to characters."""
        return html.unescape(doc)

    def textcleaning(self, doc):
        """Replace escaped line breaks with "." and collapse whitespace.

        The two-character sequences backslash-n and backslash-r (as they appear
        in escaped text) become "."; real whitespace, including newlines, is
        collapsed to single spaces.
        """
        a = doc.replace("\\n", ".").strip()
        a = a.replace("\\r", ".").strip()
        return self._WHITESPACE_RE.sub(" ", a)

    def text_norm(self, doc):
        """Lower-case the text."""
        return doc.lower()

    def decode_text(self, doc):
        """Strip accents (e.g. "é" -> "e") and drop remaining non-ASCII characters."""
        return unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('ascii')

    def text_tokenize(self, doc):
        """Split the text into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(doc)

    def remove_stopwords(self, words):
        """Return ``words`` without English stop words, order preserved.

        Args:
            words: Iterable of tokens (compared case-sensitively).

        Returns:
            list: the tokens that are not stop words.
        """
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            # Loaded once per instance rather than once per document.
            stop_words = self._stop_words = set(stopwords.words('english'))
        return [w for w in words if w not in stop_words]

    def remove_punctuation(self, doc):
        """Replace punctuation and other non-word runs (and "_") with one space."""
        cleandoc = doc.translate(self._PUNCT_TABLE)
        return self._NON_WORD_RE.sub(' ', cleandoc)

    def questions_clean(self, text):
        """Collapse consecutive newlines into one and trim both ends."""
        return self._NEWLINES_RE.sub('\n', text).strip()

    def data_preprocessing(self, doc):
        """Run the full pipeline on ``doc`` and return the cleaned text.

        Args:
            doc: Any value; it is converted with ``str`` first.

        Returns:
            str: remaining tokens joined by single spaces.
        """
        doc = str(doc)

        step1 = self.remove_html_tags(doc)            # Cleaning and stripping HTML
        step2 = self.escaping_html_char(step1)        # Unescaping character references (&lt; ...)
        step3 = self.textcleaning(step2)              # Removing newlines & extra spaces
        step4 = self.text_norm(step3)                 # Case normalization
        step5 = self.remove_punctuation(step4)        # Remove punctuation
        step6 = self.decode_text(step5)               # Removing accented characters
        step7 = self.text_tokenize(step6)             # Tokenization
        step8 = self.remove_stopwords(step7)          # Stop words

        return " ".join(step8)

### <span style="color:#800000">Pre Processing</span>

In [None]:
# Clean every scraped page and store the result in a new column.
pre = preprocessing()
df['content_clean'] = df['content'].map(pre.data_preprocessing)

### <span style="color:#800000">Azure Blob - Upload Files</span>

In [None]:
comp_lst = df['search'].unique()

for comp in comp_lst:
    tmp_df = df.loc[df['search'] == comp]
    # The cleaned company name is used as the blob-name prefix ("folder").
    comp = pre.data_preprocessing(comp)
    for i, (index, row) in enumerate(tmp_df.iterrows()):
        # Round-trip through json so the uploaded document is indented.
        data = json.dumps(json.loads(row.to_json()), indent=4)

        blobname = '{0}/content_{1}.json'.format(comp, i)
        container_client.upload_blob(name=blobname, data=data, overwrite=True)

### <span style="color:#800000">Azure Blob - Delete Files</span>

In [None]:
# Delete every blob whose name does not contain 'dnb'; blobs that do are only
# listed. Note: this also removes the JSON files written by the upload cell.
for blob in container_client.list_blobs():
    if 'dnb' in blob.name:
        print(blob.name)
    else:
        container_client.delete_blob(blob.name)

dnbcompanieslist.csv
