# Packages

In [1]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import os,re
from urllib.parse import urljoin
from datetime import datetime
from time import sleep
import pandas as pd

# URL

In [2]:
# Landing page of the site to scrape. It is opened with driver.get(url) and is
# also the base against which relative hrefs are resolved via urljoin(url, ...).
url = 'https://clutch.co/'

# Saving File Path

In [3]:
# file = "OutData"+datetime.now().strftime('%Y-%m-%d %H%M%S')
# Name of the directory that receives every exported file.
file = 'OutData'

# This cell changes the working directory, so it must be safe to re-run:
# only create and enter the directory when the kernel is not already inside it,
# otherwise a second execution would nest OutData/OutData.
if os.path.basename(os.getcwd()) != file:
    os.makedirs(file, exist_ok=True)
    os.chdir(file)

# Field separator used for the intermediate .tsv files ('@@' rather than a tab).
sep = '@@'

# Chrome properties

In [4]:
# Downloads triggered by the browser go to the current working directory.
download_path = os.getcwd()
prefs = {'download.default_directory':download_path}

# Chrome start-up options carrying the download preference above.
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', prefs)
# ChromeDriverPath = r'../chromedriver.exe'


# Opening Chrome 

In [5]:
# webdriver_manager's install() returns the chromedriver path that is handed to
# Selenium's Service wrapper.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)

In [6]:
# Use the full screen so the rendered layout matches what the XPaths expect.
driver.maximize_window()
# The page-load timeout is configured in the "Setting Page load time" cell below;
# the former 3-second value set here was overridden before any page was loaded
# and only caused driver.get() to time out when that cell was skipped.
webActions = ActionChains(driver)

# Setting Page load time

In [7]:
# Very generous limits so slow pages never abort the scrape.
# NOTE(review): Selenium's Python bindings document both values as seconds,
# so these are ~55 h and ~5.5 h — confirm milliseconds were not intended.
PAGE_LOAD_TIMEOUT = 200000
SCRIPT_TIMEOUT = 20000

driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
driver.set_script_timeout(SCRIPT_TIMEOUT)

In [8]:
# Open the landing page; the cells below read the domain links from its rendered DOM.
driver.get(url)

# Helpers to get data

In [9]:
def getBeautifulsoapData(xpath, parser='html.parser'):
    '''Return the inner HTML of the element at `xpath` as a BeautifulSoup tree.

    Parameters
    ----------
    xpath : str
        XPath handed to driver.find_element; Selenium raises if it matches nothing.
    parser : str, optional
        Parser name passed to BeautifulSoup. Naming it explicitly avoids the
        "no parser was explicitly specified" warning and keeps the parse result
        the same regardless of which optional parsers are installed.

    Returns
    -------
    BeautifulSoup
        Parsed copy of the element's current innerHTML.
    '''
    element = driver.find_element("xpath", xpath)
    inner_html = driver.execute_script("return arguments[0].innerHTML;", element)
    return BeautifulSoup(inner_html, parser)

In [10]:
def cleanText(text):
    '''Turn a menu label into a plain string usable as a dict key / file-name part.

    '&' becomes ' and ', '/' becomes ' or ', tabs are removed, and finally only
    the fragments matching the word pattern below are kept and concatenated.
    Because each kept fragment needs at least two characters, single-character
    words are dropped, and a run of two spaces between words loses the space.
    '''
    ampersand_parts = (piece.strip(' ') for piece in text.split('&'))
    text = ' and '.join(ampersand_parts)

    slash_parts = (piece.strip(' ') for piece in text.split('/'))
    text = ' or '.join(slash_parts)

    text = text.replace('\t','')

    # A fragment is: word boundary, one letter/digit/underscore/whitespace, then 1+ word chars.
    kept = [match.group(0) for match in re.finditer(r'\b[a-zA-Z0-9_\s]\w+', text)]
    return ''.join(kept)

# Extracting URLs of Tech Domains

In [11]:
# Parsed HTML of the landing-page container whose <div> children are walked by
# the next cell to collect domain and sub-domain links.
# NOTE(review): this absolute XPath is tied to the current page layout;
# find_element raises if the site structure changes — confirm it still matches.
list_data = getBeautifulsoapData('/html/body/main/article/section[2]/div/div[2]')

In [20]:
# To store all Tech Domain URLs: {domain name: {sub-domain name: absolute URL}}
data_urls = {}

for data in list_data.findAll('div'):
    button = data.find('button')
    nav = data.find('nav')

    # findAll('div') is recursive, so it can also yield wrapper or decorative
    # divs. Only a div holding both the domain <button> and its <nav> of links
    # describes a domain; skipping the rest avoids AttributeError on None.
    if button is None or nav is None:
        continue

    # Filtering out special chars
    domains = cleanText(button.text)

    # Accessing links of each sub Domain; relative hrefs are resolved against `url`
    subDomains_urls = {
        cleanText(sub_url.text): urljoin(url, sub_url.get('href'))
        for sub_url in nav.findAll('a')
    }

    # Adding all links
    data_urls[domains] = subDomains_urls

In [16]:
# One comma-joined line: the label followed by every top-level domain name.
print(','.join(["All Tech Domains : ", *data_urls]))

All Tech Domains : ,Development,Design and Production,Marketing,Advertising,Business Services,IT Services


In [21]:
# Preview only: for every domain show its first sub-domain link and the file
# name pattern filled in for it.
# NOTE(review): the pattern here ends in .csv while the export cell further down
# writes .tsv files — confirm which one is intended.
for outerDomain, dataUrls in data_urls.items():
    file_name = "clutch_{}_{}.csv"
    # Slice to the first key instead of breaking out of the loop.
    for innerurl in list(dataUrls)[:1]:
        print(dataUrls[innerurl], innerurl)
        print(file_name.format(outerDomain, innerurl))
        

https://clutch.co/directory/mobile-application-developers MobileApp Development
clutch_Development_MobileApp Development.csv
https://clutch.co/web-designers Web Design
clutch_Design and Production_Web Design.csv
https://clutch.co/agencies/digital-marketing Digital Marketing
clutch_Marketing_Digital Marketing.csv
https://clutch.co/agencies Advertising
clutch_Advertising_Advertising.csv
https://clutch.co/call-centers Call Centers
clutch_Business Services_Call Centers.csv
https://clutch.co/it-services IT Services
clutch_IT Services_IT Services.csv


In [18]:
def getMaxPages()->int:
    '''To get max pages for the technology

    Reads the pagination <nav> under the element with id "providers" on the
    page currently loaded in `driver` and returns the `data-page` value of the
    link inside its last <li>.

    Returns 0 when the pagination <nav> is absent, so that a caller looping
    over range(maxPages + 1) still visits page 0 instead of crashing.
    NOTE(review): a missing nav presumably means a single-page listing — confirm.
    '''
    # Local import keeps this cell self-contained; selenium is already a
    # dependency of the notebook.
    from selenium.common.exceptions import NoSuchElementException

    try:
        pagination = getBeautifulsoapData('//*[@id="providers"]/nav')
    except NoSuchElementException:
        print("No pagination found, assuming a single page")
        return 0

    last_item = pagination.findAll('li')[-1]
    return int(last_item.find('a').get('data-page'))

In [19]:
def _fieldOrNA(getter, default='NA'):
    '''Run `getter` and return its value, or `default` when the lookup fails.

    `default` is returned both when `getter` raises (missing tag, too-short
    list, unparsable number) and when it returns None (missing attribute), so
    the result is never None.
    '''
    try:
        value = getter()
    except Exception:
        # Best-effort scraping: one missing field must not abort the page.
        return default
    return default if value is None else value


def nextPage(SoupData,fileName ):
    '''used to scrap data of page

    Appends one line per company listing found in `SoupData` to `fileName`
    (opened in append mode, UTF-8). Fields are joined with the module-level
    `sep` in this order: position, title, website, location, rating, reviews,
    hourly rate, minimum project size, employee size. Any field that cannot be
    read is written as 'NA'.

    Parameters
    ----------
    SoupData : BeautifulSoup
        Parsed HTML containing <li data-is-list="true"> listing items.
    fileName : str
        Path of the output file; its header line is written by the caller.
    '''
    with open(fileName, 'a',encoding='utf-8') as datewriter:
        for d in SoupData.findAll( 'li', attrs = {'data-is-list':'true'}):

            # Hourly rate ([0]) and employee size ([1]) use the same CSS class,
            # so the lookup is done once and indexed below.
            popovers = d.findAll('div', attrs = {'class' : 'list-item custom_popover'})

            # Tag.get() returns None for a missing attribute instead of raising,
            # which is why _fieldOrNA also maps None to 'NA'; otherwise
            # sep.join() would fail with a TypeError.
            row = [
                # position in the listing
                _fieldOrNA(lambda: d.get('data-position')),
                # company name
                _fieldOrNA(lambda: d.get('data-title')),
                # website link
                _fieldOrNA(lambda: d.find('a', attrs = {'class':'website-link__item'}).get('href')),
                # Location
                _fieldOrNA(lambda: d.find('span' , attrs = {'class':'locality'}).text),
                # rating of company (round-tripped through float to validate it)
                _fieldOrNA(lambda: str(float(d.find('span', attrs = {'class':'rating sg-rating__number'}).text))),
                # reviews count
                _fieldOrNA(lambda: d.find('a' ,attrs = {'data-link_text':"Reviews Count"}).text.replace('\n','').replace('\r', '').replace('\t', '').strip(' ')),
                # Hourly Rate
                _fieldOrNA(lambda: popovers[0].find('span').text.strip(' ')),
                # Mini Project size
                _fieldOrNA(lambda: d.find('div',attrs = {'class' : 'list-item block_tag custom_popover'}).find('span').text),
                # Employee size
                _fieldOrNA(lambda: popovers[1].find('span').text),
            ]

            datewriter.write(sep.join(row)+'\n')
#             break

In [25]:
def getLinkData(domainUrl, fileName, pageLimit=20):
    '''Scrape the listing pages of one sub-domain and append the rows to `fileName`.

    Parameters
    ----------
    domainUrl : str
        Listing URL of the sub-domain; pages are requested as `?page=<i>`.
    fileName : str
        Output file that nextPage() appends to.
    pageLimit : int or None, optional
        Upper bound applied to the page number reported by getMaxPages().
        Pages 0..limit are fetched inclusively, so the default of 20 requests
        up to 21 pages. Pass None to fetch every page.
    '''
    # Deleting all cookies to avoid varification for robot
    driver.delete_all_cookies()

    # Each Tech domain url (loaded first so the pagination can be read)
    driver.get(domainUrl)

    # Max pages in each Tech domain
    lastPage = getMaxPages()
    maxPages = lastPage if pageLimit is None else min(pageLimit, lastPage)

    print("looping over Pages : ",maxPages)
    print("Started scrapy for :" ,domainUrl)

    # Looping over all pages in Tech Domain.
    for i in range(maxPages+1):

        driver.delete_all_cookies()
        print(i,end = ',')

        # generating url for next page; urljoin replaces only the query part
        driver.get(urljoin(domainUrl,'?page={}'.format(i)))

        # gathering data of Each page
        SoupData = getBeautifulsoapData('//*[@id="providers"]/div[2]')

        # passing it to next page function to get required data from that page
        nextPage(SoupData,fileName)

    print("\nEnded scrapy for :" ,domainUrl)
    print('\n\n')
#         break

    
    

In [22]:
'''For scrapying Entire Data'''
# Header row shared by every export; the order matches the row that nextPage() writes.
columns = ['Postion', 'title','website','location','rating','reviews','hourlyRate','miniProjSize','empSize']

for outerDomain, dataUrls in data_urls.items():

    # Walking all subDomain links of this domain
    for innerurl, innerLink in dataUrls.items():

        # Generating file according to domain and subdomain
        file_name = "clutch_{}_{}.tsv".format(outerDomain, innerurl)

        print(innerLink, innerurl)
        print(os.path.join(os.getcwd(),file_name))

        # writing columns to a file; 'w' starts the file afresh, and the
        # encoding matches the UTF-8 used by nextPage() when appending rows
        with open(file_name, 'w', encoding='utf-8') as w:
            w.write(sep.join(columns)+'\n')

        # to get Data of subDomain
        getLinkData(innerLink, file_name)
#         break

'For scrapying Entire Data'

In [69]:
# Every .tsv export present in the current working directory.
files_tsv = list(filter(lambda entry: entry.endswith('.tsv'), os.listdir()))

In [68]:
# converting files to xlsx format.
for i in files_tsv:
    # Swap only the extension; str.replace would also rewrite a '.tsv' that
    # happens to appear in the middle of the name.
    xlsx_name = os.path.splitext(i)[0] + '.xlsx'
    try:
        # sep is longer than one character, hence the python parsing engine.
        pd.read_csv(i,sep = sep,engine='python').to_excel(xlsx_name, engine="xlsxwriter", index = False)
        # The source file is removed only after the workbook was written.
        os.remove(i)
    except Exception as exc:
        # Keep converting the remaining files, but report why this one failed.
        # Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        print(i, exc)

In [64]:
# Optional recovery cell: the commented-out code below re-runs the scrape for a
# single top-level domain (set OutteDomain to one of the data_urls keys).
''' If any issue occured we can scrap data from individual Tech Domains (Optional)'''
# OutteDomain = 'Marketing'
# outter = data_urls[OutteDomain]
# for innerurl in outter.keys():
    
    
#     file_name = "clutch_{}_{}.tsv"
#     file_name = file_name.format(OutteDomain, innerurl)
    
#     print('Current URL :' ,outter[innerurl])
#     print(os.path.join(os.getcwd(),file_name))
    
#     columns = ['Postion', 'title','website','location','rating','reviews','hourlyRate','miniProjSize','empSize']
#     with open(file_name, 'w') as w:
#         w.write(sep.join(columns)+'\n')
#     getLinkData(outter[innerurl], file_name)
# #         break

Current URL : https://clutch.co/agencies/app-marketing
E:\PythonProjects\webScrapping\Clutch\OutDataLinks\clutch_Marketing_Mobile Marketing.tsv
looping over Pages :  20
Started scrapy for : https://clutch.co/agencies/app-marketing
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
Ended scrapy for : https://clutch.co/agencies/app-marketing





In [22]:
# data_urls['Marketing'] 

In [23]:
# data_urls['Marketing'] = {'Mobile Marketing': 'https://clutch.co/agencies/app-marketing'}

In [24]:
# data_urls['Marketing'] 