# Web Scraping Demo
> **Feb 2, 2024**

In [1]:
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### For reponse code : _403_

> Use the headers to make a successful request.

In [8]:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WebKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}
rating_review_reg_ex = r'(\d+\.\d+)|[\d]+'

webpage = requests.get(
    'https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav', headers=headers).text

### Parsing the webpage content

In [3]:
soup = bs(webpage, 'lxml')

### Using the _bs4_ object

In [None]:
# Pring the entire webpage content in plain HTML
print(soup.prettify())

 ## _Extracting web content_

### Company **names**

In [29]:
names, names_query = [], []

h2 = soup.find_all('h2')

for name in h2:
    name = name.text.strip()
    names.append(name)

    name = name.lower().replace(' ', '-')
    names_query.append(name)
    
names = names[:-4]
names_query = names_query[:-4]

names

['TCS',
 'Accenture',
 'Cognizant',
 'Wipro',
 'HDFC Bank',
 'ICICI Bank',
 'Infosys',
 'Capgemini',
 'HCLTech',
 'Tech Mahindra',
 'Genpact',
 'Axis Bank',
 'Teleperformance',
 'Concentrix Corporation',
 'Reliance Jio',
 'Amazon',
 'IBM',
 'Larsen & Toubro Limited',
 'Reliance Retail',
 'HDB Financial Services']

> Exceptional changes ...

In [21]:
names_query[-3] = 'l-and-t'

names_query

['tcs',
 'accenture',
 'cognizant',
 'wipro',
 'hdfc-bank',
 'icici-bank',
 'infosys',
 'capgemini',
 'hcltech',
 'tech-mahindra',
 'genpact',
 'axis-bank',
 'teleperformance',
 'concentrix-corporation',
 'reliance-jio',
 'amazon',
 'ibm',
 'l-and-t',
 'reliance-retail',
 'hdb-financial-services']

### Class for representing **each company's data**

In [30]:
class Company:

    def __init__(self, name):
        self.name = name
        self.review = float()
        self.rating = float()

    def fetch_page(self, name):
        url = f"https://www.ambitionbox.com/overview/{name}-overview"
        res = requests.get(url, headers=headers)
        return res

    def fetch_details(self):
        res = self.fetch_page(self.name)
        if res.status_code == 200:
            page = bs(res.text, 'lxml')

            review = page.find('p', class_='newHInfo__rc')
            rating = page.find('span', class_='newHInfo__rating')

            if review is not None and rating is not None:
                review = re.search(rating_review_reg_ex, review.text)
                rating = re.search(rating_review_reg_ex, rating.text)

                if review is not None and rating is not None:
                    self.review =  float(review.group()) * 1000
                    self.rating =  float(rating.group())
                else:
                    return
            else:
                return
        else:
            return
        
    def __str__(self) -> str:
        self.fetch_details()
        return f"Name : {self.name}\tRating : {self.rating}\tNumber of reviews : {self.review}"

### _**Company details**_

> _Use some time delay in order to avoid getting your ip banned._

In [31]:
companies = []

for name in names_query:
    c = Company(name)
    c.fetch_details()
    
    companies.append(c)

for company in companies:
    print(company)

Name : tcs	Rating : 3.8	Number of reviews : 72500.0
Name : accenture	Rating : 4.0	Number of reviews : 45900.0
Name : cognizant	Rating : 3.9	Number of reviews : 41300.0
Name : wipro	Rating : 3.8	Number of reviews : 38600.0
Name : hdfc-bank	Rating : 3.9	Number of reviews : 33600.0
Name : icici-bank	Rating : 4.0	Number of reviews : 33300.0
Name : infosys	Rating : 3.8	Number of reviews : 31500.0
Name : capgemini	Rating : 3.8	Number of reviews : 29400.0
Name : hcltech	Rating : 3.6	Number of reviews : 28200.0
Name : tech-mahindra	Rating : 3.7	Number of reviews : 27400.0
Name : genpact	Rating : 3.9	Number of reviews : 26000.0
Name : axis-bank	Rating : 3.8	Number of reviews : 21900.0
Name : teleperformance	Rating : 3.6	Number of reviews : 21700.0
Name : concentrix-corporation	Rating : 3.9	Number of reviews : 21600.0
Name : reliance-jio	Rating : 4.0	Number of reviews : 20400.0
Name : amazon	Rating : 4.1	Number of reviews : 20200.0
Name : ibm	Rating : 4.1	Number of reviews : 19600.0
Name : larse

#### _Replacing the names_

In [32]:
i = 0
for company in companies:
    company.name = names[i]
    i+=1

Name : TCS	Rating : 3.7	Number of reviews : 72600.0
Name : Accenture	Rating : 4.0	Number of reviews : 45900.0
Name : Cognizant	Rating : 3.9	Number of reviews : 41300.0
Name : Wipro	Rating : 3.7	Number of reviews : 38700.0
Name : HDFC Bank	Rating : 3.8	Number of reviews : 33600.0
Name : ICICI Bank	Rating : 4.0	Number of reviews : 33300.0
Name : Infosys	Rating : 3.8	Number of reviews : 31500.0
Name : Capgemini	Rating : 3.8	Number of reviews : 29500.0
Name : HCLTech	Rating : 3.6	Number of reviews : 28200.0
Name : Tech Mahindra	Rating : 3.6	Number of reviews : 27400.0
Name : Genpact	Rating : 3.9	Number of reviews : 26000.0
Name : Axis Bank	Rating : 3.8	Number of reviews : 21900.0
Name : Teleperformance	Rating : 3.7	Number of reviews : 21700.0
Name : Concentrix Corporation	Rating : 3.9	Number of reviews : 21600.0
Name : Reliance Jio	Rating : 4.0	Number of reviews : 20400.0
Name : Amazon	Rating : 4.0	Number of reviews : 20200.0
Name : IBM	Rating : 4.1	Number of reviews : 19600.0
Name : Larse

### Company **card details**

In [14]:
company_cards = soup.find_all('div', class_='companyCardWrapper__companyDetails')

company_cards_details = []

for card in company_cards:
    txt = card.text.strip().replace('\n', ' ').replace('\t', ' ')
    company_cards_details.append(txt)
    print(txt)

print(f"\nNumber of Companies = {len(company_cards_details)}")

TCS             Follow   3.7           IT Services & Consulting | 1 Lakh+ Employees | Public | 56 years old | Mumbai +339 more            Highly Rated For Job Security, Work Life Balance  Critically Rated For Promotions / Appraisal, Salary & Benefits, Work Satisfaction
Accenture             Follow   4.0           IT Services & Consulting | 1 Lakh+ Employees | Public | 35 years old | Dublin +168 more            Highly Rated For Company Culture, Skill Development / Learning, Job Security  Critically Rated For Promotions / Appraisal
Cognizant             Follow   3.9           IT Services & Consulting | 1 Lakh+ Employees | Forbes Global 2000 | 30 years old | Teaneck. New Jersey. +155 more             Critically Rated For Promotions / Appraisal
Wipro             Follow   3.7           IT Services & Consulting | 1 Lakh+ Employees | Public | 79 years old | Bangalore/Bengaluru +274 more            Highly Rated For Job Security  Critically Rated For Promotions / Appraisal, Salary & Benefits
HD

### Company **Data Collection**

In [34]:
ratings, reviews = [], []

for c in companies:
    ratings.append(c.rating)
    reviews.append(c.review)


[72600.0,
 45900.0,
 41300.0,
 38700.0,
 33600.0,
 33300.0,
 31500.0,
 29500.0,
 28200.0,
 27400.0,
 26000.0,
 21900.0,
 21700.0,
 21600.0,
 20400.0,
 20200.0,
 19600.0,
 18600.0,
 18500.0,
 18000.0]