# Web Scraping Demo
> **Feb 2, 2024**

In [None]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### For response code: _403_

> Use the headers to make a successful request.

In [None]:
# Browser-like User-Agent sent with every request; the notebook notes that
# the site answers 403 when it is missing.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WebKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}

# Download the company listing page as text. `requests` has no default
# timeout, so one is set explicitly to keep the cell from hanging forever
# if the server never answers.
webpage = requests.get(
    'https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav',
    headers=headers,
    timeout=10,
).text

### Parsing the webpage content

In [None]:
# Build the parse tree of the downloaded listing page with the lxml parser.
soup = bs(markup=webpage, features='lxml')

### Using the _bs4_ object

In [None]:
# Print the entire webpage content as prettified (indented) HTML
print(soup.prettify())

## _Extracting web content_

### Class for representing _each_ company's data

In [None]:
class Company:
    """One company and the review count scraped from its overview page.

    Attributes:
        name: Company slug used to build the overview page URL.
        review: Review count; ``0.0`` until the caller assigns a value.
    """

    # Seconds to wait for the server. `requests` waits indefinitely when no
    # timeout is passed, which would stall the whole scraping loop.
    REQUEST_TIMEOUT = 10

    def __init__(self, name):
        self.name = name
        self.review = 0.0

    def fetch_page(self, name=None):
        """Download the overview page of a company.

        Args:
            name: Company slug to request. Defaults to ``self.name``.

        Returns:
            The ``requests.Response`` object; the status code is not
            checked here.

        Raises:
            requests.RequestException: On connection errors or when the
                server does not answer within ``REQUEST_TIMEOUT`` seconds.
        """
        slug = self.name if name is None else name
        url = f"https://www.ambitionbox.com/overview/{slug}-overview"
        # `headers` is the module-level request-header dict of this notebook.
        return requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)

    def fetch_reviews(self, name=None):
        """Scrape the review count shown on a company's overview page.

        Args:
            name: Company slug to request. Defaults to ``self.name``.

        Returns:
            The first ``<digits>.<digit>`` number found in the
            ``p.newHInfo__rc`` element, multiplied by 1000, as a float.
            ``None`` when the response status is not 200, the element is
            missing, or its text contains no such number.
        """
        res = self.fetch_page(name)
        if res.status_code != 200:
            return None

        page = bs(res.text, 'lxml')
        data = page.find('p', class_='newHInfo__rc')
        if data is None:
            return None

        match = re.search(r'(\d+\.\d)', data.text)
        if match is None:
            return None

        # NOTE(review): the scaling assumes the page abbreviates the count in
        # thousands (e.g. "72.5k") -- confirm against the live markup.
        return float(match.group()) * 1000

### Company **names**

In [None]:
# Every <h2> element on the listing page.
h2 = soup.find_all('h2')

# Heading texts with the last four entries dropped
# (presumably non-company headings -- TODO confirm against the page).
names = [heading.text.strip() for heading in h2][:-4]

# Lower-cased, hyphen-separated form of each name, used to build page URLs.
names_query = [title.lower().replace(' ', '-') for title in names]

names_query

### Company **card details**

In [None]:
# One <div> per company card on the listing page.
company_cards = soup.find_all('div', class_='companyCardWrapper__companyDetails')

# Card text flattened to one line: newlines and tabs become spaces.
company_cards_details = [
    block.text.strip().replace('\n', ' ').replace('\t', ' ')
    for block in company_cards
]

for summary in company_cards_details:
    print(summary)

print(f"\nNumber of Companies = {len(company_cards_details)}")

### Company **ratings**

In [None]:
# First "<digits>.<digit>" number in a card's text is taken as its rating.
rating_pattern = re.compile(r'(\d+\.\d)')

found = (rating_pattern.search(detail) for detail in company_cards_details)

# Cards without such a number are skipped, so `ratings` can be shorter
# than `company_cards_details`.
ratings = [hit.group() for hit in found if hit is not None]

ratings

#### _Company **page details**_

In [109]:
companies = []

# Fetch the review count for each company slug, in listing order.
for name in names_query:
    company = Company(name=name)
    review = company.fetch_reviews(name)
    if review is None:
        # Stop at the first company whose review count could not be read;
        # the remaining slugs are not requested.
        break
    company.review = review
    companies.append(company)

# Show the collected review counts, one per line.
for company in companies:
    print(company.review)

72500.0
45900.0
41300.0
38600.0
33600.0
33300.0
31500.0
29400.0
28200.0
27400.0
