# Web Scraping Demo
> **Feb 2, 2024**

In [None]:
import json
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine as sqleng

### For response code: _403_

> Use the headers to make a successful request.

In [None]:
# Scraper settings. The rest of the notebook reads data['headers'] (request
# headers) and data['sql'] (prefix of the database URL) from this dict, so the
# name `data` must keep pointing at it.
with open('utils.json', encoding='utf-8') as f:
    data = json.load(f)

# Matches a decimal number such as "4.1" or, failing that, a plain run of digits.
rating_review_reg_ex = r'(\d+\.\d+)|[\d]+'

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops early on an error response (e.g. 403) instead of
# letting an error page be parsed as if it were the company listing.
response = requests.get(
    'https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav',
    headers=data['headers'],
    timeout=10,
)
response.raise_for_status()

webpage = response.text

### Parsing the webpage content

In [None]:
# Build the parse tree for the downloaded listing page with the lxml parser.
soup = bs(markup=webpage, features='lxml')

### Using the _bs4_ object

In [None]:
# Print the entire parsed page as indented HTML for visual inspection.
pretty_html = soup.prettify()
print(pretty_html)

> Utility function

In [None]:
def show(items):
    """Print each element of *items* on its own line."""
    for line in map(str, items):
        print(line)

## _Extracting web content_

### Company **names**

In [None]:
# Every <h2> on the listing page is collected; company names are assumed to
# be rendered as <h2> headings - TODO confirm against the live markup.
h2 = soup.find_all('h2')

# Whitespace-trimmed heading texts. The last four headings are dropped
# (presumably they are not company names - TODO confirm).
names = [heading.text.strip() for heading in h2][:-4]

# Slug form of each name: lower-cased, with spaces replaced by hyphens.
names_query = [display.lower().replace(' ', '-') for display in names]

names

> Manual corrections for slugs that the automatic conversion gets wrong ...

In [None]:
# Manually override the third-from-last slug. Presumably the auto-generated
# slug for that company does not match the one the site uses - TODO confirm.
# NOTE(review): the position is hard-coded, so this relies on the listing order.
names_query[-3] = 'l-and-t'

names_query

### Class for representing **each company's data**

In [None]:
class Company:
    """Rating and review count scraped from one company's overview page.

    ``name`` is interpolated into the overview URL by ``fetch_details``.
    ``hq``, ``ownership``, ``founded_in`` and the employee counts are only
    initialised to empty defaults here; this class never fills them in.
    """

    # Seconds to wait for the server; without a timeout ``requests.get`` can
    # block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self, name):
        self.name = name
        self.review = 0.0
        self.rating = 0.0
        self.hq = ''
        self.ownership = ''
        self.founded_in = 0
        self.global_emp_count = 0.0
        self.india_emp_count = 0.0
        # Set once fetch_details() has received a response, so that __str__
        # does not repeat the network request on every call.
        self._details_fetched = False

    def fetch_page(self, name):
        """Request the overview page for *name* and return the response.

        Uses the module-level ``data["headers"]`` as request headers. Network
        errors and timeouts raised by ``requests`` propagate to the caller.
        """
        url = f"https://www.ambitionbox.com/overview/{name}-overview"
        return requests.get(
            url, headers=data["headers"], timeout=self.REQUEST_TIMEOUT)

    def fetch_details(self):
        """Fill ``rating`` and ``review`` from the company's overview page.

        Best effort: if the response is not 200, or either element is missing
        or contains no number, both attributes are left unchanged.
        """
        res = self.fetch_page(self.name)
        self._details_fetched = True

        if res.status_code != 200:
            return

        page = bs(res.text, 'lxml')
        review = page.find('p', class_='newHInfo__rc')
        rating = page.find('span', class_='newHInfo__rating')
        if review is None or rating is None:
            return

        review = re.search(rating_review_reg_ex, review.text)
        rating = re.search(rating_review_reg_ex, rating.text)
        if review is None or rating is None:
            return

        # NOTE(review): the x1000 assumes the page shows the review count in
        # thousands (e.g. "73.1k") - confirm for companies with fewer reviews.
        self.review = float(review.group()) * 1000
        self.rating = float(rating.group())

    def __str__(self) -> str:
        """Return a one-line summary, fetching details first if never fetched."""
        # Fetch lazily and only once: re-fetching on every str() call is
        # wasteful and would use whatever ``name`` holds at that moment.
        if not getattr(self, '_details_fetched', False):
            self.fetch_details()
        return f"Name : {self.name}\tRating : {self.rating}\tNumber of reviews : {self.review}\tHQ : {self.hq}"

### _**Company details**_

> _Add a delay between requests to avoid getting your IP banned._

In [None]:
# Pause between consecutive overview-page requests so the site is not hit in
# a tight loop; raise this value if requests start getting blocked.
REQUEST_DELAY_SECONDS = 1

companies = []

for name in names_query:
    c = Company(name)
    c.fetch_details()

    companies.append(c)

    time.sleep(REQUEST_DELAY_SECONDS)

#### _Replacing the names_

In [None]:
# Swap each company's URL slug for the display name at the same position
# in `names`.
for position, company in enumerate(companies):
    company.name = names[position]

### Company **card details**

In [None]:
company_cards = soup.find_all('div', class_='companyCardWrapper__companyDetails')

# Text of every card with newlines and tabs replaced by spaces, so each card
# prints on a single line.
company_cards_details = [
    card.text.strip().replace('\n', ' ').replace('\t', ' ')
    for card in company_cards
]

for flattened in company_cards_details:
    print(flattened)

print(f"\nNumber of Companies = {len(company_cards_details)}")

### Company **Data Collection**

In [None]:
hq_data, founded_in_data, ownership, emp_count = [], [], [], []

# Only the first whitespace-delimited token of the location is kept, so
# multi-word or alias city names are mapped to a readable form here.
cities = {
    'Bengaluru/Bangalore': 'Bangalore',
    'Bangalore/Bengaluru': 'Bangalore',
    'Teaneck.': 'New Jersey',
    'New': 'New York',
    'Navi': 'Navi Mumbai'
}

for cc in company_cards:
    # The pipe-separated summary line of the card. This must NOT be named
    # `data`: that name holds the settings loaded from utils.json, which are
    # still needed for data["headers"] and data['sql'].
    fields = cc.find(
        'span', class_="companyCardWrapper__interLinking").text.strip().split('|')

    hq_ = fields[-1].split()[0]
    # Presumably this field is the company's age in years, so subtracting it
    # from the scrape year gives the founding year - TODO confirm.
    fd_ = 2024 - int(fields[-2].split()[0])
    own_ = fields[-3].strip()
    # The second field minus its trailing word (presumably "Employees").
    emp_ = ' '.join(fields[1].split()[:-1])

    hq_ = cities.get(hq_, hq_)

    # Values starting with '1' or 'F' are replaced with 'Private'; presumably
    # the ownership field is missing on such cards and a neighbouring field
    # lands in this slot - TODO confirm.
    if own_.startswith(('1', 'F')):
        own_ = 'Private'

    hq_data.append(hq_)
    founded_in_data.append(fd_)
    ownership.append(own_)
    emp_count.append(emp_)


ratings = [c.rating for c in companies]
# Review counts are stored as floats on Company; truncate them to integers.
reviews = [int(c.review) for c in companies]

### Making the **DataFrame**

In [None]:
# One column per collected attribute; all lists are expected to line up
# position by position with `names`.
columns = {
    'name': names,
    'rating': ratings,
    'no_of_reviews': reviews,
    'hq': hq_data,
    'founded_in': founded_in_data,
    'ownership': ownership,
    'employee_count': emp_count,
}
df = pd.DataFrame(columns)

df.head()

### SQL **Database**

In [None]:
# The engine URL is the data['sql'] setting followed by the database name
# "web_scrape".
connection_url = f"{data['sql']}web_scrape"
eng = sqleng(connection_url)

# Append the frame to the table named "df"; `rows` holds to_sql's return value.
rows = df.to_sql("df", con=eng, if_exists='append')

rows