# Web Scraping Demo
> **Feb 2, 2024**

In [1]:
import datetime
import getpass
import json
import re
from urllib.parse import quote

import mysql.connector as msc
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from mysql.connector import Error
from sqlalchemy import create_engine as sqleng

### For response code: _403_

> Use the headers to make a successful request.

In [2]:
# Scraper configuration lives in utils.json; this notebook reads its
# 'url', 'headers' and 'sql' entries.
with open('utils.json', encoding='utf-8') as f:
    connect_data = json.load(f)

# Matches a decimal number such as "3.7", or else a plain run of digits.
rating_review_reg_ex = r'(\d+\.\d+)|[\d]+'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns a blocked request (e.g. 403) into an immediate
# error instead of silently handing an error page to the parser.
listing_response = requests.get(
    connect_data['url'], headers=connect_data['headers'], timeout=30)
listing_response.raise_for_status()

webpage = listing_response.text

### Parsing the webpage content

In [3]:
# Parse the downloaded listing page with the lxml parser; later cells query this tree.
soup = bs(webpage, 'lxml')

> Utility function

In [4]:
def show(items):
    """Print each element of ``items`` on a separate line, in iteration order."""
    for rendered in map(str, items):
        print(rendered)

 ## _Extracting web content_

### Company **names**

In [5]:
# Every <h2> heading found on the listing page.
h2 = soup.find_all('h2')

# Display names as shown on the page, minus the trailing four headings
# (presumably not company entries — TODO confirm against the page).
names = [heading.text.strip() for heading in h2][:-4]

# URL-style slugs: lower-cased, with spaces turned into hyphens.
names_query = [display.lower().replace(' ', '-') for display in names]

names

['TCS',
 'Accenture',
 'Cognizant',
 'Wipro',
 'HDFC Bank',
 'ICICI Bank',
 'Infosys',
 'Capgemini',
 'HCLTech',
 'Tech Mahindra',
 'Genpact',
 'Axis Bank',
 'Teleperformance',
 'Concentrix Corporation',
 'Jio',
 'Amazon',
 'IBM',
 'Larsen & Toubro Limited',
 'Reliance Retail',
 'HDB Financial Services']

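> Some URL slugs cannot be derived from the display name (e.g. Larsen & Toubro Limited → `l-and-t`), so they are set by hand.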

In [6]:
# Slugs that cannot be derived from the display name are overridden by name
# rather than by list position, so the fix still lands on the right entry if
# the scraped ordering or the number of companies changes.
slug_overrides = {'Larsen & Toubro Limited': 'l-and-t'}

names_query = [
    slug_overrides.get(display_name, slug)
    for display_name, slug in zip(names, names_query)
]

names_query

['tcs',
 'accenture',
 'cognizant',
 'wipro',
 'hdfc-bank',
 'icici-bank',
 'infosys',
 'capgemini',
 'hcltech',
 'tech-mahindra',
 'genpact',
 'axis-bank',
 'teleperformance',
 'concentrix-corporation',
 'jio',
 'amazon',
 'ibm',
 'l-and-t',
 'reliance-retail',
 'hdb-financial-services']

### Class for representing **each company's data**

In [7]:
class Company:
    """Rating and review data for one company, scraped from its overview page.

    ``name`` is used as the URL slug by :meth:`fetch_details`. The remaining
    attributes start out as empty placeholders; only ``rating`` and ``review``
    are filled in by this class.
    """

    def __init__(self, name):
        self.name = name
        self.review = 0.0   # number of reviews, set by fetch_details()
        self.rating = 0.0   # rating, set by fetch_details()
        self.hq = ''
        self.ownership = ''
        self.founded_in = 0
        self.global_emp_count = 0.0
        self.india_emp_count = 0.0

    def fetch_page(self, name, timeout=30):
        """Download the overview page for the slug ``name``.

        Args:
            name: URL slug inserted into the overview URL.
            timeout: Seconds to wait before giving up, so a stalled
                connection cannot hang the notebook.

        Returns:
            The raw ``requests`` response; the status is not checked here.
        """
        url = f"https://www.ambitionbox.com/overview/{name}-overview"
        return requests.get(url, headers=connect_data["headers"], timeout=timeout)

    def fetch_details(self):
        """Populate ``rating`` and ``review`` from the company's overview page.

        Both attributes are left untouched when the response status is not
        200, when either page element is missing, or when no number can be
        extracted from the element text.
        """
        res = self.fetch_page(self.name)
        if res.status_code != 200:
            return

        page = bs(res.text, 'lxml')
        review_tag = page.find('p', class_='newHInfo__rc')
        rating_tag = page.find('span', class_='newHInfo__rating')
        if review_tag is None or rating_tag is None:
            return

        review_match = re.search(rating_review_reg_ex, review_tag.text)
        rating_match = re.search(rating_review_reg_ex, rating_tag.text)
        if review_match is None or rating_match is None:
            return

        # The extracted figure is scaled by 1000 unconditionally.
        # NOTE(review): presumably the page abbreviates counts as e.g. "72.8k";
        # a count shown without that abbreviation would be inflated — confirm.
        # round() keeps a float artefact just below a whole number from being
        # truncated when the value is later passed through int().
        self.review = float(round(float(review_match.group()) * 1000))
        self.rating = float(rating_match.group())

    def __str__(self) -> str:
        """Return a one-line summary of the data already held.

        Formatting never triggers a network request; call
        :meth:`fetch_details` explicitly to refresh the values first.
        """
        return f"Name : {self.name}\tRating : {self.rating}\tNumber of reviews : {self.review}\tHQ : {self.hq}"

### _**Company details**_

In [8]:
# Build one Company per URL slug and pull its rating/review data right away.
companies = []

for slug in names_query:
    company = Company(slug)
    company.fetch_details()
    companies.append(company)

#### _Replacing the names_

In [9]:
# Swap each company's URL slug for its display name. The two lists are paired
# positionally, so they must line up one-to-one.
assert len(companies) == len(names), "companies and names must have the same length"

for company, display_name in zip(companies, names):
    company.name = display_name

### Company **card details**

In [None]:
company_cards = soup.find_all('div', class_='companyCardWrapper__companyDetails')

# Flatten each card's text onto a single line: newlines and tabs become spaces.
_to_spaces = str.maketrans({'\n': ' ', '\t': ' '})
company_cards_details = []

for card in company_cards:
    flattened = card.text.strip().translate(_to_spaces)
    company_cards_details.append(flattened)
    print(flattened)

print(f"\nNumber of Companies = {len(company_cards_details)}")

### Company **Data Collection**

In [11]:
hq_data, founded_in_data, ownership, emp_count = [], [], [], []

# Maps the first word of the headquarters field to the city name to store.
# NOTE(review): 'New' -> 'New York' would also catch any other city whose
# name starts with "New" — confirm this is acceptable for the scraped data.
cities = {
    'Bengaluru/Bangalore': 'Bangalore',
    'Bangalore/Bengaluru': 'Bangalore',
    'Teaneck.': 'New Jersey',
    'New': 'New York',
    'Navi': 'Navi Mumbai'
}

# The second-to-last field is subtracted from the year to obtain founded_in,
# i.e. it is treated as the company's age. Use the year the scrape actually
# runs in instead of a hard-coded constant, which goes stale after that year.
current_year = datetime.date.today().year

for cc in company_cards:
    fields = cc.find(
        'span', class_="companyCardWrapper__interLinking").text.strip().split('|')

    hq_ = fields[-1].split()[0]
    fd_ = current_year - int(fields[-2].split()[0])
    own_ = fields[-3].strip()
    emp_ = ' '.join(fields[1].split()[:-1])

    # Fall back to the raw first word when no normalisation is defined.
    hq_ = cities.get(hq_, hq_)

    # NOTE(review): values starting with '1' or 'F' are recorded as 'Private';
    # presumably a neighbouring field occupies this slot when ownership is
    # absent — confirm against the page.
    if own_.startswith(('1', 'F')):
        own_ = 'Private'

    hq_data.append(hq_)
    founded_in_data.append(fd_)
    ownership.append(own_)
    emp_count.append(emp_)


ratings = [c.rating for c in companies]
reviews = [int(c.review) for c in companies]

### Making the **DataFrame**

In [12]:
# Column name -> per-company values gathered in the earlier cells.
columns = {
    'company_name': names,
    'rating': ratings,
    'no_of_reviews': reviews,
    'hq': hq_data,
    'founded_in': founded_in_data,
    'ownership': ownership,
    'employee_count': emp_count,
}
df = pd.DataFrame(columns)

df.head()

Unnamed: 0,company_name,rating,no_of_reviews,hq,founded_in,ownership,employee_count
0,TCS,3.7,72800,Mumbai,1968,Public,1 Lakh+
1,Accenture,4.0,46100,Dublin,1989,Public,1 Lakh+
2,Cognizant,3.8,41400,New Jersey,1994,Private,1 Lakh+
3,Wipro,3.7,38800,Bangalore,1945,Public,1 Lakh+
4,HDFC Bank,3.8,33700,Mumbai,1994,Public,1 Lakh+


### **Uploading** the DataFrame to the local **SQL** Database 
> Make sure to create the database first!

In [13]:
# getpass reads the password without echoing it; input() would display it.
pwd = getpass.getpass("Enter root password : ")
db_name = input("Enter the database name : ")
tb_name = input("Enter the table name : ")

# Percent-encode the password so characters such as '@', ':' or '/' cannot
# break the connection URL it is substituted into.
connection_url = f"{connect_data['sql'].replace('<pwd>', quote(pwd, safe=''))}{db_name}"
eng = sqleng(connection_url)

# NOTE: if_exists='append' adds the rows again every time this cell is re-run.
rows = df.to_sql(tb_name, con=eng, if_exists='append')

rows

20

### SQL Connection **Class**

In [3]:
class SQL_Connection:
    """Interactive helper around a single MySQL connection.

    Connection parameters are collected from prompts at construction time;
    :meth:`connect` opens the connection and :meth:`query` /
    :meth:`execute_query` run statements and print the returned rows.
    """

    def __init__(self):
        self._connection_error = ""   # last connection error, shown by execute_query
        self._connection = None       # set by connect() on success
        self._host = ""
        self._user = ""
        self._pwd = ""
        self._db = ""

        self.set_connection_params()

    def set_connection_params(self):
        """Prompt for username, password, host and database name."""
        self._user = input("Enter username : ")
        # getpass reads the password without echoing it.
        self._pwd = getpass.getpass("Enter password : ")
        self._host = input("Enter host : ")
        self._db = input("Enter database name : ")

    def connect(self):
        """Open the connection; on failure print the error and remember it."""
        try:
            self._connection = msc.connect(
                host=self._host,
                database=self._db,
                user=self._user,
                password=self._pwd
            )
            print(f"Connected to database : {self._db}")

        except Error as e:
            print(f"Error : {e}")
            self._connection_error = e

    def execute_query(self, query):
        """Run ``query`` and print each returned row as a dict.

        When no connection is available the stored connection error is
        printed instead. Database errors raised while running the query are
        printed in the same style as :meth:`connect` rather than propagated,
        and the cursor is always closed.
        """
        if self._connection is None:
            print(f"\nCannot run query due to : {self._connection_error}!\n")
            return

        cursor = self._connection.cursor(dictionary=True)
        try:
            cursor.execute(query)
            res = cursor.fetchall()
        except Error as e:
            print(f"Error : {e}")
            return
        finally:
            # Release the cursor whether or not the query succeeded.
            cursor.close()

        print("\nQuery ran successfully! Result :\n")
        for row in res:
            print(row)

    def query(self):
        """Prompt for a SQL statement and run it via :meth:`execute_query`."""
        q = input("Enter your query : ")
        self.execute_query(q)

### **Establishing** the connection

In [4]:
# Instantiating prompts for the connection parameters; connect() then opens
# the connection and prints the outcome.
sql = SQL_Connection()
sql.connect()

Connected to database : web_scrape


### **Running** the queries

In [7]:
# Prompts for a SQL statement, runs it and prints every returned row.
sql.query()


Query ran successfully! Result :

{'index': 2, 'company_name': 'Cognizant', 'rating': 3.8, 'no_of_reviews': 41400, 'hq': 'New Jersey', 'founded_in': 1994, 'ownership': 'Private', 'employee_count': '1 Lakh+'}
{'index': 12, 'company_name': 'Teleperformance', 'rating': 3.7, 'no_of_reviews': 21800, 'hq': 'Paris', 'founded_in': 1978, 'ownership': 'Private', 'employee_count': '50k-1 Lakh'}
{'index': 18, 'company_name': 'Reliance Retail', 'rating': 3.8, 'no_of_reviews': 18600, 'hq': 'Navi Mumbai', 'founded_in': 2006, 'ownership': 'Private', 'employee_count': '1 Lakh+'}
{'index': 19, 'company_name': 'HDB Financial Services', 'rating': 4.0, 'no_of_reviews': 18100, 'hq': 'Ahmedabad', 'founded_in': 2007, 'ownership': 'Private', 'employee_count': '1 Lakh+'}
