In [1]:
# Import libraries

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Empty placeholder frame for the scraped company data
final = pd.DataFrame(data=None)

In [4]:
# Scrape the AmbitionBox company listing page by page and collect one record
# per company card. The records are turned into a DataFrame once at the end
# (instead of concatenating inside the loop), which also makes this cell
# idempotent: re-running it rebuilds `final` rather than appending to it.

BASE_URL = 'https://www.ambitionbox.com/list-of-companies?page={}'
FIRST_PAGE, LAST_PAGE = 1, 500  # inclusive range of listing pages to scrape
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled connection from hanging the run
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'
}
COLUMNS = [
    'Name',
    'Rating',
    'Description',
    'Highly Rated For',
    'Critically Rated For',
    'Reviews',
    'Salaries',
    'Interviews',
    'Jobs',
    'Benefits',
]
ACTION_COLUMNS = COLUMNS[5:]  # filled, in order, from the card's ActionCount spans


def _text_or_nan(element):
    """Return the stripped text of a parsed element, or NaN when it is missing."""
    return element.text.strip() if element is not None else np.nan


def _parse_company_card(card):
    """Convert one ``companyCardWrapper`` element into a dict keyed by COLUMNS.

    Any piece of markup that is absent from the card yields NaN for the
    corresponding field instead of raising.
    """
    record = {
        'Name': _text_or_nan(card.find('h2', class_="companyCardWrapper__companyName")),
        'Rating': _text_or_nan(card.find('span', class_="companyCardWrapper__companyRatingValue")),
        'Description': _text_or_nan(card.find('div', class_="companyCardWrapper__interLinkingWrapper")),
        'Highly Rated For': np.nan,
        'Critically Rated For': np.nan,
    }

    # NOTE(review): the first ratingValues span is split on commas and its first
    # two items are taken as "highly rated" and "critically rated" respectively;
    # cards with fewer than two items get NaN for both. Confirm this assumption
    # against the page markup.
    tags_element = card.find('span', class_="companyCardWrapper__ratingValues")
    if tags_element is not None:
        tags = tags_element.text.strip().split(',')
        if len(tags) >= 2:
            record['Highly Rated For'] = tags[0].strip()
            record['Critically Rated For'] = tags[1].strip()

    # Reviews, salaries, interviews, jobs and benefits are read positionally;
    # positions the card does not provide are padded with NaN.
    counts = [
        span.text.strip()
        for span in card.find_all('span', class_='companyCardWrapper__ActionCount')
    ][:len(ACTION_COLUMNS)]
    counts += [np.nan] * (len(ACTION_COLUMNS) - len(counts))
    record.update(zip(ACTION_COLUMNS, counts))
    return record


records = []
failed_pages = []

# One session for all pages so the connection and headers are reused.
with requests.Session() as session:
    session.headers.update(HEADERS)
    for j in range(FIRST_PAGE, LAST_PAGE + 1):
        try:
            response = session.get(BASE_URL.format(j), timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Record the failure and keep going rather than losing the pages
            # already scraped (or silently parsing an error page as "no rows").
            failed_pages.append(j)
            print("Skipping page", j, "-", exc)
            continue

        soup = BeautifulSoup(response.text, 'lxml')
        cards = soup.find_all('div', class_='companyCardWrapper')
        records.extend(_parse_company_card(card) for card in cards)
        print("Data appended for page", j)

# Build the frame once; `columns=` keeps the schema stable even if nothing was scraped.
final = pd.DataFrame(records, columns=COLUMNS)

if failed_pages:
    print("Pages that could not be fetched:", failed_pages)

# Save the final DataFrame as a CSV file
final.to_csv('companies_data.csv', index=False)
print("CSV file created successfully.")

Data appended for page 1
Data appended for page 2
Data appended for page 3
Data appended for page 4
Data appended for page 5
Data appended for page 6
Data appended for page 7
Data appended for page 8
Data appended for page 9
Data appended for page 10
Data appended for page 11
Data appended for page 12
Data appended for page 13
Data appended for page 14
Data appended for page 15
Data appended for page 16
Data appended for page 17
Data appended for page 18
Data appended for page 19
Data appended for page 20
Data appended for page 21
Data appended for page 22
Data appended for page 23
Data appended for page 24
Data appended for page 25
Data appended for page 26
Data appended for page 27
Data appended for page 28
Data appended for page 29
Data appended for page 30
Data appended for page 31
Data appended for page 32
Data appended for page 33
Data appended for page 34
Data appended for page 35
Data appended for page 36
Data appended for page 37
Data appended for page 38
Data appended for pag

In [5]:
# Dimensions of the scraped table as (rows, columns)
final.shape

(10000, 10)

# Explanation of the columns

In [12]:
# Name --> Name of the company
# Rating --> Overall rating of the company
# Description --> Short summary of the company (e.g. industry, number of employees, age, location)
# Highly Rated For --> Positive aspect of the company
# Critically Rated For --> Negative aspect of the company
# Reviews --> Number of reviews given by employees
# Salaries --> Number of salaries reported by employees for different posts
# Interviews --> Number of interview questions available
# Jobs --> Number of jobs listed ("--" when none are shown)
# Benefits --> Number of benefits reported

In [6]:
# Preview 10 random rows; the fixed seed makes the preview reproducible
# across "Restart Kernel -> Run All" executions.
final.sample(10, random_state=42)

Unnamed: 0,Name,Rating,Description,Highly Rated For,Critically Rated For,Reviews,Salaries,Interviews,Jobs,Benefits
4648,V2Solutions,3.7,IT Services & Consulting | 501-1k Employees | ...,Promotions / Appraisal,Job Security,167,1.1k,12,--,25
6608,Armed Forces Tribunal,4.8,Government | 11-50 Employees | 17 years old | ...,Company Culture,Job Security,114,397,--,--,22
5715,Carnival Group,4.0,Media & Entertainment | 51-200 Employees | Con...,Work Life Balance,Company Culture,133,308,--,--,19
3648,L&T Water & Effluent Treatment,4.4,Engineering & Construction | 5k-10k Employees ...,Job Security,Skill Development / Learning,210,185,8,--,146
1328,Hitachi,4.1,Industrial Machinery | 10k-50k Employees | For...,Company Culture,Job Security,557,3.8k,59,275,66
1565,CARE,4.3,Non-Profit | 1k-5k Employees | 74 years old | ...,Company Culture,Skill Development / Learning,476,1.2k,36,--,84
8475,Isgec Hitachi Zosen,3.5,Industrial Machinery | 201-500 Employees | 12 ...,Salary & Benefits,Job Security,92,512,6,--,8
7427,DNV GL,4.3,IT Services & Consulting | 1k-5k Employees | 1...,Company Culture,Work Life Balance,100,539,6,--,11
8249,Synnat Pharma,4.4,1-10 Employees | 13 years old | Secunderabad +...,Work Life Balance,Skill Development / Learning,91,160,2,--,4
8847,Sigma Corporation India,4.3,Auto Components | 201-500 Employees | 60 years...,Job Security,Work Life Balance,84,574,5,--,12


In [7]:
# Count the missing values in each column
final.isna().sum()

Name                       0
Rating                     3
Description                0
Highly Rated For        1571
Critically Rated For    1571
Reviews                    0
Salaries                   0
Interviews                 0
Jobs                       0
Benefits                   0
dtype: int64

In [8]:
# Summarise column dtypes, non-null counts and memory usage
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Name                  10000 non-null  object
 1   Rating                9997 non-null   object
 2   Description           10000 non-null  object
 3   Highly Rated For      8429 non-null   object
 4   Critically Rated For  8429 non-null   object
 5   Reviews               10000 non-null  object
 6   Salaries              10000 non-null  object
 7   Interviews            10000 non-null  object
 8   Jobs                  10000 non-null  object
 9   Benefits              10000 non-null  object
dtypes: object(10)
memory usage: 781.4+ KB


In [10]:
# Number of rows that exactly repeat an earlier row (all columns equal)
final.duplicated().sum()

121