## Step - 1: Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Browser-like request headers (the User-Agent identifies as Firefox 106 on Windows)
# that are sent with every page request in this notebook.
request_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
    # The catch-all media range is "*/*"; a bare "/" is not a valid media type.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
}

# Use a session


In [None]:
import time

# Flat result lists, filled page by page. Each list holds one kind of text
# taken from the company cards on the listing pages.
com_list = []
ratings = []
type_of_company = []
reviews = []

for i in range(1, 31):  # Pages 1-30
    url = f"https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav&page={i}"

    try:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, headers=request_header, timeout=30)
    except requests.RequestException as exc:
        # Network-level failure (DNS, reset, timeout): report it and move on.
        print(f"Failed to fetch page {i}: {exc}")
        continue

    # Skip pages the server did not return successfully
    if response.status_code != 200:
        print(f"Failed to fetch page {i}")
        continue

    soup = BeautifulSoup(response.content, 'lxml')

    # Extract company names
    for company_tag in soup.find_all("h2", class_="companyCardWrapper__companyName"):
        com_list.append(company_tag.text.strip())  # strip() removes surrounding whitespace

    # Extract ratings
    for rating_tag in soup.find_all("div", class_="companyCardWrapper__companyRatingWrapper"):
        ratings.append(rating_tag.text.strip())

    # Extract the combined "industry | locations" text
    # (the loop variable is not called `type`, which would shadow the builtin)
    for type_tag in soup.find_all("span", class_="companyCardWrapper__interLinking"):
        type_of_company.append(type_tag.text.strip())

    # Extract the action counts; every matching span is appended in document order
    for review_tag in soup.find_all("span", class_="companyCardWrapper__ActionCount"):
        reviews.append(review_tag.text.strip())

    # One pause per downloaded page to avoid overwhelming the server. The parsing
    # steps above run on the already-downloaded page, so pausing between them
    # only slowed the notebook down.
    time.sleep(2)


In [None]:
# Split each scraped "type | locations" string at its first '|' into two
# parallel lists.
company_types, locations = [], []

for entry in type_of_company:
    head, separator, tail = entry.partition('|')

    if separator:
        # Text before the first '|' is the company type, the rest is the location part.
        company_types.append(head.strip())
        locations.append(tail.strip())
    else:
        # No '|' present: record the type as "Unknown" and keep the whole text as location.
        company_types.append("Unknown")
        locations.append(head.strip())

print("Company Types:", company_types)
print("Locations:", locations)

In [None]:
# `reviews` is one flat list; take every sixth entry starting at offsets 0..5.
# NOTE(review): this assumes each company card contributed exactly six counts, in
# the order reviews, salaries, interviews, jobs, benefits, photos - confirm against the page.
reviews_1, salaries, interviews, jobs, benefits, photos = (
    reviews[offset::6] for offset in range(6)
)

In [None]:
# Keep only the first three characters of each scraped rating text
# (presumably the numeric score, e.g. "4.1" - confirm against the raw strings).
r = [rating_text[:3] for rating_text in ratings]

r

In [None]:
# Print the length of every scraped list, one per line, in the same order as
# they are used as DataFrame columns (which need equal lengths).
for scraped_list in (
    com_list,
    ratings,
    type_of_company,
    reviews_1,
    salaries,
    interviews,
    jobs,
    benefits,
    photos,
):
    print(len(scraped_list))

In [None]:
import pandas as pd

# Map each output column label to the list that feeds it, then build the frame
# in one step. The dict order is the column order.
scraped_columns = {
    'Company_Name': com_list,
    'Rating': r,
    'Industry_Type': company_types,
    'Headquater_other_Locations': locations,
    'Total Reviews': reviews_1,
    'AvgSalaries': salaries,
    'Total_Interviews': interviews,
    'Total_Jobs': jobs,
    'Total_Benifits': benefits,
    'Total_Photos': photos,
}
df = pd.DataFrame(scraped_columns)
df

In [None]:
# Write the scraped table to Raw_Ambition_Box.csv without the index column;
# the data-cleaning step reads this file back.
df.to_csv('Raw_Ambition_Box.csv', index=False)

In [None]:
# Turn each company name into a URL slug: lower-case, spaces replaced by hyphens.
lst = [name.lower().replace(" ", "-") for name in com_list]

# Print the overview URL built from every slug.
for slug in lst:
    print(f"https://www.ambitionbox.com/overview/{slug}-overview")

In [None]:
import time

# Text of the matching grid block(s) from each company's overview page.
# Pages that fail to download add nothing, so `ben` is not guaranteed to line
# up one-to-one with `lst`.
ben = []

for i in lst:  # one overview page per company slug
    url = f"https://www.ambitionbox.com/overview/{i}-overview"

    try:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, headers=request_header, timeout=30)
    except requests.RequestException as exc:
        # Network-level failure (DNS, reset, timeout): report it and move on.
        print(f"Failed to fetch page {i}: {exc}")
        continue

    # Skip pages the server did not return successfully
    if response.status_code != 200:
        print(f"Failed to fetch page {i}")
        continue

    soup = BeautifulSoup(response.content, 'lxml')

    # Collect the text of every <div> whose class attribute is exactly this string
    for block_tag in soup.find_all("div", class_="css-175oi2r grid grid-cols-2 md:grid-cols-4 gap-3"):
        ben.append(block_tag.text.strip())  # strip() removes surrounding whitespace

    time.sleep(2)  # pause between page downloads

## Step - 2 : Data Cleaning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Reload the raw scrape saved by Step 1; the bare `df` on the last line displays it.
df = pd.read_csv(r"Raw_Ambition_Box.csv")
df

In [None]:
df.head(3)  # show the first 3 rows

In [None]:
df.tail(3)  # show the last 3 rows

In [None]:
df.shape  # (number of rows, number of columns)

In [None]:
df.columns  # column labels as read from the CSV

In [None]:
# The frame is built and saved with the column label 'Rating' (no trailing "s"),
# so looking up 'Ratings' raised KeyError. Cast the existing column to float.
df['Rating'] = df['Rating'].astype(float)

In [None]:
# Convert the scraped review counts to numbers. Only values carrying a 'k'
# suffix are multiplied by 1000; plain counts are kept as they are (previously
# every value was multiplied, inflating counts that had no suffix).
df['Total_Reviews'] = df['Total Reviews'].apply(
    lambda x: float(x.replace('k', '')) * 1000 if isinstance(x, str) and 'k' in x else float(x)
)

In [None]:
# Convert benefit counts to numbers ('k' suffix means x1000). The isinstance
# guard lets already-numeric cells pass through, so the cell can be re-run and
# also works when read_csv parsed the column as numeric.
df['Total_Benifits'] = df['Total_Benifits'].apply(
    lambda x: float(x.replace('k', '')) * 1000 if isinstance(x, str) and 'k' in x else float(x)
)

In [None]:
# Convert interview counts to numbers ('k' suffix means x1000). The isinstance
# guard lets already-numeric cells pass through, so the cell can be re-run and
# also works when read_csv parsed the column as numeric.
df['Total_Interviews'] = df['Total_Interviews'].apply(
    lambda x: float(x.replace('k', '')) * 1000 if isinstance(x, str) and 'k' in x else float(x)
)

In [None]:
def convert_values(x):
    """Convert one scraped count to a float.

    Parameters
    ----------
    x : str, number or None
        Raw cell value such as '1.2k', '850' or the placeholder '--'.
        Values that are already numeric are accepted so the cell can be re-run.

    Returns
    -------
    float
        The numeric value; a 'k' suffix multiplies by 1000. NaN for '--',
        None or an existing NaN.

    Raises
    ------
    ValueError
        If a string is not numeric after removing 'k'.
    """
    if x is None:
        return np.nan
    if not isinstance(x, str):
        # Already numeric (or NaN): `'k' in x` would raise TypeError here.
        return float(x)
    if 'k' in x:
        return float(x.replace('k', '')) * 1000
    elif x == '--':  # Handling missing values
        return np.nan
    else:
        return float(x)  # Keep numeric values as they are

# Run convert_values on every Total_Jobs entry and store the result back in the same column
df['Total_Jobs'] = df['Total_Jobs'].apply(convert_values)

In [None]:
df.Total_Jobs  # display the Total_Jobs column

In [None]:
# List the distinct raw values to see which suffixes need converting.
# `unique` is a method: without the call parentheses the cell only shows the
# bound-method repr, not the values.
df['AvgSalaries'].unique()

In [None]:
def convert_values(x):
    """Convert one scraped value with an optional 'k' or 'L' suffix to a float.

    NOTE: this redefines the `convert_values` declared earlier in the notebook;
    from this cell onward the name refers to this version.

    Parameters
    ----------
    x : str, number or None
        Raw cell value such as '3k', '1.5L', '42' or the placeholder '--'.
        Values that are already numeric are accepted so the cell can be re-run.

    Returns
    -------
    float
        The numeric value; 'k' multiplies by 1,000 and 'L' by 100,000.
        NaN for '--', None or an existing NaN.

    Raises
    ------
    ValueError
        If a string is not numeric after removing its suffix.
    """
    if x is None:
        return float('nan')
    if not isinstance(x, str):
        # Already numeric (or NaN): `'k' in x` would raise TypeError here.
        return float(x)
    if 'k' in x:
        return float(x.replace('k', '')) * 1000
    elif 'L' in x:
        return float(x.replace('L', '')) * 100000
    elif x == '--':
        # Same missing-value placeholder the earlier definition handles.
        return float('nan')
    else:
        return float(x)  # Keep numeric values as they are

# Run convert_values on every AvgSalaries entry and store the result back in the same column
df['AvgSalaries'] = df['AvgSalaries'].apply(convert_values)

In [None]:
df  # display the frame after the conversions above

In [None]:
# Remove the raw 'Total Reviews' text column and the 'Total_Photos' column.
# The result is reassigned instead of dropped in place, and errors='ignore'
# makes the cell safe to re-run: an in-place drop raised KeyError the second
# time because the columns were already gone.
df = df.drop(columns=['Total Reviews', 'Total_Photos'], errors='ignore')
df

In [None]:
import pandas as pd

# Work on the frame cleaned in the cells above. The previous version re-read
# "/mnt/data/updated_Cleaned_Ambition_Box.csv", an absolute path to a file this
# notebook never writes, which discarded the in-memory cleaning and failed on
# any machine without that file.

# Fill missing Total_Jobs with the median of the same Industry_Type. Only the
# missing entries are touched, so rows whose Industry_Type is itself missing
# keep their existing value.
industry_median = df.groupby('Industry_Type')['Total_Jobs'].transform('median')
df['Total_Jobs'] = df['Total_Jobs'].fillna(industry_median)

# If there are still missing values (an industry with no known job counts),
# fill with the overall median. Assign the result back: calling
# fillna(inplace=True) on a selected column is deprecated and has no effect
# under pandas Copy-on-Write.
df['Total_Jobs'] = df['Total_Jobs'].fillna(df['Total_Jobs'].median())

# Convert Total_Jobs to integer (requires that no NaN is left)
df['Total_Jobs'] = df['Total_Jobs'].astype(int)

# Save the updated dataset next to the notebook instead of a sandbox-only path
df.to_csv("filled_Cleaned_Ambition_Box.csv", index=False)

print("Missing values in Total_Jobs filled successfully using data analysis, and datatype converted to int!")

In [None]:
# Write the current frame to Cleaned_Ambition_Box.csv without the index column
df.to_csv('Cleaned_Ambition_Box.csv', index=False)

In [None]:
df.dtypes  # dtype of every column