## Importing Libraries

In [1]:
#standard library
import re
import time
from configparser import ConfigParser

#third-party libraries
import pandas as pd

#selenium libraries
import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.keys import Keys

### Setting up a common launch pad for chrome browser with Webdriver

In [2]:
def get_drive_launch(url):
    """
    This function starts a Chrome browser session, opens the url passed as a parameter and returns the driver instance.
    The window is maximised and an implicit wait of 10 seconds is configured before the url is requested.
    Parameters:
        url - url of the website
    return:
        driver - driver instance of the chrome browser with the url loaded
    Raises:
        Any error raised while configuring the browser or loading the url is re-raised
        after the freshly started browser has been closed, so a failed launch does not
        leave a browser window behind.
    """
    #creating driver instance
    driver = webdriver.Chrome('./driver/chromedriver.exe')

    try:
        driver.maximize_window()

        #defining implicit wait
        driver.implicitly_wait(10)

        #launching the url
        driver.get(url)
    except BaseException:
        #the caller never receives the driver in this case, so it has to be closed here
        driver.quit()
        raise

    return driver


## Q1: Write a python program to scrape data for “Data Analyst” Job position in “Bangalore” location. You have to scrape the job-title, job-location, company_name, experience_required. You have to scrape first 10 jobs data.


This task will be done in following steps:
1. first get the webpage https://www.naukri.com/
2. Enter “Data Analyst” in “Skill,Designations,Companies” field and enter “Bangalore” in “enter the location” field.
3. Then click the search button.
4. Then scrape the data for the first 10 jobs results you get.
5. Finally create a dataframe of the scraped data. 

Note- All of the above steps have to be done in code. No step is to be done manually

In [3]:
def _tick_naukri_filter(driver, checkbox_xpath, expand_xpath):
    """
    Click the filter checkbox located by checkbox_xpath.
    If a webdriver error occurs on the first attempt, the element located by expand_xpath is
    clicked, the function waits 3 seconds and the checkbox is tried once more. An error on
    the second attempt is propagated to the caller.
    """
    try:
        driver.find_element_by_xpath(checkbox_xpath).click()
    except WebDriverException:
        #presumably the option only becomes clickable after the facet is expanded - confirm on the live page
        driver.find_element_by_xpath(expand_xpath).click()
        time.sleep(3)
        driver.find_element_by_xpath(checkbox_xpath).click()


def get_highlevel_details_naukri(job, location = None, salary_range=None, stop=None, use_filter=False):
    """
    This function fetches the highlevel details from the naukri website for the job title and filters provided.
    Parameters:
        job - job title typed into the keyword search box
        location - default None. When use_filter is False it is typed into the location search box,
                   when use_filter is True it is ticked in the location filter facet after the search
        salary_range - default None. When given, the matching salary checkbox is ticked after the search
                       (this happens regardless of use_filter)
        stop - number of jobs to fetch. default None (every job found on the first result page)
        use_filter - default False. If True the location is applied through the filter facet
    return:
        the driver instance (still open, the caller has to quit it) and a dataframe with the columns
        'Job Title', 'Company', 'Experience Required', 'Salary offered' and 'Location'
    Raises:
        Any error raised while scraping is re-raised after the browser has been closed.
    """
    titles=[]
    companies=[]
    exps=[]
    salaries=[]
    locs=[]

    #launching naukri site
    url = 'https://www.naukri.com/'
    drivr = get_drive_launch(url)

    try:
        #typing in the job title to search
        drivr.find_element_by_id("qsb-keyword-sugg").send_keys(job)

        #the location goes into the search form only when the filter facet is not requested
        if location and not use_filter:
            drivr.find_element_by_id('qsb-location-sugg').send_keys(location)
        time.sleep(2)

        #clicking on search
        drivr.find_element_by_xpath("//div[@class='search-btn']/button").click()

        #applying the location through the filter facet
        if use_filter and location:
            time.sleep(3)
            _tick_naukri_filter(
                drivr,
                "//span[@class = 'ellipsis fleft' and contains(text(), '" + location + "')]/../preceding-sibling::i",
                "//div[@data-filter-id='citiesGid']//a",
            )

        #applying the salary range filter
        if salary_range:
            _tick_naukri_filter(
                drivr,
                "//span[contains(text(), '" + salary_range + "')]/../preceding-sibling::i",
                "//div[@data-filter-id='salaryRange']//a",
            )
        time.sleep(3)

        #printing out the topic for search results
        print(drivr.find_element_by_xpath("//div[@class='sortAndH1Cont']/h1").text)

        #Fetching the web elements
        job_titles = drivr.find_elements_by_xpath("//div[@class='info fleft']/a")
        job_companies = drivr.find_elements_by_xpath("//div[@class='info fleft']/div/a[1]")
        job_exps = drivr.find_elements_by_xpath("//div[@class='info fleft']/ul/li[1]")
        job_salaries = drivr.find_elements_by_xpath("//div[@class='info fleft']/ul/li[2]")
        job_locations = drivr.find_elements_by_xpath("//div[@class='info fleft']/ul/li[3]")

        #iterating through the webelements to scrap the texts
        #NOTE(review): the five lists are paired purely by position; a job card that lacks one of the
        #fields would shift the following rows - verify against the live page
        for job_title, job_company, job_exp, job_salary, job_loc in zip(job_titles[:stop],job_companies[:stop],job_exps[:stop],job_salaries[:stop],job_locations[:stop]):
            titles.append(job_title.text)
            companies.append(job_company.text)
            exps.append(job_exp.text)
            salaries.append(job_salary.text)
            locs.append(job_loc.text)
    except BaseException:
        #the caller never receives the driver when scraping fails, so close the browser here
        drivr.quit()
        raise

    #returning the driver instance and the data as a dataframe
    return drivr, pd.DataFrame({'Job Title':titles, 'Company':companies, 'Experience Required':exps,'Salary offered':salaries, 'Location':locs})

In [4]:
#Q1: search 'Data Analyst' with the location typed into the search form and keep at most 10 jobs
d, dataFrame = get_highlevel_details_naukri('Data Analyst', location='Bangalore', stop=10)
#the scraped data is already in the dataframe, so the browser can be closed
d.quit()
#last expression of the cell - displays the scraped table
dataFrame

Data Analyst Jobs In Bangalore


Unnamed: 0,Job Title,Company,Experience Required,Salary offered,Location
0,Data Analyst Analyzing,Cistup Indian Institute of Science,2-5 Yrs,Not disclosed,Bangalore/Bengaluru
1,Senior Analyst-Data Visualization,Accenture Solutions Pvt Ltd,5-8 Yrs,Not disclosed,Bangalore/Bengaluru
2,Market Unit - Data Business Analyst (11),Accenture Solutions Pvt Ltd,1-2 Yrs,Not disclosed,Bangalore/Bengaluru
3,Data Analyst,Myntra Designs Pvt. Ltd.,3-6 Yrs,Not disclosed,Bangalore/Bengaluru
4,Analyst-Finance Data Maintenance,Accenture Solutions Pvt Ltd,3-5 Yrs,Not disclosed,Bangalore/Bengaluru
5,Senior Analyst-Data Management,Accenture Solutions Pvt Ltd,5-8 Yrs,Not disclosed,Bangalore/Bengaluru
6,Analyst-Data Management,Accenture Solutions Pvt Ltd,3-5 Yrs,Not disclosed,Bangalore/Bengaluru
7,Senior Analyst-Data Management,Accenture Solutions Pvt Ltd,5-8 Yrs,Not disclosed,Bangalore/Bengaluru
8,Senior Analyst-Data Visualization,Accenture Solutions Pvt Ltd,5-8 Yrs,Not disclosed,Bangalore/Bengaluru
9,Senior Analyst-Finance Data Maintenance,Accenture Solutions Pvt Ltd,5-8 Yrs,Not disclosed,Bangalore/Bengaluru


## Q2: Write a python program to scrape data for “Data Scientist” Job position in “Bangalore” location. You have to scrape the job-title, job-location, company_name, full job-description. You have to scrape first 10 jobs data.


This task will be done in following steps:
1. first get the webpage https://www.naukri.com/
2. Enter “Data Scientist” in “Skill,Designations,Companies” field and enter “Bangalore” in “enter the location” field.
3. Then click the search button.
4. Then scrape the data for the first 10 jobs results you get.
5. Finally create a dataframe of the scraped data.

Note- 
1. All of the above steps have to be done in code. No step is to be done manually.
2. Please note that you have to scrape full job description. For that you may have to open each job separately as shown below

In [5]:
def get_full_descrp_naukri(driver, stop = None):
    """
    This function opens each job ad of the currently displayed naukri result page and fetches its full job description.
    Parameters:
        driver - webdriver instance whose current window shows the search results
        stop - number of jobs to fetch. default None (every job link found on the page)
    return:
        the driver instance (switched back to the result window) and a list with one description text per job
    Raises:
        IndexError - when clicking a job link did not produce a new window/tab
        Webdriver errors raised when neither description container can be read; the job tab
        is closed and the result window re-selected before the error propagates.
    """
    full_descriptions = []

    #remember the result window
    main_window = driver.current_window_handle

    #close the other windows except the main window.
    for window in driver.window_handles:
        if window != main_window:
            driver.switch_to.window(window)
            driver.close()
    driver.switch_to.window(main_window)

    #get the webelements and iterate through each webelement to scrap the data.
    elems = driver.find_elements_by_xpath("//div[@class='info fleft']/a")
    for element in elems[:stop]:
        element.click()
        time.sleep(2)

        #identify the new tab by its handle instead of its position in the list:
        #taking index 1 would select (and later close) the main window whenever the new handle is listed first
        new_tabs = [handle for handle in driver.window_handles if handle != main_window]
        if not new_tabs:
            raise IndexError("clicking the job link did not open a new window")
        driver.switch_to.window(new_tabs[0])

        try:
            #scrap data - two different containers are tried for the description
            try:
                description = driver.find_element_by_xpath("//section[@class='job-desc']").text
            except WebDriverException:
                description = driver.find_element_by_xpath("//div[@class='clearboth description']").text
            full_descriptions.append(description)
        finally:
            #close the new tab and go back to main window, also when the scrape of this job failed
            driver.close()
            driver.switch_to.window(main_window)

    return driver, full_descriptions


In [6]:
#get the driver instance for the search keyword and filters
d, dataFrame = get_highlevel_details_naukri('Data Scientist', location='Bangalore', stop=10)

try:
    #pass the driver instance to get the full descriptions as a list and create a new column with it
    d, dataFrame['Full Description'] = get_full_descrp_naukri(d, 10)
finally:
    #quit the driver instance, also when fetching the descriptions failed
    d.quit()

#dropping unwanted columns (re-binding instead of inplace keeps the cell free of in-place mutation)
dataFrame = dataFrame.drop(columns=['Experience Required', 'Salary offered'])
dataFrame




Unnamed: 0,Job Title,Company,Location,Full Description
0,Data Scientist,CronJ IT Technologies Private Limited,Bangalore/Bengaluru,Job description\nResponsibilities and Duties\n...
1,Opportunity For Data Scientist Internship - Be...,Corner Stone Solutions,Bangalore/Bengaluru,Job description\nLocation - Bangalore / Bengal...
2,Data Scientist/ Analyst,Becton Dickinson India Pvt. Ltd,Bangalore/Bengaluru,Job description\nRoles and Responsibilities\no...
3,Data Scientist - Machine Learning,AugmatrixGo,Bangalore/Bengaluru,Job description\nRoles and Responsibilities\n\...
4,Data Scientist || Data Analyst || Data science,Inspiration Manpower Consultancy Pvt. Ltd.,"Navi Mumbai, Bangalore/Bengaluru",Job description\nJob description\nJob Summary ...
5,DBCG IND - GAMMA Senior Data Scientist,Boston Consulting Group,"Mumbai, New Delhi, Chennai, Bangalore/Bengaluru",Job description\n What Youll Do\n\nWe re lo...
6,Data Scientist/Senior Data Scientist,GANIT BUSINESS SOLUTIONS PRIVATE LIMITED,"Hyderabad/Secunderabad, Pune, Chennai, Bangalo...",Job description\n\nAbout Ganit Inc\n\nFounded ...
7,Senior Data Scientist | CES IT LTD | CMMI Level 5,CES Ltd.,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",Job description\nRoles and Responsibilities\n\...
8,Global Medical Data Scientist,GlaxoSmithKline Pharmaceuticals Limited,Bangalore/Bengaluru,This is an ideal role for an experienced candi...
9,Associate Data Scientist - CRM & Loyalty,Shell India Markets Private Limited,Bangalore/Bengaluru,Job description\nThe Role\nGeneral Position De...


In [7]:
#checking a full description: the complete 'Full Description' text of the row labelled 2
dataFrame.loc[2, 'Full Description']

'Job description\nRoles and Responsibilities\nob Description Summary:\nAs a member of the BD Advanced Analytics team, the Data Scientist role will collect, interpret, and deploy predictive analytics and machine learning models to assist our business partners in delivering advanced data analytics solutions to address key strategic business initiatives. The primary focus being the development and deployment of actionable insights that have a positive impact on BD’s business operations.\nJob Description:\nEvaluating business requirements and developing compelling user stories in collaboration with business stakeholders across various functions and regions\nCollecting, interpreting, preparing and modelling data to provide actionable insights\nPrototyping advanced data analytics solutions for presentation to business partners and stakeholders\nApplying predictive analytics and machine learning methods and techniques appropriately to address business requirements\nProvide expertise to guide 

## Q3: In this question you have to scrape data using the filters available on the webpage as shown below:


1. You have to use the location and salary filter.
2. You have to scrape data for “Data Scientist” designation for first 10 job results.
3. You have to scrape the job-title, job-location, company_name, experience_required.

1. The location filter to be used is “Delhi/NCR”
2. The salary filter to be used is “3-6” lakhs

The task will be done as shown in the below steps:
1. first get the webpage https://www.naukri.com/
2. Enter “Data Scientist” in the “Skill,Designations,Companies” field.
3. Then click the search button.
4. Then apply the location filter and salary filter by checking the respective boxes
5. Then scrape the data for the first 10 jobs results you get.
6. Finally create a dataframe of the scraped data.

Note- All of the above steps have to be done in code. No step is to be done 
manually.

In [8]:
#using the 'get_highlevel_details_naukri' function to fetch the results from naukri website
#use_filter=True applies the location through the filter facet instead of the search form
driver,dataFrame=get_highlevel_details_naukri('Data Scientist', location = 'Delhi / NCR', salary_range='3-6 Lakhs', stop=10, use_filter=True)
#the data is already in the dataframe, so the browser can be closed
driver.quit()

#dropping unwanted column (in place - the dataframe above is modified)
dataFrame.drop(['Salary offered'],axis=1,inplace=True)
dataFrame

Data Scientist Jobs


Unnamed: 0,Job Title,Company,Experience Required,Location
0,Developer - Data Science,ICL Systems India Private Limited,3-5 Yrs,Delhi / NCR
1,Data Scientist/Data Analyst - Python/Machine L...,Change leaders,5-10 Yrs,"Mumbai, Ghaziabad"
2,Data Scientist,Amity University,6-8 Yrs,"Ghaziabad, Faridabad, Delhi / NCR"
3,Data Scientist - Python & Machine Learning,FUTURES AND CAREERS,2-7 Yrs,"Hyderabad/Secunderabad, Pune, Bangalore/Bengal..."
4,Data Scientist - Python & Machine Learning,FUTURES AND CAREERS,2-7 Yrs,"Hyderabad/Secunderabad, Pune, Chennai, Bangalo..."
5,Data Scientist - Python / Machine Learning / T...,FUTURES AND CAREERS,3-8 Yrs,"Mumbai, Hyderabad/Secunderabad, Bangalore/Beng..."
6,Data Scientist - Python & Machine Learning,FUTURES AND CAREERS,2-7 Yrs,"Hyderabad/Secunderabad, Bangalore/Bengaluru, D..."
7,Hiring For Data Scientist,Max Bupa Health Insurance Company Limited,1-6 Yrs,"Gurgaon/Gurugram, Delhi / NCR"
8,Required- Data Scientist (NLP)-Axis Bank - 6 m...,Axis Bank Limited,4-9 Yrs,"Kolkata, New Delhi, Hyderabad/Secunderabad, Pu..."
9,Data Scientist - Python / Machine Learning / T...,FUTURES AND CAREERS,3-8 Yrs,"Hyderabad/Secunderabad, Bangalore/Bengaluru, D..."


## Q4: Write a python program to scrape data for first 10 job results for Data scientist Designation in Noida location. You have to scrape company_name, No. of days ago when job was posted, Rating of the company.


This task will be done in following steps:
1. first get the webpage https://www.glassdoor.co.in/index.htm
2. Enter “Data Scientist” in “Job Title,Keyword,Company” field and enter “Noida” in “location” field.
3. Then click the search button. You will land up in the below page:
4. Then scrape the data for the first 10 jobs results you get in the above shown page.
5. Finally create a dataframe of the scraped data.

Note- All of the above steps have to be done in code. No step is to be done
manually.

Glassdoor sometimes requires the user to log in before the pages can be viewed, so a login helper function is defined below.

In [30]:
def login_glassdoor(driver):
    """
    Sign in to Glassdoor through the given driver.
    The e-mail and the password are read from the 'credentials' section (keys 'user_name' and
    'password') of ./config.ini. The sign-in link is clicked, the two form fields are filled in,
    the 'Sign In' button is pressed and the driver is switched back to the window that was
    current when the function was called.
    Parameters:
        driver - webdriver instance showing a Glassdoor page
    return:
        driver - the same driver instance
    """
    #credentials are kept outside the notebook in a config file
    settings = ConfigParser()
    settings.read('./config.ini')
    form_values = (
        ("userEmail", settings.get('credentials','user_name')),
        ("userPassword", settings.get('credentials','password')),
    )

    #window to return to once the sign-in is done
    start_handle = driver.current_window_handle

    #two sign-in entry points are tried: the first by class name, then the navigation link
    try:
        driver.find_element_by_class_name("locked-home-sign-in").click()
    except NoSuchElementException:
        driver.find_element_by_xpath("//li[@class='sign-in']/a").click()
    time.sleep(3)

    #filling the form field by field and submitting it
    for field_id, value in form_values:
        driver.find_element_by_id(field_id).send_keys(value)
    driver.find_element_by_xpath("//button[contains(text(),'Sign In')]").click()
    time.sleep(3)

    driver.switch_to.window(start_handle)
    return driver

In [45]:
def _tick_glassdoor_filter(driver, checkbox_xpath, expand_xpath):
    """
    Click the filter checkbox located by checkbox_xpath.
    If a webdriver error occurs on the first attempt, the element located by expand_xpath is
    clicked, the function waits 3 seconds and the checkbox is tried once more. An error on
    the second attempt is propagated to the caller.
    """
    try:
        driver.find_element_by_xpath(checkbox_xpath).click()
    except WebDriverException:
        driver.find_element_by_xpath(expand_xpath).click()
        time.sleep(3)
        driver.find_element_by_xpath(checkbox_xpath).click()


def get_highlevel_details_glassdoor(job, location = None, salary_range=None, stop=None, use_filter=False):
    """
    This function logs in to Glassdoor, searches for the job and fetches the job titles, companies,
    age of the job posts, locations and ratings of the first result page as a dataframe.
    Parameters:
        job - search keyword typed into the keyword box
        location - default None. Typed into the location box when use_filter is False,
                   ticked in the filter facet when use_filter is True
        salary_range - default None. Only used when use_filter is True
        stop - number of jobs to fetch. default None (every job found on the page)
        use_filter - default False. If True location and salary_range are applied through the filter facets
    return:
        the driver instance (still open, the caller has to quit it) and a dataframe with the columns
        'Job Title', 'Company', 'Job Post Age', 'Location' and 'Ratings'. A job whose rating
        container has no text gets the rating "No Rating".
    Raises:
        Any error raised while scraping is re-raised after the browser has been closed.
    """

    titles=[]
    companies=[]
    no_days_posts=[]
    locs=[]
    ratings=[]

    #Glassdoor url
    url = 'https://www.glassdoor.co.in/index.htm'
    #launching the website.
    drivr = get_drive_launch(url)

    try:
        #logging in into Glassdoor site
        drivr = login_glassdoor(drivr)

        #entering the search keyword
        drivr.find_element_by_id("sc.keyword").send_keys(job)
        time.sleep(5)

        #Entering the location details: select the existing text, delete it, then type the location.
        #The input is looked up again before every step, as in the original flow.
        if location and not use_filter:
            location_input = "//input[@data-test='search-bar-location-input']"
            drivr.find_element_by_xpath(location_input).send_keys(Keys.CONTROL + "a")
            drivr.find_element_by_xpath(location_input).send_keys(Keys.DELETE)
            drivr.find_element_by_xpath(location_input).send_keys(location)
        time.sleep(2)
        drivr.find_element_by_xpath("//button[@type='submit']").click()

        #Applying the filter facets
        #NOTE(review): these selectors are identical to the ones used for the naukri site in this
        #notebook - confirm that they exist on Glassdoor before relying on use_filter=True
        if use_filter:
            if location:
                time.sleep(3)
                _tick_glassdoor_filter(
                    drivr,
                    "//span[@class = 'ellipsis fleft' and contains(text(), '" + location + "')]/../preceding-sibling::i",
                    "//div[@data-filter-id='citiesGid']//a",
                )

            if salary_range:
                _tick_glassdoor_filter(
                    drivr,
                    "//span[contains(text(), '" + salary_range + "')]/../preceding-sibling::i",
                    "//div[@data-filter-id='salaryRange']//a",
                )

        time.sleep(5)

        #Printing the search result topic
        print(drivr.find_element_by_xpath("//h1[@data-test='jobTitle']").text)

        #Fetching the web elements
        rating_elems = drivr.find_elements_by_xpath("//div[@class='d-flex flex-column css-x75kgh e1rrn5ka3']")
        job_titles = drivr.find_elements_by_xpath("//a[@data-test='job-link']/span")
        job_companies = drivr.find_elements_by_xpath("//div[@class='d-flex justify-content-between align-items-start']//a/span")
        job_days = drivr.find_elements_by_xpath("//div[@data-test='job-age']")
        job_locations = drivr.find_elements_by_xpath("//div[@class='d-flex flex-wrap css-11d3uq0 e1rrn5ka2']/span")

        #Fetching texts from the webelements.
        #The ratings are zipped together with the other columns so that all five lists always have
        #the same length; building them separately made pd.DataFrame fail on unequal element counts.
        for rating_elem, job_title, job_company, job_day, job_loc in zip(rating_elems[:stop],job_titles[:stop],job_companies[:stop],job_days[:stop],job_locations[:stop]):
            try:
                rating = rating_elem.text
            except WebDriverException:
                rating = ""
            ratings.append(rating if rating else "No Rating")
            titles.append(job_title.text.strip())
            companies.append(job_company.text.strip())
            no_days_posts.append(job_day.text.strip())
            locs.append(job_loc.text.strip())
    except BaseException:
        #the caller never receives the driver when scraping fails, so close the browser here
        drivr.quit()
        raise

    #Returning the driver instance and the dataframe
    return drivr, pd.DataFrame({'Job Title':titles, 'Company':companies, 'Job Post Age':no_days_posts, 'Location':locs, 'Ratings':ratings})

In [46]:
#Scraping data: 'Noida' is passed positionally as the location, at most 10 jobs are kept
driver, dataframe = get_highlevel_details_glassdoor('Data Scientist','Noida',stop=10)
#the data is already in the dataframe, so the browser can be closed
driver.quit()

#Selecting only the columns asked for in the question
dataframe=dataframe[['Company', 'Job Post Age', 'Ratings']]
dataframe

Data scientist Jobs in Noida


Unnamed: 0,Company,Job Post Age,Ratings
0,Salasar New Age Technologies,30d+,No Rating
1,Biz2Credit Inc,30d+,3.8
2,Techlive,30d+,5.0
3,Adobe,6d,4.4
4,SearchUrCollege,30d+,No Rating
5,CRMNEXT,12d,3.5
6,Microsoft,30d+,4.4
7,Salasar New Age Technologies,30d+,No Rating
8,Unyscape Infocom Pvt. Ltd,30d+,4.1
9,Genpact,1d,3.8


## Q5: Write a python program to scrape the salary data for Data Scientist designation in Noida location. You have to scrape Company name, Number of salaries, Average salary, Min salary, Max Salary.


The above task will be done as shown in the steps below:
1. first get the webpage https://www.glassdoor.co.in/Salaries/index.htm
2. Enter “Data Scientist” in Job title field and “Noida” in location field.
3. Click the search button.
4. After that you will land on the page below. You have to scrape the whole data from this webpage.
5. Scrape data for first 10 companies. Scrape the min salary, max salary, company name, Average salary and rating of the company.
6. Store the data in a dataframe.

Note that all of the above steps have to be done by coding only and not manually.

In [47]:
def get_salary_details_glassdoor(url, job_title, location, stop=None):
    """
    This function logs in to Glassdoor, searches the salaries for the job title and location and fetches
    company, number of salaries, average salary, minimum salary and maximum salary as a dataframe.
    Result pages are followed through the 'next' arrow until enough rows are collected.
    Parameters:
        url - url of the Glassdoor salary search page
        job_title - text typed into the job title field
        location - text typed into the location field
        stop - number of rows to fetch. default None, which means no limit: pages are followed
               until there is no next page
    return:
        the driver instance of the last page visited (still open, the caller has to quit it) and a
        dataframe with the columns 'Company', 'Number of Salaries', 'Average Salary',
        'Minimum Salary' and 'Maximum Salary'. When fewer rows than requested are available,
        the rows collected so far are returned.
    Raises:
        Any other error raised while scraping is re-raised after the open browser has been closed.
    """
    companies = []
    min_salaries = []
    max_salaries = []
    avg_salaries = []
    num_salaries = []

    #stop=None used to end in "len(companies) < None" (TypeError); treat it as "no limit"
    limit = float('inf') if stop is None else stop

    #Launch the site
    drivr = get_drive_launch(url)

    try:
        #log in into the site
        drivr = login_glassdoor(drivr)

        #Entering the search keyword and location (existing location text is selected and deleted first)
        drivr.find_element_by_id("KeywordSearch").send_keys(job_title)
        drivr.find_element_by_id("LocationSearch").send_keys(Keys.CONTROL + "a")
        drivr.find_element_by_id("LocationSearch").send_keys(Keys.DELETE)
        drivr.find_element_by_id("LocationSearch").send_keys(location)
        drivr.find_element_by_id("HeroSearchButton").click()
        time.sleep(2)

        #Looping till we reach the number of rows requested by user
        while len(companies) < limit:
            #Get the web elements
            company_elems = drivr.find_elements_by_xpath("//div[@data-test='job-info']/p[2]")
            num_sal_elems = drivr.find_elements_by_xpath("//div[@data-test='job-info']/p[@class='css-1uyte9r css-1kuy7z7 m-0 ']")
            avg_sal_elems = drivr.find_elements_by_xpath("//div[@class='col-2 d-none d-md-flex flex-row justify-content-end']")
            min_sal_elems = drivr.find_elements_by_xpath("//div[@class='common__RangeBarStyle__values d-flex justify-content-between ']/span[1]")
            max_sal_elems = drivr.find_elements_by_xpath("//div[@class='common__RangeBarStyle__values d-flex justify-content-between ']/span[2]")

            #Loop over the web elements and fetch the data
            for company, num_sal, avg_sal, min_sal, max_sal in zip(company_elems, num_sal_elems, avg_sal_elems, min_sal_elems, max_sal_elems):
                companies.append(company.text)
                num_salaries.append(num_sal.text)
                avg_salaries.append(avg_sal.text.replace('\n',''))
                min_salaries.append(min_sal.text)
                max_salaries.append(max_sal.text)

                #Break if the requested number of rows are achieved within for loop
                if len(companies) >= limit:
                    break
            #Break if the requested number of rows are achieved within while loop
            if len(companies) >= limit:
                break

            #Get the url to the next page; without a next page return what has been collected
            try:
                next_url = drivr.find_element_by_xpath("//a[@class='pagination__ArrowStyle__nextArrow  ']").get_attribute('href')
            except NoSuchElementException:
                break
            if not next_url:
                break

            #quit the browser driver and launch a new driver with the url to the next page and repeat the steps above
            #NOTE(review): the new driver is not passed through login_glassdoor again - confirm the
            #following pages are reachable without a login
            drivr.quit()
            drivr = None
            drivr = get_drive_launch(next_url)
            time.sleep(2)
    except BaseException:
        #close whichever browser is still open before reporting the failure
        if drivr is not None:
            drivr.quit()
        raise

    #Create an output dataframe with the data fetched and return
    dataframe = pd.DataFrame({'Company': companies, 'Number of Salaries': num_salaries, 'Average Salary': avg_salaries, 'Minimum Salary': min_salaries, 'Maximum Salary': max_salaries})
    return drivr, dataframe
    

In [48]:
#Pass the salary search url and fetch the first 10 rows for 'Data Scientist' in 'Noida'
url = 'https://www.glassdoor.co.in/Salaries/index.htm'
d, df = get_salary_details_glassdoor(url, 'Data Scientist', 'Noida', 10)
#Quit the driver returned by the function (the data is already in the dataframe)
d.quit()
df

Unnamed: 0,Company,Number of Salaries,Average Salary,Minimum Salary,Maximum Salary
0,Tata Consultancy Services,14 salaries,"₹ 6,01,000/yr",₹336L,"₹1,080L"
1,Accenture,14 salaries,"₹ 11,51,207/yr",₹579L,"₹2,222L"
2,Delhivery,14 salaries,"₹ 12,34,207/yr",₹452L,"₹11,669L"
3,IBM,13 salaries,"₹ 7,63,825/yr",₹589L,"₹2,741L"
4,Ericsson-Worldwide,12 salaries,"₹ 7,32,209/yr",₹350L,"₹1,619L"
5,UnitedHealth Group,10 salaries,"₹ 13,88,910/yr","₹1,050L","₹1,500L"
6,Valiance Solutions,9 salaries,"₹ 8,18,515/yr",₹504L,"₹1,471L"
7,Innovaccer,8 salaries,"₹ 12,01,403/yr",₹623L,"₹1,702L"
8,ZS Associates,7 salaries,"₹ 10,00,000/yr",₹203L,"₹1,817L"
9,EXL Service,7 salaries,"₹ 11,90,000/yr",₹578L,"₹1,500L"


## Q6 : Scrape data of first 100 sunglasses listings on flipkart.com. You have to scrape four attributes:
1. Brand
2. Product Description
3. Price
4. Discount %
The attributes which you have to scrape are tick-marked in the image below.


To scrape the data you have to go through following steps:
1. Go to flipkart webpage by url https://www.flipkart.com/
2. Enter “sunglasses” in the search field where “search for products, brands and more” is written and click the search icon
3. after that you will reach to a webpage having a lot of sunglasses. From this page you can scrap the required data as usual.
4. after scraping data from the first page, go to the “Next” Button at the bottom of the page , then click on it
5. Now scrape data from this page as usual
6. repeat this until you get data for 100 sunglasses.

Note that all of the above steps have to be done by coding only and not manually.

In [34]:
def get_product_highlevel_details_flipkart(product_name, no_of_products = 100):
    """
    This function searches flipkart for the product name and fetches the highlevel data of the listed
    products, following the 'Next' link until the requested number of products is collected.
    Parameters:
        product_name - text typed into the search box
        no_of_products - number of products to fetch. default 100
    return:
        the driver instance (still open, the caller has to quit it) and a dataframe with the columns
        'Product Brand', 'Product Description', 'Product Price' and 'Product Discount'.
        A product without a discount element gets "0% off". When there is no 'Next' link before
        the requested number is reached, the products collected so far are returned.
    Raises:
        Any other error raised while scraping is re-raised after the browser has been closed.
    """

    product_names = []
    product_descriptions = []
    product_prices = []
    product_discounts = []

    #launch the flipkart site
    url = 'https://www.flipkart.com/'
    driver = get_drive_launch(url)

    try:
        #closing the '✕' button when it is there (presumably a pop-up shown on the landing page - confirm);
        #its absence must not abort the whole scrape
        try:
            driver.find_element_by_xpath("//button[contains(text(),'✕')]").click()
        except NoSuchElementException:
            pass

        #searching the product
        driver.find_element_by_xpath("//input[@title='Search for products, brands and more']").send_keys(product_name)
        driver.find_element_by_xpath("//button[@type='submit']").click()
        time.sleep(2)

        #Looping until the requested number of products are fetched
        while len(product_names) < no_of_products:
            #Getting the web elements
            product_name_elems = driver.find_elements_by_xpath("//div[@class='_2WkVRV']")
            product_description_elems = driver.find_elements_by_xpath("//div[@class='_2WkVRV']/../a[not(@class='_3bPFwb')]")
            price_elems= driver.find_elements_by_xpath("//a[@class='_3bPFwb']//div[@class='_30jeq3']")
            price_discount_elems = driver.find_elements_by_xpath("//a[@class='_3bPFwb']")

            #Looping over the web elements and fetching the texts
            for prod_name_elem, product_description_elem, price_elem, price_discount_elem in zip(product_name_elems,product_description_elems,price_elems,price_discount_elems):
                product_names.append(prod_name_elem.text.strip())
                product_descriptions.append(product_description_elem.text.strip())
                product_prices.append(price_elem.text.strip())

                #the discount element is optional; a missing or empty one counts as no discount
                try:
                    discount = price_discount_elem.find_element_by_class_name("_3Ay6Sb").text
                except WebDriverException:
                    discount = None
                product_discounts.append(discount or "0% off")

                #Break if number of products requested is obtained within for loop
                if len(product_names) >= no_of_products:
                    break

            #Break if number of products requested is obtained within While loop
            if len(product_names) >= no_of_products:
                break

            #Go to the next page if more products are required; stop when there is no further page
            try:
                driver.find_element_by_xpath("//span[contains(text(),'Next')]").click()
            except NoSuchElementException:
                break
            time.sleep(3)
    except BaseException:
        #the caller never receives the driver when scraping fails, so close the browser here
        driver.quit()
        raise

    #Create a dataframe and return driver instance and the dataframe
    dataframe = pd.DataFrame({'Product Brand': product_names, 'Product Description' : product_descriptions, 'Product Price' : product_prices,'Product Discount':product_discounts})
    return driver, dataframe

In [35]:
#Fetching data: search 'sunglasses' and collect 100 listings
d, dataFrame = get_product_highlevel_details_flipkart('sunglasses', 100)

#the data is already in the dataframe, so the browser can be closed
d.quit()
dataFrame

Unnamed: 0,Product Brand,Product Description,Product Price,Product Discount
0,Fastrack,UV Protection Wayfarer Sunglasses (Free Size),₹758,15% off
1,Fastrack,UV Protection Rectangular Sunglasses (Free Size),₹695,13% off
2,Fastrack,"Mirrored, UV Protection Wayfarer Sunglasses (F...",₹499,50% off
3,PIRASO,UV Protection Aviator Sunglasses (Free Size),₹314,80% off
4,PIRASO,UV Protection Aviator Sunglasses (54),₹225,85% off
...,...,...,...,...
95,NuVew,"UV Protection, Gradient, Night Vision, Mirrore...",₹377,69% off
96,Fastrack,UV Protection Round Sunglasses (52),"₹1,243",4% off
97,ROZZETTA CRAFT,"UV Protection, Gradient Round Sunglasses (Free...",₹426,84% off
98,FLYING MACHINE,UV Protection Wayfarer Sunglasses (Free Size),₹686,57% off


## Q7: Scrape 100 reviews data from flipkart.com for iphone11 phone. You have to go the link: https://www.flipkart.com/apple-iphone-11-black-64-gb-includesearpods-poweradapter/p/itm0f37c2240b217?pid=MOBFKCTSVZAXUHGR&lid=LSTMOBFKCTSVZAXUHGREPBFGI&marketplace. 


When you will open the above link you will reach to the below shown webpage.

As shown in the above page you have to scrape the tick marked attributes.
These are 
1. Rating 
2. Review_summary 
3. Full review

You have to scrape this data for first 100 reviews.

In [36]:
def get_reviews_flipkart(product_url, no_of_reviews):
    """
    This function opens a Flipkart product details page, follows its link to the reviews listing and scrapes
    the rating, review summary and full review text from the listing, page by page.
    Parameters:
        product_url - url of the product details page
        no_of_reviews - maximum number of reviews to fetch
    return:
        drivr - driver instance left open on the last reviews page visited (the caller has to quit it)
        dataframe - dataframe with the columns 'Rating', 'Review Summary' and 'Full Review'. It holds fewer
                    than no_of_reviews rows when the listing has no further 'Next' page.
    """

    ratings = []
    review_summaries = []
    full_reviews = []

    #launch the product page only to read the url of the reviews listing
    drivr = get_drive_launch(product_url)
    try:
        product_reviews_url = drivr.find_element_by_xpath("//div[@class='_3UAT2v _16PBlm']/..").get_attribute('href')
    finally:
        #the product page browser is not needed any more, even when the link could not be found
        drivr.quit()

    #launch the review page using the url fetched from the product page
    drivr = get_drive_launch(product_reviews_url)
    try:
        time.sleep(2)

        #Run a while loop until the number of reviews requested is obtained
        while len(ratings) < no_of_reviews:
            #get the webelements
            rating_elems = drivr.find_elements_by_xpath("//div[@class='_3LWZlK _1BLPMq']")
            review_summary_elems = drivr.find_elements_by_xpath("//p[@class='_2-N8zT']")
            full_review_elems = drivr.find_elements_by_xpath("//div[@class='t-ZTKy']")

            #the three lists are paired by position; keep only as many rows as are still missing
            page_rows = list(zip(rating_elems, review_summary_elems, full_review_elems))
            still_needed = no_of_reviews - len(ratings)
            for rating_elem, review_summary_elem, full_review_elem in page_rows[:still_needed]:
                ratings.append(rating_elem.text.strip())
                review_summaries.append(review_summary_elem.text.strip())
                full_reviews.append(full_review_elem.text.strip())

            #Stop as soon as the number of reviews requested is obtained
            if len(ratings) >= no_of_reviews:
                break

            #Go to the next page if the number of reviews is not met yet
            try:
                drivr.find_element_by_xpath("//span[contains(text(),'Next')]").click()
            except NoSuchElementException:
                #no 'Next' control on the page: return what has been collected instead of failing
                break
            time.sleep(3)
    except Exception:
        #do not leave an orphaned browser behind when scraping fails
        drivr.quit()
        raise

    #create and return the dataframe output and driver instance
    dataframe = pd.DataFrame({'Rating': ratings, 'Review Summary' : review_summaries, 'Full Review' : full_reviews})
    return drivr, dataframe

In [49]:
#Product details page of the phone whose reviews are scraped
#NOTE(review): the slug contains '%02' (a percent-encoded control character), presumably left over from copying
#the link out of the task document - the cell output below shows the page still resolved; confirm before reuse
product_url = 'https://www.flipkart.com/apple-iphone-11-black-64-gb-includes%02earpods-power%02adapter/p/itm0f37c2240b217?pid=MOBFKCTSVZAXUHGR&lid=LSTMOBFKCTSVZAXUHGREPBFGI&marketplace'
#Fetch 100 reviews; the call returns the open driver and the dataframe of reviews
d, DataFrame = get_reviews_flipkart(product_url, 100)
#Close the browser, then display the dataframe as the cell output
d.quit()
DataFrame

Unnamed: 0,Rating,Review Summary,Full Review
0,5,Brilliant,The Best Phone for the Money\n\nThe iPhone 11 ...
1,5,Perfect product!,Amazing phone with great cameras and better ba...
2,5,Worth every penny,Previously I was using one plus 3t it was a gr...
3,5,Great product,Amazing Powerful and Durable Gadget.\n\nI’m am...
4,5,Highly recommended,iphone 11 is a very good phone to buy only if ...
...,...,...,...
95,5,Super!,I’d like to start by saying that the overall e...
96,5,Terrific,Have used both iPhone X and iPhone XR and I ca...
97,4,Very Good,impressive super phone and best in class camer...
98,5,Terrific,Good buy.. working perfectly !\n\nThat was upg...


In [50]:
#Verifying a full review: show the untruncated 'Full Review' text of the row labelled 1
#(the dataframe display above shortens long strings with '...')
DataFrame.loc[1,'Full Review']

'Amazing phone with great cameras and better battery which gives you the best performance. I just love the camera .'

## Q8: Scrape data for first 100 sneakers you find when you visit flipkart.com and search for “sneakers” in the search field.

You have to scrape 4 attributes of each sneaker :
1. Brand
2. Product Description
3. Price
4. discount %

As shown in the below image, you have to scrape the tick marked attributes

Also note that all the steps required during scraping should be done through code 
only and not manually

In [38]:
#Reusing the same get_product_highlevel_details_flipkart(product_name, no_of_products) for this problem:
#request 100 products for the search term 'sneakers'
d, dataFrame = get_product_highlevel_details_flipkart('sneakers', 100)
#Close the browser, then display the dataframe as the cell output
d.quit()
dataFrame

Unnamed: 0,Product Brand,Product Description,Product Price,Product Discount
0,French Connection,Sneakers For Men,₹799,60% off
1,ROCKFIELD,Sneakers For Men,₹450,54% off
2,Chevit,Perfect & Affordable Combo Pack of 02 Pairs Sn...,₹499,72% off
3,Robbie jones,Casual Sneakers Shoes For Men Sneakers For Men,₹399,60% off
4,Chevit,Combo Pack of 4 Casual Sneakers With Sneakers ...,₹474,76% off
...,...,...,...,...
95,Englewood,White Shoes For Men | Casual White Laceups Sho...,₹496,66% off
96,Chevit,Men's Combo Pack of 02 Shoes for Men Casual Sn...,₹420,57% off
97,Ktiz,Rockstyle Trending Multicolor Ultralight canva...,₹418,67% off
98,ESSENCE,Sneakers For Men,₹442,55% off


## Q9: Go to the link https://www.myntra.com/shoes, set the Price filter to “Rs. 6649 to Rs. 13099” and the Color filter to “Black”, and then scrape the first 100 shoes' data you get. The data should include the “Brand” of the shoes, the short shoe description, and the price of the shoe. Please note that everything (applying the filters and scraping the data) should be done through code only, and there should not be any manual step.

In [39]:
def get_product_details_myntra(url, num_of_items = 10, use_filter = False, price_range_item_num = 1, color = 'White'):
    """
    This function fetches the brand name, product short description and price from a Myntra product list page,
    following the 'next' page link until enough items are collected.
    The function will take the following parameters as input:
    url - Url of the product list page
    num_of_items - maximum number of items to fetch. default = 10
    use_filter - whether to apply the price and colour filter facets before scraping. default = False
    price_range_item_num - 1-based position of the option to click in the price range filter list. default = 1
    color - the label text to look for in the 'colour' filter facet. default = 'White'
    Returns:
        driver instance - left open on the last page visited (the caller has to quit it)
        dataframe - columns 'Product Name', 'Product Short Description' and 'Price in INR'. It holds fewer than
                    num_of_items rows when the listing has no further page.
    """

    product_names = []
    product_short_desc = []
    prices = []

    #launch the url
    drivr = get_drive_launch(url)
    try:
        #select the filter values
        if use_filter:
            drivr.find_element_by_xpath("//ul[@class='price-list']/li[{}]".format(price_range_item_num)).click()
            time.sleep(2)
            colour_xpath = "//li[@class='colour-listItem']/label[(contains(text(),'{}'))]".format(color)
            try:
                drivr.find_element_by_xpath(colour_xpath).click()
            except Exception:
                #the colour is not reachable in the short list: expand the list and try once more
                drivr.find_element_by_class_name("colour-more").click()
                drivr.find_element_by_xpath(colour_xpath).click()
            time.sleep(2)

        #Loop until the number of items requested by the user is met
        while len(product_names) < num_of_items:
            #Get the web elements
            brand_name_elems = drivr.find_elements_by_xpath("//h3[@class='product-brand']")
            short_desc_elems = drivr.find_elements_by_xpath("//h4[@class='product-product']")
            price_elems = drivr.find_elements_by_xpath("//div[@class='product-price']")

            #the three lists are paired by position; keep only as many rows as are still missing
            page_rows = list(zip(brand_name_elems, short_desc_elems, price_elems))
            still_needed = num_of_items - len(product_names)
            for brand_name, short_desc, price in page_rows[:still_needed]:
                product_names.append(brand_name.text)
                product_short_desc.append(short_desc.text)
                #keep the text that follows the first 'Rs. '; fall back to the raw text when the prefix is absent
                price_parts = price.text.split('Rs. ')
                prices.append(price_parts[1] if len(price_parts) > 1 else price.text)

            #Stop as soon as the number of items requested is met
            if len(product_names) >= num_of_items:
                break

            #Go to the next page if more items are needed
            try:
                next_url = drivr.find_element_by_xpath("//a[@rel='next']").get_attribute('href')
            except NoSuchElementException:
                #no further page: return what has been collected instead of failing
                break
            if not next_url:
                break
            #navigate in the same browser instead of quitting and launching a new one for every page
            drivr.get(next_url)
            time.sleep(3)
    except Exception:
        #do not leave an orphaned browser behind when scraping fails
        drivr.quit()
        raise

    #Create and return the output dataframe and driver instance
    datafrm = pd.DataFrame({'Product Name':product_names, 'Product Short Description': product_short_desc, 'Price in INR':prices})
    return drivr, datafrm


In [40]:
url = 'https://www.myntra.com/shoes'
#Fetch data from Myntra: num_of_items=100, use_filter=True, price_range_item_num=2 (second entry of the
#price range list), color='Black'
d, df = get_product_details_myntra(url, 100, True, 2, 'Black')
#Close the browser, then display the dataframe as the cell output
d.quit()
df


Unnamed: 0,Product Name,Product Short Description,Price in INR
0,Nike,Men AIR ZOOM Running Shoes,11470
1,Puma,Men Fuse Training Sports Shoes,7999
2,Nike,Men KD13 EP Basketball Shoes,12995
3,Nike,Men AIR MAX INFINITY 2 Sneaker,7050
4,Nike,Men REACT MILER Running Shoes,9345
...,...,...,...
95,Geox,Women Leather Solid Pumps,11990
96,Geox,Women Solid Wedges,8490
97,Hush Puppies,Men Solid Leather Formal Derbys,9999
98,Heel & Buckle London,Men Leather Formal Brogues,7693


## Q10: Go to the webpage https://www.amazon.in/ and enter “Laptop” in the search field, then click the search icon. Then set the CPU Type filter to “Intel Core i7” and “Intel Core i9”. After setting the filters, scrape the first 10 laptops' data. 


You have to scrape 3 attributes for each laptop:
1. title
2. Ratings
3. Price

### Logic used
This is a dynamic website. The CPU Type filter facet sometimes allows us to select multiple options and sometimes allows us to select only one option at a time. Hence I have written the following logic to overcome this:
1. Select the 1st CPU Type value passed in the cpu_types list and fetch the data.
2. Click Clear, and then use the next value in the cpu_types list in the CPU Type filter facet and fetch the data.
3. Repeat until the cpu_types list is exhausted.
4. Finally, based on the number of products requested, display the data approximately equally for all the CPU types used.

Note: Since the price and rating information was not easy to fetch from the product list page using xpath, I have used regex to extract this information from the product list page itself.

In [44]:
def _select_rows_per_cpu_type(rows_by_cpu, limit):
    """Pick limit // (number of cpu types) leading rows of every cpu type, then top up from the unused rows."""
    per_type = limit // len(rows_by_cpu)
    selected = []
    leftovers = []
    for cpu_rows in rows_by_cpu:
        selected.extend(cpu_rows[:per_type])
        leftovers.extend(cpu_rows[per_type:])
    #if the number of rows falls short, fill it with the last rows that were not selected yet
    shortfall = limit - len(selected)
    if shortfall > 0:
        selected.extend(leftovers[-shortfall:])
    return selected


def get_highlevel_details_amazon(url, limit, cpu_types = ['all']):
    """
    This function searches the website for 'laptop', applies every entry of cpu_types as CPU Type filter in turn
    and scrapes the title, rating and price of the listed products.
    Parameters:
        url - url of the website
        limit - number of rows to return; also the maximum number of rows scraped per cpu type
        cpu_types - list of CPU Type filter labels to apply one after another. default = ['all'].
                    A label that cannot be clicked is skipped silently and the results on the page are
                    scraped as they are. An empty list (or None) is treated like the default.
    return:
        drivr - driver instance left open on the last page visited (the caller has to quit it)
        new_df - dataframe with the columns 'Product Name', 'Rating', 'Price' and 'CPU Type', holding about
                 limit // len(cpu_types) rows per cpu type and at most limit rows in total
    """
    #an empty list would divide by zero when the rows are shared between the cpu types
    if not cpu_types:
        cpu_types = ['all']

    columns = ['Product Name', 'Rating', 'Price', 'CPU Type']

    #Regex patterns to fetch ratings and prices from the text of a product card
    rating_pattern = r'\d\.\d+ out of | \d out of'
    price_pattern = r'₹\d+,\d+,\d+|₹\d+,\d+'

    #one list of rows per cpu type, so that a short result for one type cannot shift the rows of the next
    rows_by_cpu = []

    #launching the website and searching with the search keyword
    drivr = get_drive_launch(url)
    try:
        drivr.find_element_by_id("twotabsearchtextbox").send_keys("laptop")
        drivr.find_element_by_id("nav-search-submit-text").click()
        time.sleep(2)

        #Looping through the cpu type list to fetch data for the 'cpu type' input provided by the user
        for cpu_type in cpu_types:
            #Find and click the cpu type
            try:
                drivr.find_element_by_xpath("//li[@aria-label='"+cpu_type+"']//span").click()
                time.sleep(2)
            except Exception:
                #best effort: a label without a matching facet entry (e.g. 'all') leaves the results unfiltered
                pass

            cpu_rows = []
            #Run a while loop until the number of rows requested by the user is met for this cpu type
            while len(cpu_rows) < limit:
                #getting the webelements
                product_elems = drivr.find_elements_by_xpath("//div[@class='a-section a-spacing-medium']")
                titles_elems = drivr.find_elements_by_xpath("//h2[@class='a-size-mini a-spacing-none a-color-base s-line-clamp-2']//span")

                #titles and product cards are paired by position; keep only as many rows as are still missing
                page_rows = list(zip(titles_elems, product_elems))
                still_needed = limit - len(cpu_rows)
                for title, prod_elem in page_rows[:still_needed]:
                    #read the card text once and extract rating and price based on the string patterns
                    card_text = prod_elem.get_attribute('innerText') or ''
                    rating = re.findall(rating_pattern, card_text)
                    price = re.findall(price_pattern, card_text)
                    cpu_rows.append({
                        'Product Name': title.text,
                        'Rating': rating[0].split()[0] if rating else 'No rating',
                        'Price': price[0] if price else 'No price',
                        'CPU Type': cpu_type,
                    })

                #Stop as soon as the number of rows requested is met
                if len(cpu_rows) >= limit:
                    break

                #Go to the next page if more data is required
                try:
                    next_url = drivr.find_element_by_xpath("//a[contains(text(),'Next')]").get_attribute('href')
                except NoSuchElementException:
                    break
                if not next_url:
                    break
                #navigate in the same browser; launching a new one here would leave the old one running
                drivr.get(next_url)
                time.sleep(2)

            rows_by_cpu.append(cpu_rows)

            #Clearing the CPU TYPE filter in the website so that the new cpu type can be used
            try:
                drivr.find_element_by_xpath("//span[contains(text(),'Clear')]").click()
            except Exception:
                #best effort: there is nothing to clear when no filter was applied
                pass
            time.sleep(2)
    except Exception:
        #do not leave an orphaned browser behind when scraping fails
        drivr.quit()
        raise

    #Pick only the top few rows of each cpu type to match the number of rows requested by the user
    new_df = pd.DataFrame(_select_rows_per_cpu_type(rows_by_cpu, limit), columns=columns)
    #return the driver and the new dataframe
    return drivr, new_df


In [43]:
#Call the get_highlevel_details_amazon() function and pass the url, the number of rows to return (10)
#and the cpu type list as input
url = 'https://www.amazon.in/'
cpu_types = ['Intel Core i7','Intel Core i9']
d, df = get_highlevel_details_amazon(url, 10, cpu_types)
#Close the browser, then display the dataframe as the cell output
d.quit()
df

Unnamed: 0,Product Name,Rating,Price,CPU Type
0,(Renewed) HP ZBook 15 G3 Mobile Workstation - ...,No rating,"₹83,990",Intel Core i7
1,(Renewed) Lenovo Thinkpad Yoga S1 Laptop (CORE...,1.0,"₹38,990",Intel Core i7
2,HP 14 Thin & Light 14-inch FHD Laptop (11th Ge...,4.6,"₹76,500",Intel Core i7
3,Mi Notebook Horizon Edition 14 Intel Core i5-1...,4.3,"₹54,999",Intel Core i7
4,HP Pavilion Gaming 11th Gen Intel Core i7 Proc...,No rating,"₹83,128",Intel Core i7
5,ASUS ZenBook Pro Duo Intel Core i9-10980HK 10t...,3.3,"₹2,59,990",Intel Core i9
6,Dell XPS 9570 15.6-inch UHD Laptop (8th Gen i9...,2.3,"₹2,27,200",Intel Core i9
7,"ASUS ROG Strix Scar 17 (2020), 17.3"" FHD 300Hz...",4.8,"₹2,78,490",Intel Core i9
8,"ASUS ROG Strix Scar 15 (2020), 15.6"" FHD 300Hz...",4.0,"₹1,99,698",Intel Core i9
9,"Apple MacBook Pro (16-inch, 16GB RAM, 1TB Stor...",3.7,"₹2,24,900",Intel Core i9
