# Assessment Recommendation System

## Importing Libraries

In [163]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs
import logging


## Setting Up Logger

In [175]:
# Root configuration: gives records from other modules a default destination.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Set the level explicitly: basicConfig() does nothing when the root logger
# already has handlers, which would leave this logger at the root's level.
logger.setLevel(logging.INFO)

console_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
console_handler.setFormatter(formatter)

# Attach the handler only once so re-running this cell does not stack
# additional handlers on the same named logger.
if not logger.handlers:
    logger.addHandler(console_handler)

# The root logger has its own handler (from basicConfig); without this every
# record would be emitted twice, once per handler.
logger.propagate = False

In [169]:
# Lookup from the single-letter test-type key to its display name.
test_type_map = dict(
    A='Ability & Aptitude',
    B='Biodata & Situational Judgement',
    C='Competencies',
    D='Development & 360',
    E='Assessment Exercises',
    K='Knowledge & Skills',
    P='Personality & Behavior',
    S='Simulations',
)

In [170]:
def fetch_sub_page_details(url, timeout=30):
    """
    Fetch the description and duration of a single assessment.

    The detail page is located by prefixing ``url`` with the site root. The
    description is the text of the ``<p>`` following the ``<h4>`` titled
    "Description"; the duration is taken from the ``<p>`` following the
    ``<h4>`` titled "Assessment length".

    :param url: Site-relative URL (path) of the assessment page
    :param timeout: Seconds to wait for the HTTP response before giving up
    :return: A tuple ``(description, duration)``. ``description`` is ``None``
             when it cannot be found; ``duration`` is ``'not given'`` when it
             cannot be found.
    :raises requests.RequestException: If the request fails or times out
    """
    sub_url = 'https://www.shl.com' + url
    description = None
    duration = 'not given'

    # A timeout keeps one unresponsive page from stalling the whole scrape.
    res = requests.get(sub_url, timeout=timeout)
    if not res.ok:
        # Do not parse an error page as if it were an assessment page.
        logger.warning(f"{res.status_code} {res.reason} : {sub_url}")
        return description, duration

    soup = BeautifulSoup(res.text, 'html.parser')

    description_tag = soup.find('h4', string='Description')
    description_body = description_tag.find_next_sibling('p') if description_tag else None
    if description_body is not None:
        description = description_body.get_text(strip=True)

    duration_tag = soup.find('h4', string='Assessment length')
    duration_body = duration_tag.find_next_sibling('p') if duration_tag else None
    if duration_body is not None:
        raw_text = duration_body.get_text(strip=True)
        # The value is expected after an "=" sign; when there is none, keep
        # the whole text instead of failing with an IndexError.
        parts = raw_text.split('=')
        duration = (parts[1] if len(parts) > 1 else raw_text).lstrip()

        # When the unit is spelled out, keep only the leading token.
        if 'minutes' in duration:
            duration = duration.split(' ')[0]

    return description, duration

In [171]:
def fetch_elements(row):
    """
    Extract one assessment record from a catalogue table row.

    The row is expected to hold at least four ``<td>`` cells: a link to the
    assessment, a remote-testing marker, an adaptive marker and the test-type
    keys. The linked detail page is fetched to obtain the description and
    duration.

    :param row: Row (``<tr>``) of the catalogue table
    :return: A list with a single dictionary containing name, url,
             remote_support, adaptive_support, test_type, description and
             duration. The list is empty when the row has fewer than four
             cells or no usable link.
    """
    cols = row.find_all('td')
    # Skip rows that do not have the expected shape (e.g. header or spacer
    # rows) instead of raising and aborting the whole scrape.
    if len(cols) < 4:
        return []

    a_tag = cols[0].find('a')
    url = a_tag.get('href') if a_tag is not None else None
    if not url:
        return []

    description, duration = fetch_sub_page_details(url)
    name = a_tag.text.strip()

    # A cell counts as "yes" when it holds the "-yes" circle marker.
    yes_marker = {'class': 'catalogue__circle -yes'}
    remote_support = 'yes' if cols[1].find('span', yes_marker) else 'no'
    adaptive_support = 'yes' if cols[2].find('span', yes_marker) else 'no'

    # Concatenate the key letters, e.g. "CPAB".
    key_spans = cols[3].find_all('span', {'class': 'product-catalogue__key'})
    test_type = ''.join(span_tag.text.strip() for span_tag in key_spans)

    return [{
        'name': name,
        'url': url,
        'remote_support': remote_support,
        'adaptive_support': adaptive_support,
        'test_type': test_type,
        'description': description,
        'duration': duration,
    }]

In [172]:
def pagination(wrapper, page_size=12):
    """
    Read the last page and the catalogue type from a pagination container.

    The second-to-last ``<li>`` of the ``ul.pagination`` list is used as the
    last page entry: its text gives the page number and the ``type`` query
    parameter of its link gives the catalogue type.

    :param wrapper: A container element holding the pagination unordered list
    :param page_size: Number of rows per catalogue page, used to convert the
                      last page number into a ``start`` offset
    :return: A dictionary mapping the integer type to the ``start`` offset of
             its last page, i.e. ``(last_page - 1) * page_size``. Empty when
             the pagination list, page number, link or ``type`` parameter is
             missing or malformed.
    """
    type_dict = {}
    pagination_holder = wrapper.find('ul', {'class': 'pagination'})
    if not pagination_holder:
        return type_dict

    pagination_items = pagination_holder.find_all('li')
    # The last-page entry is read from index -2, so two items are required.
    if len(pagination_items) < 2:
        return type_dict

    last_page_item = pagination_items[-2]
    try:
        last_page = int(last_page_item.text.strip())
    except ValueError:
        return type_dict

    a_tag = last_page_item.find('a', class_='pagination__link')
    href = a_tag.get('href') if a_tag is not None else None
    if not href:
        return type_dict

    type_values = parse_qs(urlparse(href).query).get('type')
    if not type_values:
        return type_dict
    try:
        # Named type_id so the builtin `type` is not shadowed.
        type_id = int(type_values[0])
    except ValueError:
        return type_dict

    type_dict[type_id] = (last_page - 1) * page_size
    return type_dict

In [193]:
def _extract_table_rows(wrapper):
    """Return the records of every data row in the table inside ``wrapper``."""
    table = wrapper.find('table') if wrapper else None
    if table is None:
        return []

    records = []
    # The first <tr> is skipped as the header row.
    for row in table.find_all('tr')[1:]:
        records.extend(fetch_elements(row))
    return records


def fetch_data():
    """
    Scrape the whole product catalogue.

    The first catalogue page is fetched and every table wrapper on it is
    read, together with its pagination info. The remaining pages of each
    catalogue type are then requested through the ``start`` and ``type``
    query parameters.

    :return: A list of dictionaries as produced by ``fetch_elements``. If an
             exception interrupts the scrape it is logged and the records
             collected so far are returned.
    """
    data_list = []
    base_url = 'https://www.shl.com/products/product-catalog/'
    page_size = 12
    timeout = 30
    wrapper_attrs = {'class' : 'custom__table-wrapper || js-target-table-wrapper'}

    try:
        response = requests.get(base_url, timeout=timeout)

        # Maps catalogue type -> `start` offset of that type's last page.
        type_dict = {}

        if response.status_code == 200:
            logger.info("URL : " + base_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            for wrapper in soup.find_all('div', wrapper_attrs):
                data_list.extend(_extract_table_rows(wrapper))
                type_dict.update(pagination(wrapper))
        else:
            logger.error(f"{response.status_code} {response.reason}")

        for type_id, last_start in tqdm(type_dict.items()):
            # The first page (start=0) was read above. range() is empty when
            # a type has a single page, so no request is made past the end.
            for start in range(page_size, last_start + 1, page_size):
                url = base_url + f'?start={start}&type={type_id}'
                logger.info("URL : " + url)
                response = requests.get(url, timeout=timeout)

                if response.status_code != 200:
                    # Report the skipped page rather than dropping it silently.
                    logger.error(f"{response.status_code} {response.reason}")
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')
                data_list.extend(_extract_table_rows(soup.find('div', wrapper_attrs)))

    except Exception as e:
        # Top-level boundary of the scrape: log with traceback and fall
        # through so already-collected records are not lost.
        logger.exception(f"Exception : {e}")

    return data_list

In [195]:
# Run the full catalogue scrape. This issues HTTP requests for every
# catalogue page and for every assessment's detail page.
data = fetch_data()

## Converting to DataFrame & Saving DataFrame

In [160]:
# Tabulate the scraped records, then write them to CSV without the index column.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf='C:\\Users\\Dell\\Desktop\\data.csv', index=False)

## Data Sample

In [186]:
# Preview the first three rows of the DataFrame.
df.head(3)

Unnamed: 0,name,url,remote_support,adaptive_support,test_type,description,duration
0,Account Manager Solution,/products/product-catalog/view/account-manager...,yes,yes,CPAB,The Account Manager solution is an assessment ...,49
1,Administrative Professional - Short Form,/products/product-catalog/view/administrative-...,yes,yes,AKP,The Administrative Professional solution is fo...,36
2,Agency Manager Solution,/products/product-catalog/view/agency-manager-...,yes,yes,ABPS,The Agency Manager solution is for mid-level s...,51
