## Crawler Q Concursos

- [ ] Login e autenticação
- [ ] Resgate de disciplinas e matérias
- [ ] Cálculo de métricas de cada matéria
- [ ] Armazenamento local em dataframes
- [ ] Raspagem de questões
- [ ] Resolução de questões

In [9]:
import os
import uuid
from getpass import getpass
from time import sleep

import pandas as pd
from selenium import webdriver

In [2]:
# Crawler config: capabilities passed to the remote WebDriver session.

# Browser to request from the hub.
capabilities = dict(
    browserName="firefox",
    browserVersion="90.0",
)

# Selenoid-specific switches: VNC enabled, video recording disabled.
capabilities["selenoid:options"] = dict(enableVNC=True, enableVideo=False)

In [47]:
# Base Class

class Craweler:
    """Thin wrapper around a remote Selenium WebDriver session.

    Creating an instance opens a browser session on the hub at
    ``http://localhost:4444/wd/hub`` using the module-level ``capabilities``.
    Call :meth:`close` (or use the instance as a context manager) to end the
    session when finished.
    """

    # Locator strategy strings. They are the same values as selenium's
    # ``By.XPATH`` / ``By.CLASS_NAME``; ``find_element(by, value)`` is used
    # instead of the ``find_element_by_*`` helpers because those helpers were
    # removed in Selenium 4.3, while this form works on Selenium 3 and 4.
    _BY_XPATH = "xpath"
    _BY_CLASS_NAME = "class name"

    def __init__(self):
        # NOTE(review): ``desired_capabilities`` is the Selenium 3 keyword; newer
        # Selenium releases expect ``options=`` instead. Confirm the pinned version.
        self.__driver = webdriver.Remote(
            command_executor="http://localhost:4444/wd/hub",
            desired_capabilities=capabilities
        )

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """End the browser session on leaving the ``with`` block."""
        self.close()
        # Returning False lets any exception raised inside the block propagate.
        return False

    def _find_element_by_xpath(self, xpath):
        """Return the first element matching ``xpath`` in the current page."""
        return self.__driver.find_element(self._BY_XPATH, xpath)

    def _find_element_by_classname(self, classname):
        """Return the first element with CSS class ``classname`` in the current page."""
        return self.__driver.find_element(self._BY_CLASS_NAME, classname)

    def _send_keys_in_element(self, element, content_string):
        """Type ``content_string`` into ``element`` and return ``send_keys``' result."""
        return element.send_keys(content_string)

    def _request_url(self, url):
        """Navigate the browser to ``url`` and return the driver's ``get`` result."""
        return self.__driver.get(url)

    def driver(self):
        """Return the underlying WebDriver instance."""
        return self.__driver

    def close(self):
        """Quit the underlying WebDriver, ending the remote browser session."""
        self.__driver.quit()
        
        
# Classe Herdada

class Qconcursos(Craweler):
    """Crawler for the ENEM question pages of qconcursos.com.

    Parameters
    ----------
    config_dict : mapping
        Must provide the keys ``"email"`` and ``"password"`` used by
        :meth:`auth_user`.
    """

    # Locator strategy strings. They are the same values as selenium's
    # ``By.CLASS_NAME`` / ``By.TAG_NAME``; ``find_element(s)(by, value)`` is used
    # because the ``find_element(s)_by_*`` helpers were removed in Selenium 4.3,
    # while this form works on Selenium 3 and 4.
    _BY_CLASS_NAME = "class name"
    _BY_TAG_NAME = "tag name"

    def __init__(self, config_dict):
        # The base initializer opens the browser session and returns None, so
        # its return value is deliberately not stored.
        super().__init__()
        self.__configs = config_dict
        # URLs and locators for each page the crawler interacts with.
        self.__infos = {
            "auth": {
                "url": "https://www.qconcursos.com/conta/entrar",
                "email_input_xpath" : '//*[@id="login_email"]',
                "password_input_xpath": '//*[@id="login_password"]',
                "button_input_xpath": '//*[@id="login_form"]/input[3]'
            },
            "logout": {
                "url": "https://www.qconcursos.com/usuario",
                "first_button": '//*[@id="user-dropdown"]',
                "second_button": '//*[@id="js-current-user"]/div[2]/div/a[3]'
            },
            "subjects": {
                "url": "https://www.qconcursos.com/questoes-do-enem/disciplinas",
                "body": '/html/body/div[3]/main/div[2]/div',
            },
            "module": {
                "base": '/html/body/div[2]/main/div/div[2]/div'
            }
        }

    def auth_user(self):
        """Open the login page, fill in e-mail and password, and submit.

        Returns the result of the submit button's ``click()``.
        """
        auth = self.__infos["auth"]
        self._request_url(auth["url"])

        input_email_element = self._find_element_by_xpath(auth["email_input_xpath"])
        input_password_element = self._find_element_by_xpath(auth["password_input_xpath"])
        input_button_element = self._find_element_by_xpath(auth["button_input_xpath"])

        self._send_keys_in_element(input_email_element, self.__configs["email"])
        self._send_keys_in_element(input_password_element, self.__configs["password"])

        return input_button_element.click()

    def logout_user(self):
        """Open the user page and click through the two logout controls.

        Returns the result of the second button's ``click()``.
        """
        logout = self.__infos["logout"]
        self._request_url(logout["url"])

        first_button = self._find_element_by_xpath(logout["first_button"])
        first_button.click()
        # The second control is only looked up after the first click.
        second_button = self._find_element_by_xpath(logout["second_button"])
        return second_button.click()

    def get_subjects(self):
        """Scrape the subjects listing page.

        Returns
        -------
        list of dict
            One dict per ``q-discipline-item`` element, with the keys
            ``'title'``, ``'url_questões'``, ``'url_modules'`` and ``'number'``
            (the link text with every ``'.'`` removed, kept as a string).
        """
        self._request_url(self.__infos["subjects"]["url"])

        body_content = self._find_element_by_xpath(self.__infos["subjects"]["body"])
        subject_elements = body_content.find_elements(self._BY_CLASS_NAME, 'q-discipline-item')

        response = []

        for element_div in subject_elements:
            # The first 'q-items' container holds the link to the question list.
            question_items = element_div.find_elements(self._BY_CLASS_NAME, 'q-items')[0]
            link_element = question_items.find_element(self._BY_TAG_NAME, 'a')
            # Look the heading up once; it provides both the title and the module link.
            heading_element = element_div.find_element(self._BY_TAG_NAME, 'h3')

            response.append({
                'title': heading_element.text,
                'url_questões': link_element.get_attribute('href'),
                'url_modules': heading_element.find_element(self._BY_TAG_NAME, 'a').get_attribute('href'),
                # Presumably '.' is a thousands separator in the displayed count.
                'number': link_element.text.replace('.', ''),
            })

        return response

    def show_credentials(self):
        """Print the stored configuration with the password masked.

        The password is replaced by asterisks so it does not end up in the
        notebook's saved output.
        """
        masked_configs = dict(self.__configs)
        if "password" in masked_configs:
            masked_configs["password"] = "********"
        print(masked_configs)
        
    

In [45]:
# Credentials must never be hard-coded in the notebook. They are read from the
# QCONCURSOS_EMAIL / QCONCURSOS_PASSWORD environment variables, falling back to
# an interactive prompt (the password prompt does not echo) when unset.
# NOTE(review): the password previously committed in this cell should be rotated.
crawler = Qconcursos({
    "email": os.environ.get("QCONCURSOS_EMAIL") or input("Q Concursos e-mail: "),
    "password": os.environ.get("QCONCURSOS_PASSWORD") or getpass("Q Concursos password: "),
})

In [15]:
# Get all subjects inside an authenticated session.

crawler.auth_user()
try:
    subjects = crawler.get_subjects()
finally:
    # Always log out, even if scraping fails, so the account session is not left open.
    crawler.logout_user()

print(subjects)

[{'title': 'Português', 'url_questões': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/letras-portugues/questoes', 'url_modules': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/letras-portugues', 'number': '972'}, {'title': 'Matemática', 'url_questões': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/matematica-matematica/questoes', 'url_modules': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/matematica-matematica', 'number': '1159'}, {'title': 'História', 'url_questões': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/historia-historia/questoes', 'url_modules': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/historia-historia', 'number': '518'}, {'title': 'Geografia', 'url_questões': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/geografia-geografia/questoes', 'url_modules': 'https://www.qconcursos.com/questoes-do-enem/disciplinas/geografia-geografia', 'number': '456'}, {'title': 'Biologia', 'url_questões': 'https://w

In [20]:
# Transform the scraped subjects into a DataFrame.

# Build new records instead of mutating `subjects` in place, so the list
# produced by the scraping cell keeps its original content when this cell is
# re-run. Each record gets a random UUID string as its 'id'.
subjects_with_id = [
    {**subject, 'id': str(uuid.uuid4())}
    for subject in subjects
]

df_questions_base = pd.DataFrame(subjects_with_id)
df_questions_base

Unnamed: 0,title,url_questões,url_modules,number,id
0,Português,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,972,85d81623-ebd8-4758-b39c-2c23f13fcb94
1,Matemática,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,1159,3d0b1567-6d12-4099-ab40-dc3b8a753eab
2,História,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,518,21180166-1671-4176-91a9-50bbac3b5184
3,Geografia,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,456,67107db1-ed05-421b-936b-06068c16a0a7
4,Biologia,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,519,fd9f3224-7d80-4c13-8ff5-23e5450705ba
5,Química,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,386,641abec6-1be3-4114-a659-b1721ffb3d8f
6,Física,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,375,67317c65-df6c-4d63-a232-7f5934600bf7
7,Inglês,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,120,542dab1a-10f3-4e6f-a817-c31ade938fbd
8,Sociologia,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,107,17123ef7-fceb-4d42-a351-12d15090fe6d
9,Filosofia,https://www.qconcursos.com/questoes-do-enem/di...,https://www.qconcursos.com/questoes-do-enem/di...,139,fc3c593f-722a-436d-b186-4fead7ec7d53


In [49]:
# Persist the subjects table next to the notebook; the row index is not written.
df_questions_base.to_csv(
    path_or_buf='base_df.csv',
    index=False,
)