# Workana Scraper

## Scraping one page with Selenium

In [1]:
# import libs
import pandas as pd
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException


In [2]:
# Start a Chrome session; ChromeDriverManager().install() supplies the
# chromedriver location that is handed to the Service object.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

# Use the full screen for the browser window.
driver.maximize_window()

In [3]:
# go to workana: open page 3 of the job listing (language=pt)
driver.get(f'https://www.workana.com/jobs?language=pt&page={3}')

# fixed 5-second pause after navigation
# NOTE(review): presumably gives the listing time to render before scraping — confirm
time.sleep(5)

In [4]:
# One dict per job card; the list is turned into a DataFrame afterwards.
data = []

# Job cards are addressed positionally as //*[@id="projects"]/div[2] .. div[10].
# NOTE(review): assumes nine cards per page in this layout — confirm against the
# live page; find_element raises NoSuchElementException if a node is missing.
# NOTE(review): no driver.quit() is visible in this notebook; close the browser
# session once scraping is finished.
for i in range(2, 11):
    # common XPath prefix of the i-th job card
    card = f'//*[@id="projects"]/div[{i}]'

    # title (taken from the span's `title` attribute)
    job_title = driver.find_element(By.XPATH, f'{card}/div[1]/h2/a/span')
    job_title = job_title.get_attribute('title')

    # publish date (taken from the h5's `title` attribute)
    job_date = driver.find_element(By.XPATH, f'{card}/div[1]/h5')
    job_date = job_date.get_attribute('title')

    # link to the job page
    job_link = driver.find_element(By.XPATH, f'{card}/div[1]/h2/a')
    job_link = job_link.get_attribute('href')

    # bids: second space-separated token of the element text
    job_bids = driver.find_element(By.XPATH, f'{card}/div[2]/div[1]/span[2]')
    job_bids = job_bids.text.split(' ')[1]

    # budget
    job_budget = driver.find_element(By.XPATH, f'{card}/div[3]/h4/span')
    job_budget = job_budget.text

    # job description
    job_desc = driver.find_element(By.XPATH, f'{card}/div[2]/div[2]/div')
    job_desc = job_desc.text

    # skills: always stored as a single string so the column has one type.
    # (Previously the fallback left a list here, which breaks the later
    # string operations on the Skills column.)
    try:
        skills = driver.find_element(By.XPATH, f'{card}/div[2]/div[3]/div')
    except NoSuchElementException:
        # the card has no skills container
        job_sk = 'Não informado'
    else:
        job_sk = ', '.join(skill.text for skill in skills.find_elements(By.TAG_NAME, 'a'))

    # record for this job
    data_temp = {'Job': job_title, 'Publish Date': job_date, 'Skills': job_sk, 'Budget': job_budget,
                 'Bids': job_bids, 'Summary': job_desc, 'Link': job_link}

    data.append(data_temp)

In [5]:
# for i in range(0,9):
#     print(data[i])

In [6]:
# print('Titilo:',job_title)
# print('Publicação:',job_date)
# print(job_desc)
# print(job_link)
# print('Propostas:',job_bids)
# print('Valor:',job_budget)
# print('Skills:',job_sk)

In [7]:
# Build the raw dataframe from the scraped records and preview its first rows.
df_raw = pd.DataFrame(data=data)
df_raw.head(n=3)

Unnamed: 0,Job,Publish Date,Skills,Budget,Bids,Summary,Link
0,Desenvolvimento de Ferramenta Para Checkout We...,11 de Maio de 2023 12:50,"Magento, PHP, WordPress",USD 1.000 - 3.000,1,Procuro dev full stack para projeto de desenvo...,https://www.workana.com/job/desenvolvimento-de...
1,Captação de dados automatizados,10 de Maio de 2023 23:27,"JavaScript, MySQL, PHP, API, HTML, HTML5, Python",USD 250 - 500,16,Estou procurando um dev que possa extrair dado...,https://www.workana.com/job/captacao-de-dados-...
2,Preciso de 1 Menina para entrar em contatos co...,11 de Maio de 2023 13:28,"Skype, Marketing, Vendas",USD 50 - 100,0,Preciso de 1 menina para entrar em contato com...,https://www.workana.com/job/preciso-de-1-menin...


## Exploratory analysis and data processing

### Getting to know the dataframe

In [8]:
# work on a copy so that df_raw keeps the data exactly as scraped
df = df_raw.copy()

In [9]:
# dataframe shape as a (rows, columns) tuple
print(df.shape)

(9, 7)


In [10]:
# dtype of every column
print(df.dtypes)

Job             object
Publish Date    object
Skills          object
Budget          object
Bids            object
Summary         object
Link            object
dtype: object


In [11]:
# Split the columns into those whose values are all distinct (vu) and those
# that contain repeated values (vr).
vu = []
vr = []
for c in df.columns:
    try:
        all_distinct = df[c].is_unique
    except Exception:
        # Report the column and keep going (e.g. unhashable cell values).
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        print(f'Erros encontrados em: {c}')
    else:
        (vu if all_distinct else vr).append(c)

print(f'Valores únicos em: {vu}')
print(f'Valores repetidos em: {vr}')
    

Valores únicos em: ['Job', 'Summary', 'Link']
Valores repetidos em: ['Publish Date', 'Skills', 'Budget', 'Bids']


In [12]:
# Count the missing values in every column.
# isna().sum() gives the per-column null count directly, so there is no need to
# subtract count() from the row total or to wrap the loop in a catch-all except.
for c, num_nulls in df.isna().sum().items():
    print(f'Valores nulos em {c}:', num_nulls)

Valores nulos em Job: 0
Valores nulos em Publish Date: 0
Valores nulos em Skills: 0
Valores nulos em Budget: 0
Valores nulos em Bids: 0
Valores nulos em Summary: 0
Valores nulos em Link: 0


In [13]:
# summary of the dataframe: index range, non-null count and dtype per column, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job           9 non-null      object
 1   Publish Date  9 non-null      object
 2   Skills        9 non-null      object
 3   Budget        9 non-null      object
 4   Bids          9 non-null      object
 5   Summary       9 non-null      object
 6   Link          9 non-null      object
dtypes: object(7)
memory usage: 636.0+ bytes


### Changing data types

In [14]:
# Bids was scraped as text; cast it to integer.
df['Bids'] = df['Bids'].astype(int)

# info() writes its report itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job           9 non-null      object
 1   Publish Date  9 non-null      object
 2   Skills        9 non-null      object
 3   Budget        9 non-null      object
 4   Bids          9 non-null      int64 
 5   Summary       9 non-null      object
 6   Link          9 non-null      object
dtypes: int64(1), object(6)
memory usage: 636.0+ bytes
None


#### Change date type

In [15]:
# Lookup table: Portuguese month name -> English month name (calendar order).
months = dict(zip(
    ('Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho',
     'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro'),
    ('January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'),
))

In [16]:
# Turn strings such as "11 de Maio de 2023 12:50" into "11 May 2023 12:50".
# The ' de ' connector is removed once (it does not depend on the month), and
# every replacement is a literal match (regex=False) so the result does not
# depend on the pandas default for `regex`.
publish_date = df['Publish Date'].str.replace(' de ', ' ', regex=False)
for m_pt, m_en in months.items():
    publish_date = publish_date.str.replace(m_pt, m_en, regex=False)

# change type: parse with an explicit day / month name / year / hour:minute format
df['Publish Date'] = pd.to_datetime(publish_date, format='%d %B %Y %H:%M')

# info() prints its report itself and returns None, so no print() wrapper
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Job           9 non-null      object        
 1   Publish Date  9 non-null      datetime64[ns]
 2   Skills        9 non-null      object        
 3   Budget        9 non-null      object        
 4   Bids          9 non-null      int64         
 5   Summary       9 non-null      object        
 6   Link          9 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 636.0+ bytes
None


### Prepare data for analysis


#### Create dummy columns from Skills

In [17]:
# print(df['Skills'],'\n')
# print(df['Skills'][0],'\n')
# print(type(df['Skills'][0]))

In [18]:
# Remove every space from the Skills strings so the comma-separated tokens
# carry no padding.
# NOTE(review): spaces inside multi-word skill names are removed as well.
df['Skills'] = df['Skills'].replace(to_replace=' ', value='', regex=True)

In [19]:
# One indicator (0/1) column per distinct comma-separated token in Skills,
# appended to the right of the existing columns.
skill_dummies = df['Skills'].str.get_dummies(sep=',')
df = pd.concat([df, skill_dummies], axis=1)

# Discard the '+' column when one exists.
# NOTE(review): presumably produced by a "+N" overflow marker in the skills list — confirm
df = df.drop(columns='+', errors='ignore')
df.shape

(9, 33)

#### Remove some text from Summary

In [20]:
# Keep only the text that precedes the first '...' in each Summary; rows
# without '...' are left unchanged.
# str.partition works row by row on whatever index the frame has, whereas
# df.loc[i, ...] with i from range(len(df)) is only valid on a default RangeIndex.
df['Summary'] = df['Summary'].str.partition('...')[0]

# show one truncated summary as a spot check
print(df.iloc[1]['Summary'])

print('DataFrame atualizado:')

Estou procurando um dev que possa extrair dados de forma automatizada de alguns sites para geração de uma planilha para futura exportação.

Categoria: TI e Programação
Subcategoria: Programação
Qual é o alcance do projeto?: Alteração média
Isso é um projeto ou uma posição de trabalho?: Um projeto
Tenho,
DataFrame atualizado:


In [21]:
# df.info()
# df.head()

### Export CSV

In [22]:
# Write the processed dataframe to CSV without the row index.
output_path = '../workana_scraping/data/one_page.csv'
df.to_csv(output_path, index=False)