# Exploratory analysis and data processing

In [1]:
# import libs
import pandas as pd

## Getting data

In [2]:
# Load the raw data file into a DataFrame
df_raw = pd.read_csv(filepath_or_buffer='../workana_scraping/data/data_raw.csv')

# Row and column counts of the raw data
df_raw.shape

(450, 7)

In [3]:
# Work on a copy so the raw frame stays untouched for later comparison
df = df_raw.copy()

# Check the shape of the copy itself (the original cell inspected df_raw
# again, which does not verify the new frame)
df.shape

(450, 7)

## Getting to know the dataframe

In [4]:
# Shape of the working copy as (rows, columns)
df.shape

(450, 7)

In [5]:
# Per-column summary: non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job           450 non-null    object
 1   Publish Date  450 non-null    object
 2   Skills        450 non-null    object
 3   Budget        450 non-null    object
 4   Bids          450 non-null    int64 
 5   Summary       450 non-null    object
 6   Link          450 non-null    object
dtypes: int64(1), object(6)
memory usage: 24.7+ KB


In [6]:
# Data type of each column
df.dtypes

Job             object
Publish Date    object
Skills          object
Budget          object
Bids             int64
Summary         object
Link            object
dtype: object

In [7]:
# Number of non-null values in each column
df.count()

Job             450
Publish Date    450
Skills          450
Budget          450
Bids            450
Summary         450
Link            450
dtype: int64

In [8]:
# Count missing values per column in one vectorized pass. No try/except:
# the original bare `except:` hid every error (including typos and
# KeyboardInterrupt) behind a message that did not say what went wrong.
null_counts = df.isna().sum()
for c, num_nulls in null_counts.items():
    print(f'Valores nulos na coluna {c}:', num_nulls)

Valores nulos na coluna Job: 0
Valores nulos na coluna Publish Date: 0
Valores nulos na coluna Skills: 0
Valores nulos na coluna Budget: 0
Valores nulos na coluna Bids: 0
Valores nulos na coluna Summary: 0
Valores nulos na coluna Link: 0


In [9]:
# Drop rows whose Bids value is not numeric (e.g. an error string such as
# 'Erro no orçamento 10 da página5') BEFORE casting, so the integer
# conversion below cannot raise on them. Missing Bids values are dropped too.
bids_numeric = pd.to_numeric(df['Bids'], errors='coerce')
df = df.drop(index=bids_numeric[bids_numeric.isna()].index)

# change Bids to int type
df['Bids'] = pd.to_numeric(df['Bids']).astype(int)

# info() prints its own report and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Job           450 non-null    object
 1   Publish Date  450 non-null    object
 2   Skills        450 non-null    object
 3   Budget        450 non-null    object
 4   Bids          450 non-null    int64 
 5   Summary       450 non-null    object
 6   Link          450 non-null    object
dtypes: int64(1), object(6)
memory usage: 24.7+ KB
None


In [10]:
# Confirm the row and column counts after the Bids conversion
df.shape

(450, 7)

## Change types

### Bids column

In [11]:
# Drop every row whose Bids value is not numeric instead of matching one
# specific error message ('Erro no orçamento 10 da página5'), so an error
# for any other page/item is removed as well. Rows with a missing Bids
# value are dropped too, since they could not be cast to int afterwards.
invalid_bids = pd.to_numeric(df['Bids'], errors='coerce').isna()
df = df.drop(df[invalid_bids].index)

In [12]:
# Cast the Bids column to integers
df = df.astype({'Bids': int})

### Publish Date column

In [13]:
# Portuguese -> English month names, in calendar order
months = dict([
    ('Janeiro', 'January'),
    ('Fevereiro', 'February'),
    ('Março', 'March'),
    ('Abril', 'April'),
    ('Maio', 'May'),
    ('Junho', 'June'),
    ('Julho', 'July'),
    ('Agosto', 'August'),
    ('Setembro', 'September'),
    ('Outubro', 'October'),
    ('Novembro', 'November'),
    ('Dezembro', 'December'),
])

In [14]:
# Normalize the Portuguese date text so it can be parsed
# (raw values presumably look like '11 de Maio de 2023 12:50' — TODO confirm).
# The ' de ' removal does not depend on the month, so it runs once instead of
# once per loop iteration; regex=False makes every replacement a literal one
# regardless of the pandas version's default.
publish_date = df['Publish Date'].str.replace(' de ', ' ', regex=False)

# replaces month names in portuguese with their english equivalents
for m_pt, m_en in months.items():
    publish_date = publish_date.str.replace(m_pt, m_en, regex=False)

# change type
df['Publish Date'] = pd.to_datetime(publish_date, format='%d %B %Y %H:%M')

# info() prints its own report and returns None, so it is not wrapped in print()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Job           450 non-null    object        
 1   Publish Date  450 non-null    datetime64[ns]
 2   Skills        450 non-null    object        
 3   Budget        450 non-null    object        
 4   Bids          450 non-null    int64         
 5   Summary       450 non-null    object        
 6   Link          450 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 24.7+ KB
None


In [15]:
# Confirm the row and column counts after the date conversion
df.shape

(450, 7)

## Create Category & Subcategory columns

In [16]:
# Index labels of the rows whose Summary contains 'Categoria'
count_cat = df.index[df['Summary'].str.contains('Categoria')].tolist()
print(f'A palavra "Categoria" aparece nas linhas: {count_cat}')

A palavra "Categoria" aparece nas linhas: [0, 1, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 43, 45, 46, 47, 48, 49, 50, 51, 54, 55, 56, 57, 58, 59, 62, 63, 64, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 84, 85, 86, 88, 90, 91, 93, 94, 95, 96, 97, 98, 99, 100, 102, 103, 105, 107, 108, 109, 110, 113, 114, 115, 116, 117, 118, 120, 121, 125, 126, 127, 129, 131, 133, 134, 135, 136, 137, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 158, 160, 161, 162, 163, 164, 165, 166, 169, 170, 171, 172, 176, 177, 178, 179, 180, 181, 184, 186, 187, 188, 189, 190, 193, 195, 197, 198, 199, 203, 204, 205, 206, 207, 208, 210, 211, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226, 227, 229, 230, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 243, 244, 246, 247, 248, 250, 251, 252, 253, 255, 256, 257, 259, 260, 261, 262, 263, 265, 266, 267, 269, 270, 271, 273, 274, 275, 278, 2

In [17]:
# Index labels of the rows whose Summary contains 'Subcategoria'
count_subcat = df[df['Summary'].str.contains('Subcategoria')].index.tolist()
# The message names the word actually searched for (it previously said
# "Categoria", copied from the cell above)
print(f'A palavra "Subcategoria" aparece nas linhas: {count_subcat}')

A palavra "Categoria" aparece nas linhas: [0, 1, 4, 5, 7, 8, 9, 10, 12, 13, 14, 19, 22, 24, 25, 26, 27, 28, 29, 31, 32, 33, 35, 37, 38, 39, 40, 41, 43, 45, 47, 48, 49, 50, 51, 54, 55, 56, 57, 58, 59, 62, 64, 67, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 85, 86, 88, 91, 93, 94, 95, 96, 97, 98, 99, 100, 102, 103, 107, 108, 110, 113, 114, 115, 117, 120, 125, 126, 129, 131, 133, 134, 135, 136, 137, 141, 142, 143, 144, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 158, 160, 161, 162, 163, 164, 165, 166, 169, 170, 172, 176, 177, 178, 179, 180, 181, 184, 186, 187, 188, 189, 193, 195, 197, 198, 199, 203, 204, 205, 206, 207, 208, 210, 211, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226, 227, 230, 232, 233, 235, 236, 237, 238, 239, 241, 243, 244, 246, 247, 248, 250, 251, 253, 255, 256, 257, 259, 261, 263, 265, 266, 267, 269, 270, 273, 275, 278, 279, 281, 282, 283, 284, 286, 288, 289, 291, 293, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 308, 309, 310, 311, 313, 3

In [18]:
# Pull the text that follows 'Categoria:' / 'Subcategoria:' out of Summary.
# Raw strings keep '\s' from being an invalid Python escape sequence, and
# expand=False returns a Series so each result maps onto a single column.
# NOTE(review): the trailing '\n' means a label on the last line with no
# line break after it is not captured — confirm this is intended.
df['Category'] = df['Summary'].str.extract(r'Categoria:\s*(.*)\n', expand=False)
df['Subcategory'] = df['Summary'].str.extract(r'Subcategoria:\s*(.*)\n', expand=False)

# Rows where nothing was captured get the 'N/I' placeholder
df[['Category','Subcategory']] = df[['Category','Subcategory']].fillna('N/I')

In [19]:
# Keep only the first line of each Summary; everything from the first
# line break onward is discarded
df['Summary'] = df['Summary'].str.split('\n', n=1).str.get(0)

In [20]:
# Shape after adding the Category and Subcategory columns
df.shape

(450, 9)

In [21]:
# Column summary, now including Category and Subcategory
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Job           450 non-null    object        
 1   Publish Date  450 non-null    datetime64[ns]
 2   Skills        450 non-null    object        
 3   Budget        450 non-null    object        
 4   Bids          450 non-null    int64         
 5   Summary       450 non-null    object        
 6   Link          450 non-null    object        
 7   Category      450 non-null    object        
 8   Subcategory   450 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 31.8+ KB


## Create dummy columns from Skills

In [22]:
# Remove every space inside the Skills strings
df['Skills'] = df['Skills'].replace(to_replace=' ', value='', regex=True)

In [23]:
# Append one 0/1 indicator column per comma-separated skill, then discard
# the '+' column if one was produced
df = (
    pd.concat([df, df['Skills'].str.get_dummies(sep=',')], axis=1)
    .drop(columns='+', errors='ignore')
)

In [24]:
# Shape after appending the skill indicator columns
df.shape

(450, 134)

In [25]:
# Preview the first row of the transformed frame
df.head(1)

Unnamed: 0,Job,Publish Date,Skills,Budget,Bids,Summary,Link,Category,Subcategory,3d,...,WebScraping,Wix,WooCommerce,WordPress,YouTube,['N/I'],eBooks,iOS,iPhone,vtiger
0,Desenvolvimento de Ferramenta Para Checkout We...,2023-05-11 12:50:00,"Magento,PHP,WordPress",USD 1.000 - 3.000,1,Procuro dev full stack para projeto de desenvo...,https://www.workana.com/job/desenvolvimento-de...,TI e Programação,Lojas Virtuais (e-commerce),0,...,0,0,0,1,0,0,0,0,0,0


## Export data

In [26]:
# Write the transformed frame to CSV, leaving out the index column
df.to_csv(path_or_buf='../workana_scraping/data/data_t.csv', index=False)