# DATA PREPROCESSING

In [1]:
import pickle
import pandas as pd
from utils import save_pickle_file, load_pickle_file
from constants import RAW_DATA_PATH, PROCESSED_DATA_PATH
from text_processing import process_documents, detect_language

In [2]:
import nltk

# Fetch the NLTK resources one after another, in a fixed order.
# NOTE(review): presumably these are required by text_processing — confirm.
# The value returned by the final download is left as the cell's last
# expression so the notebook still displays it.
download_status = [nltk.download(resource) for resource in ('stopwords', 'punkt', 'wordnet')]
download_status[-1]

[nltk_data] Downloading package stopwords to /home/ossi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ossi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ossi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data loading

In [3]:
# Read the raw records stored at RAW_DATA_PATH and wrap them in a DataFrame.
# NOTE(review): load_pickle_file presumably unpickles the file; unpickling can
# execute arbitrary code, so RAW_DATA_PATH should only point at trusted data —
# confirm against utils.
data_original = load_pickle_file(RAW_DATA_PATH)
df_original = pd.DataFrame(data=data_original)

In [4]:
# Report the frame's dimensions, then show the first five records transposed
# (one column per record) so the wide rows stay readable.
print(df_original.shape)
df_original.iloc[:5].transpose()

(10254, 10)


Unnamed: 0,0,1,2,3,4
categories__name,RPA,RPA,RPA,java,software
date_published,2019-05-11,2019-05-11,2019-05-11,2019-05-11,2019-05-11
employer__name,HUS,HUS,HUS Helsingin yliopistollinen sairaala,Pearson Frank,Pearson Frank
id,62258,62259,62260,62323,62323
is_active,True,True,True,True,True
job_description,"Hae meille, jos haluat työskennellä ICT-alan n...","Hae meille, jos haluat työskennellä ICT-alan n...",Sovellussuunnittelija (RPA) Ohjelmistorobotiik...,Java Developers Wanted in Turku for an Initial...,Java Developers Wanted in Turku for an Initial...
job_title__name,Sovellussuunnittelija (RPA) Ohjelmistorobotiik...,Sovellussuunnittelija (RPA) Ohjelmistorobotiik...,Sovellussuunnittelija (RPA) Ohjelmistorobotiik...,Java Developers,Java Developers
location__name,Helsinki,Helsinki,Helsinki,Turku,Turku
location__population,645482,645482,645482,189930,189930
pages__name,Indeed,Indeed,Indeed,Indeed,Indeed


## Data preprocessing

### Filter by locations

In [5]:
# Keep only the rows whose location is one of the listed names.
locations = ['Helsinki', 'Tampere', 'Turku', 'Oulu', 'Espoo', 'Vantaa']
is_listed_location = df_original['location__name'].isin(locations)
df_original = df_original.loc[is_listed_location]
print(df_original.shape)

(10042, 10)


### Filter by language

In [6]:
def _first_detection(text):
    """Return element 0 of whatever detect_language yields for `text`."""
    # NOTE(review): element 0 is presumably the language code, since the next
    # cell compares this column against 'en' / 'fi' — confirm in text_processing.
    return detect_language(text)[0]

# Tag every posting with the language detected from its description.
df_original['language'] = df_original['job_description'].apply(_first_detection)

In [7]:
# Keep only the rows whose detected language is one of the listed codes.
languages = ['en', 'fi']
is_listed_language = df_original['language'].isin(languages)
df_original = df_original.loc[is_listed_language]
print(df_original.shape)

(10020, 11)


### Group duplicates

In [8]:
# Collapse rows that share an `id` into a single row per id; every other column
# is reduced to the list of its distinct values within the group.
# dict.fromkeys (rather than set) de-duplicates while keeping first-seen order,
# so multi-valued cells come out in the same order on every run instead of in
# hash-dependent set order (string hashes are randomised per interpreter run).
df_data = df_original.groupby('id', as_index=False).aggregate(lambda x: list(dict.fromkeys(x)))

def one_length_list_to_value(x):
    """Unwrap a list holding exactly one element; return anything else as is.

    Only real ``list`` instances of length 1 are unwrapped. Empty lists,
    longer lists and non-list values (including tuples) are returned unchanged.
    """
    is_singleton_list = isinstance(x, list) and len(x) == 1
    return x[0] if is_singleton_list else x

# Unwrap the single-valued cells produced by the grouping step, element by
# element. DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map, so use `map` when the installed pandas provides it and fall
# back to `applymap` on older versions.
if hasattr(pd.DataFrame, 'map'):
    df_data = df_data.map(one_length_list_to_value)
else:
    df_data = df_data.applymap(one_length_list_to_value)

# One-line label per posting: "<title> | <employer> | <location>".
df_data['job_summary'] = (
    df_data['job_title__name'] + ' | '
    + df_data['employer__name'] + ' | '
    + df_data['location__name']
)

In [9]:
# Report the grouped frame's dimensions, then show its first five rows
# transposed (one column per row).
print(df_data.shape)
df_data.iloc[:5].transpose()

(6659, 12)


Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
categories__name,"[python, RPA]",RPA,RPA,"[python, RPA]",RPA
date_published,2018-06-22,2018-06-22,2018-06-20,2018-06-18,2018-06-18
employer__name,Veikkaus Oy,Nordea,Nordea,Raha-Automaattiyhdistys (Ray),HCL Technologies
is_active,False,False,False,False,False
job_description,Veikkaus on suomalaisten omistama peliyhtiö. T...,We are looking for Insurance Professional´s. Y...,It’s an exciting time for you to join us. We’r...,Veikkaus on suomalaisten omistama peliyhtiÃ¶. ...,To review RPA project modules developed by the...
job_title__name,Testausautomaation ja RPA:n asiantuntija,"Insurance Professional, Helsinki",Robotic Process Automation Developer (temporar...,Testausautomaation ja RPA:n asiantuntija,Associate General Manager-RPA
location__name,Helsinki,Helsinki,Helsinki,Helsinki,Helsinki
location__population,645482,645482,645482,645482,645482
pages__name,Indeed,Indeed,Indeed,Indeed,Indeed


### Cleaning, tokenization and lemmatization

In [10]:
# Pull the two text columns out of the frame as plain Python lists, in row order.
documents = df_data['job_description'].tolist()
job_summary_list = df_data['job_summary'].tolist()

In [11]:
# Run the project's text pipeline over every description, then rebuild one
# space-separated string per document from its items.
lemmas = process_documents(documents)
processed_documents = list(map(' '.join, lemmas))

In [12]:
# Spot-check one posting at each stage: raw text, lemma list, re-joined text.
# Each preview is truncated to the limit paired with its sequence.
sample_index = 33
for sequence, limit in ((documents, 500), (lemmas, 50), (processed_documents, 500)):
    print(sequence[sample_index][:limit], '\n')

Nokia is a global leader in the technologies that connect people and things. With state-of-the-art software, hardware and services for any type of network, Nokia is uniquely positioned to help communication service providers, governments, and large enterprises deliver on the promise of 5G, the Cloud and the Internet of Things.

Serving customers in over 100 countries, our research scientists and engineers continue to invent and accelerate new technologies that will increasingly transform the way 

['nokia', 'global', 'leader', 'technology', 'connect', 'people', 'thing', 'stateoftheart', 'software', 'hardware', 'service', 'type', 'network', 'nokia', 'uniquely', 'positioned', 'help', 'communication', 'service', 'provider', 'government', 'large', 'enterprise', 'deliver', 'promise', 'cloud', 'internet', 'thing', 'serving', 'customer', 'country', 'research', 'scientist', 'engineer', 'continue', 'invent', 'accelerate', 'new', 'technology', 'increasingly', 'transform', 'way', 'people', 'thing

## Save data for analysis

In [13]:
# Bundle every artefact of this notebook under a stable key and persist it.
# Note that `df_original` at this point is the location- and language-filtered
# frame (with the added `language` column), not the untouched raw load.
processed_data = dict(
    df_original=df_original,
    df_processed=df_data,
    lemmas=lemmas,
    job_summary_list=job_summary_list,
    documents=documents,
    processed_documents=processed_documents,
)
save_pickle_file(processed_data, PROCESSED_DATA_PATH)