### Data Wrangling

In [14]:
# all modules
import warnings
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [15]:
# global stuff
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_rows', 13)
pd.set_option('display.max_columns', 13)

In [40]:
# Load the processed jobs dataset (path is relative to the notebook's folder).
jobs_db_path = r'../data/processed/jobs_db.csv'
jobs_db = pd.read_csv(jobs_db_path)

In [41]:
api_data=pd.read_json(r'../data/processed/api_data.json')
api_data

Unnamed: 0,links,normalized_job_title,parent_uuid,title,uuid
0,,1st grade teacher,51cc95467c1844c7dc1d923265c023b2,1st Grade Teacher,26bc4486dfd0f60b3bb0d8d64e001800
1,,1st pressman,96f1cf15fe5880d8660c70f0756944ab,1st Pressman,9be6e14491305132c175cb1781a3356c
10,,3d modeler,caa551520f21c7c220716a8f82806246,3D Modeler,6cb0a6786c08d6c91a4fc9f61e19f62d
11,,3d specialist,caa551520f21c7c220716a8f82806246,3D Specialist,ec45194ae348e1bb21415064364f038b
12,,3d technologist,ce1c0fc273d3c803e94a73ff0b54816e,3D Technologist,4266e89fcaf06973fa23fc6f75b27682
13,,3rd grade reading teacher,51cc95467c1844c7dc1d923265c023b2,3rd Grade Reading Teacher,0cfb09fa1c6691fd6d77a3d3743fa8bc
...,...,...,...,...,...
4,,2nd grade teacher,51cc95467c1844c7dc1d923265c023b2,2nd Grade Teacher,a59278d7f653db37e54a270345a91a05
5,,2nd pressman,96f1cf15fe5880d8660c70f0756944ab,2nd Pressman,9c631442c4966cd3e5a9e52fc70aff1c
6,,2 year olds preschool teacher,d78b3ba8b749686d739480ee77f9e219,2 Year Olds Preschool Teacher,f5a2151d370ebca7b49778e036a287e8


In [42]:
url='https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:Country_codes'
html=requests.get(url).content
soup = BeautifulSoup(html, 'lxml')
countries = soup.find_all('td')
lista=[]
lista=[i.text for i in countries]
cleaned=[i.strip() for i in lista if len(i)>3]
split=2
country_codes=[cleaned[i:i+split] for i in range(0,len(cleaned),split)]

In [43]:
country_codes=pd.DataFrame(country_codes)
country_codes=country_codes.rename(columns={0:'country',1:'country_code'})
country_codes['country_code']=country_codes['country_code'].str.replace(')','')
country_codes['country_code']=country_codes['country_code'].str.replace('(','')
country_codes

Unnamed: 0,country,country_code
0,Belgium,BE
1,Greece,EL
2,Lithuania,LT
3,Portugal,PT
4,Bulgaria,BG
5,Spain,ES
...,...,...
66,New Zealand,NZ
67,Taiwan,TW
68,Canada,CA


In [45]:
# Rename columns: give the long source column names short working names.
# (A stray merge fragment and a `df.shape` call were removed from this cell;
# `df` is only created further down, so they failed on a fresh top-to-bottom run.)
# NOTE(review): 'bi_argsagsagainst' looks like a typo for 'bi_argsagainst', but it
# is kept as-is because the rest of the notebook already refers to this spelling.
jobs_db = jobs_db.rename(columns={
    'uuid': 'person_id',
    'dem_education_level': 'ed_level',
    'dem_full_time_job': 'job_type',
    'question_bbi_2016wave4_basicincome_awareness': 'bi_awareness',
    'question_bbi_2016wave4_basicincome_vote': 'bi_vote',
    'question_bbi_2016wave4_basicincome_effect': 'bi_effect',
    'question_bbi_2016wave4_basicincome_argumentsfor': 'bi_argsfor',
    'question_bbi_2016wave4_basicincome_argumentsagainst': 'bi_argsagsagainst',
})
jobs_db.columns

Index(['person_id', 'country_code', 'rural', 'uuid.1', 'ed_level', 'job_type',
       'normalized_job_code', 'uuid.2', 'age', 'gender', 'dem_has_children',
       'age_group', 'uuid.3', 'bi_awareness', 'bi_vote', 'bi_effect',
       'bi_argsfor', 'bi_argsagsagainst'],
      dtype='object')

In [46]:
# Drop unnecessary columns: the duplicated key columns 'uuid.1', 'uuid.2', ...
# Matching on the 'uuid.' prefix (with the dot) deliberately leaves a plain
# 'uuid' column alone. A later cell renames 'normalized_job_code' to 'uuid', so
# re-running this cell out of order must not throw that join key away.
drop_cols = [name for name in jobs_db.columns if name.startswith('uuid.')]
jobs_db = jobs_db.drop(columns=drop_cols)
jobs_db.columns

Index(['person_id', 'country_code', 'rural', 'ed_level', 'job_type',
       'normalized_job_code', 'age', 'gender', 'dem_has_children', 'age_group',
       'bi_awareness', 'bi_vote', 'bi_effect', 'bi_argsfor',
       'bi_argsagsagainst'],
      dtype='object')

In [53]:
# Expose the job code under the name 'uuid' so it shares a key name with
# api_data, then preview the column.
job_code_rename = {'normalized_job_code': 'uuid'}
jobs_db = jobs_db.rename(columns=job_code_rename)
jobs_db['uuid']

0                                    NaN
1       861a9b9151e11362eb3c77ca914172d0
2                                    NaN
3       049a3f3a2b5f85cb2971ba77ad66e10c
4       f4b2fb1aa40f661488e2782b6d57ad2f
5       27af8700f5577cec835acee2cb90a2ff
                      ...               
9643                                 NaN
9644    847165cfda6b1dc82ae22b967da8af2f
9645    a4d5b8b38f9513825d0d94a981ebe962
9646                                 NaN
9647    775190277a849cba701b306a7b374c0a
9648    8a7a3f1bce0958ae9f090683e5a925b7
Name: uuid, Length: 9649, dtype: object

In [54]:
# Join the job records with the API job metadata on the shared 'uuid' key.
# pandas has no top-level `pd.replace`; the join function is `pd.merge`
# (default how='inner').
# NOTE(review): `jobs_df` is not defined in the cells shown here -- the recorded
# output carries a 'country' column, so it is presumably `jobs_db` joined with
# `country_codes` in another cell; confirm it exists on a fresh kernel.
# NOTE(review): the recorded result has 'uuid', 'title' and 'parent_uuid' null in
# all 3299 rows, which suggests only null keys matched each other (pandas merges
# NaN keys with NaN keys). Consider dropping null 'uuid' rows before joining.
df = pd.merge(jobs_df, api_data, left_on='uuid', right_on='uuid')

Unnamed: 0,person_id,country_code,rural,ed_level,job_type,uuid,...,bi_argsagsagainst,country,links,normalized_job_title,parent_uuid,title
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,AT,countryside,no,no,,...,None of the above,Austria,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
1,83127080-da3d-0133-c74f-0a81e8b09a82,AT,city,,no,,...,Foreigners might come to my country and take a...,Austria,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
2,b50dbb80-da53-0133-8956-0a81e8b09a82,AT,urban,medium,no,,...,Foreigners might come to my country and take a...,Austria,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
3,9949c4c0-da5f-0133-c832-0a81e8b09a82,AT,urban,no,no,,...,Foreigners might come to my country and take a...,Austria,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
4,69f1f400-dc5f-0133-ad9b-0a81e8b09a82,AT,city,medium,no,,...,Foreigners might come to my country and take a...,Austria,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
5,0f286c80-dd77-0133-7803-0a81e8b09a82,AT,rural,medium,no,,...,It might encourage people to stop working | Fo...,Austria,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3293,88a72f20-da1b-0133-ec7a-0a81e8b09a82,SK,Non-Rural,low,no,,...,None of the above,Slovakia,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
3294,b98527f0-d8da-0133-96c4-0a81e8b09a82,SK,countryside,high,no,,...,None of the above,Slovakia,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,
3295,560ba1e0-da40-0133-db35-0a81e8b09a82,SK,urban,medium,no,,...,It might encourage people to stop working,Slovakia,"[{'rel': 'self', 'href': '/jobs?offset=0&limit...",,,


In [50]:
# (rows, columns) of the merged frame `df`.
df.shape

(3299, 20)

In [51]:
# CLEANING: count the missing values in every column of the merged frame.
df.isna().sum()

person_id                  0
country_code               0
rural                      0
ed_level                 325
job_type                   0
uuid                    3299
                        ... 
bi_argsagsagainst          0
country                    0
links                      0
normalized_job_title    3299
parent_uuid             3299
title                   3299
Length: 20, dtype: int64