<a href="https://colab.research.google.com/github/madisynlu/isp/blob/master/20200418_ISP_N26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import jobs from N26 website

## Collect info from main page

In [0]:
# get to main jobs page
import requests
# Bound the wait and fail loudly on HTTP errors, so that a stalled connection
# cannot hang the kernel and an error page is never parsed as the jobs page.
response = requests.get('https://n26.com/en/careers', timeout=30)
response.raise_for_status()
page = response.content

In [0]:
# parse the main page
from lxml import html
tree = html.fromstring(page)

In [0]:
# capture info from the main jobs page
# NOTE: `id` shadows the Python builtin of the same name; later cells read this
# list under that name, so it is kept as-is.
# NOTE(review): the class strings in the first two queries look auto-generated
# and may change when the site is rebuilt -- verify the selectors still match.
id = tree.xpath('//ul[@class="ah aj al an ap aq jj jx jy jz ka"]//a/@href')
title = tree.xpath('//div[@class="ak iz ja kb kc kd ke kf kg kh ki kj kk kl km kn ko kp kq kr ks kt ku kv kw kx ky kz la lb lc ld le lf lg lh li lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md me mf mg mh mi mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw nx ny nz oa ob v"]/text()')
# NOTE(review): the three queries run independently of each other, so a job
# without a "Time Type" entry makes this list shorter than `id`/`title`;
# presumably the remaining values then no longer line up by position -- confirm.
timetype = tree.xpath('//dt[contains(text(), "Time Type")]/following-sibling::dd/text()')

In [0]:
# make this into a table
import pandas as pd
# One column per scraped field. Each list is wrapped in a Series so that
# pandas aligns them on position and pads shorter columns with NaN instead of
# rejecting lists of unequal length.
df = pd.DataFrame({
    'id': pd.Series(id),
    'title': pd.Series(title),
    'timetype': pd.Series(timetype),
})

In [0]:
# check for duplicates
# NOTE: a notebook cell only displays the value of its last expression, so the
# id-only count on the next line is computed but never shown; the displayed
# number is the count of fully duplicated rows.
df['id'].duplicated().sum()
df.duplicated().sum()

0

In [0]:
# look at table
df

Unnamed: 0,id,title,timetype
0,/en/careers/positions/2152856,Content Writer - English Speaking,Full time
1,/en/careers/positions/2150159,Director of Banking Strategy and Excellence,Full time
2,/en/careers/positions/1815799,Director of Global Public Relations,Full time
3,/en/careers/positions/1979536,Director of People - US,Full time
4,/en/careers/positions/2152786,Editor in Chief,Full time
5,/en/careers/positions/2034972,Engineering Manager - Payments/Financial Crime,


## Collect info from individual job pages

In [0]:
# capture info from the individual job pages, based on the job ids collected from the main page
import re
import requests
from lxml import html
def f(i):
    """Scrape the detail page of the i-th job collected from the main page.

    Parameters
    ----------
    i : int
        Position in the notebook-level ``id`` list of relative job links.

    Returns
    -------
    list
        ``[url, loc, dept, role, need, bg, skills, traits]``. ``url`` is the
        absolute page address; every other entry is the list of text nodes
        matched by the corresponding XPath query (empty when nothing matches).

    Raises
    ------
    IndexError
        If ``i`` is outside the range of ``id``.
    requests.HTTPError
        If the job page answers with an HTTP error status.
    """
    url = 'https://n26.com' + id[i]
    # Request the URL built above; bound the wait and surface HTTP errors
    # rather than parsing an error page as if it were a job description.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    tree1 = html.fromstring(response.content)

    dept = tree1.xpath('//dt[contains(text(), "Department")]/following-sibling::dd/a/text()')
    loc = tree1.xpath('//dt[contains(text(), "Location")]/following-sibling::dd/text()')
    # NOTE(review): h2[2] / h2[3] pick headings purely by position, so which
    # section lands in `role` and `need` depends on the page's heading order --
    # confirm against pages that have fewer or differently ordered headings.
    role = tree1.xpath('//h2[2]/following-sibling::ul[1]/li/text()')
    need = tree1.xpath('//h2[3]/following-sibling::ul[1]//li/text()')
    bg = tree1.xpath('//p[contains(text(), "Background:")]/following-sibling::ul[1]/li/text()')
    skills = tree1.xpath('//p[contains(text(), "Skills:")]/following-sibling::ul[1]/li/text()')
    traits = tree1.xpath('//p[contains(text(), "Traits:")]/following-sibling::ul[1]/li/text()')
    return [url, loc, dept, role, need, bg, skills, traits]
            
# Scrape every job page exactly once. Iterating over the whole index range
# also covers the cases of zero or one collected id, where seeding the list
# with f(0) and f(1) would raise IndexError.
info = [f(i) for i in range(len(id))]

# Split the per-job records into one list per field (same order as f returns).
url = [record[0] for record in info]
loc = [record[1] for record in info]
dept = [record[2] for record in info]
role = [record[3] for record in info]
need = [record[4] for record in info]
bg = [record[5] for record in info]
skills = [record[6] for record in info]
traits = [record[7] for record in info]

In [0]:
# re-pack new info
import pandas as pd
# Rebuild `df` with the main-page fields plus the per-job details. Each list is
# wrapped in a Series so columns of unequal length are padded with NaN.
df = pd.DataFrame({
    'url': pd.Series(url),
    'job': pd.Series(title),
    'timetype': pd.Series(timetype),
    'location': pd.Series(loc),
    'dept': pd.Series(dept),
    'role': pd.Series(role),
    'need': pd.Series(need),
    'background': pd.Series(bg),
    'skills': pd.Series(skills),
    'traits': pd.Series(traits),
})
df

Unnamed: 0,url,job,timetype,location,dept,role,need,background,skills,traits
0,https://n26.com/en/careers/positions/2152856,Content Writer - English Speaking,Full time,[Berlin],[Marketing],[Write and transcreate content (US English) fo...,[You’re a native English speaker with over 3 y...,[You’re a native English speaker with over 3 y...,[You possess exceptional English-language writ...,[Actively help yourself (and others) be succes...
1,https://n26.com/en/careers/positions/2150159,Director of Banking Strategy and Excellence,Full time,[Berlin],[Banking],"[Assume responsibility for the implementation,...",[At least 8 years of experience in business- o...,[At least 8 years of experience in business- o...,"[A strategic and innovative mindset, supported...",[Actively help yourself (and others) be succes...
2,https://n26.com/en/careers/positions/1815799,Director of Global Public Relations,Full time,[Berlin],[Marketing],[Own N26 external communications efforts by co...,[12 + years experience in corporate communicat...,[],[],[]
3,https://n26.com/en/careers/positions/1979536,Director of People - US,Full time,[New York],[People],[Drive and lead complex projects to support ou...,[Drive and lead complex projects to support ou...,[],[],[Actively help yourself (and others) be succes...
4,https://n26.com/en/careers/positions/2152786,Editor in Chief,Full time,[Berlin],[Marketing],[Ensure consistent excellence in story assignm...,"[You have 10+ years of editing experience, Sev...","[You have 10+ years of editing experience, Sev...","[You are passionate about great storytelling, ...",[Actively help yourself (and others) be succes...
5,https://n26.com/en/careers/positions/2034972,Engineering Manager - Payments/Financial Crime,,[Berlin],[Technology],"[Support, invest in and develop an incredible ...","[Experience growing, coaching, mentoring and p...",[],[],[]


In [0]:
df.location[3]

['New York']

## Initial analysis of jobs

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   url         6 non-null      object
 1   job         6 non-null      object
 2   timetype    5 non-null      object
 3   location    6 non-null      object
 4   dept        6 non-null      object
 5   role        6 non-null      object
 6   need        6 non-null      object
 7   background  6 non-null      object
 8   skills      6 non-null      object
 9   traits      6 non-null      object
dtypes: object(10)
memory usage: 608.0+ bytes


In [0]:
df.head()

Unnamed: 0,url,job,timetype,location,dept,role,need,background,skills,traits
0,https://n26.com/en/careers/positions/2152856,Content Writer - English Speaking,Full time,[Berlin],[Marketing],[Write and transcreate content (US English) fo...,[You’re a native English speaker with over 3 y...,[You’re a native English speaker with over 3 y...,[You possess exceptional English-language writ...,[Actively help yourself (and others) be succes...
1,https://n26.com/en/careers/positions/2150159,Director of Banking Strategy and Excellence,Full time,[Berlin],[Banking],"[Assume responsibility for the implementation,...",[At least 8 years of experience in business- o...,[At least 8 years of experience in business- o...,"[A strategic and innovative mindset, supported...",[Actively help yourself (and others) be succes...
2,https://n26.com/en/careers/positions/1815799,Director of Global Public Relations,Full time,[Berlin],[Marketing],[Own N26 external communications efforts by co...,[12 + years experience in corporate communicat...,[],[],[]
3,https://n26.com/en/careers/positions/1979536,Director of People - US,Full time,[New York],[People],[Drive and lead complex projects to support ou...,[Drive and lead complex projects to support ou...,[],[],[Actively help yourself (and others) be succes...
4,https://n26.com/en/careers/positions/2152786,Editor in Chief,Full time,[Berlin],[Marketing],[Ensure consistent excellence in story assignm...,"[You have 10+ years of editing experience, Sev...","[You have 10+ years of editing experience, Sev...","[You are passionate about great storytelling, ...",[Actively help yourself (and others) be succes...
