In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_json("../raw_data/glassdoor_scraper_output/data_scientist_in_berlin_2020-09-03.json")

In [4]:
df.head(5)

Unnamed: 0,position,company,description,url
0,Data Scientist (m/f/x),Deutsche Bank AG\n3.6\n★,Job Description:\n\n\nTG 8\n\nDetails of the r...,https://www.glassdoor.de/job-listing/data-scie...
1,Studentische Aushilfe (m/w/d) Data Science im ...,Federal Association of the AOK\n3.9\n★,Vielfältige Aufgaben warten auf Sie\nUnterstüt...,https://www.glassdoor.de/job-listing/studentis...
2,Data Architect (m/w/d) - Energy Management,Viessmann Werke Berlin Gmbh\n4.0\n★,What gets you out of bed in the morning and ke...,https://www.glassdoor.de/job-listing/data-arch...
3,Data Scientist Bioinformatics (m/f/d),Centogene AG\n2.2\n★,"Welcome at CENTOGENE!\n\nWe, CENTOGENE GmbH (""...",https://www.glassdoor.de/job-listing/data-scie...
4,Data Scientist,Marley Spoon\n4.0\n★,Marley Spoon is the new way to cook. We bring ...,https://www.glassdoor.de/job-listing/data-scie...


## Preprocessing

(216, 5)

In [4]:
## change case to lower
def to_lower(text):
    return text.lower()
df["clean"] = df["description"].apply(to_lower)

In [5]:
## remove numbers from the corpus
def remove_number(text):
    text = ''.join(word for word in text if not word.isdigit())
    
    return text

df["clean"]= df["clean"].apply(remove_number)


In [6]:
import string

## remove special puncutation from text
def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    return text

df["clean"] = df["clean"].apply(remove_punctuation)
    



In [7]:
## tag the language of the dataframe
from time import sleep
from langdetect import detect

def tag_language(text):
    ln = detect(text)
    
    return ln

df["language"] = df["clean"].apply(tag_language)

0      job description\n\n\ntg \n\ndetails of the rol...
1      vielfältige aufgaben warten auf sie\nunterstüt...
2      what gets you out of bed in the morning and ke...
3      welcome at centogene\n\nwe centogene gmbh cent...
4      marley spoon is the new way to cook we bring d...
                             ...                        
297    stenon is the world’s first realtime soil anal...
298    what gets you out of bed in the morning and ke...
299    we are hiring\n\nat permacon you will find the...
300    department offprice data science\n\nreports to...
301    executive principal data engineer  digital fou...
Name: clean, Length: 302, dtype: object

In [10]:
## remove stopwords and tokenize the text
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

def remove_stopwords(text):

    stop_words = set(stopwords.words('english')) 

    word_tokens = word_tokenize(text) 
  
    text = [w for w in word_tokens if not w in stop_words] 
  
    return text


df["clean"] = df["clean"].apply(remove_stopwords)

In [11]:
## lemmatize the output
from nltk.stem import WordNetLemmatizer

def lemmatize_words(text):
    
    lemmatizer = WordNetLemmatizer()

    lemmatized = [lemmatizer.lemmatize(word) for word in text]

    return lemmatized

df["clean"] = df["clean"].apply(lemmatize_words)

In [18]:
df[df["language"] == "en"]

Unnamed: 0,position,company,description,url,clean,language
0,Data Scientist (m/f/x),Deutsche Bank AG\n3.6\n★,Job Description:\n\n\nTG 8\n\nDetails of the r...,https://www.glassdoor.de/job-listing/data-scie...,"[job, description, tg, detail, role, fit, team...",en
2,Data Architect (m/w/d) - Energy Management,Viessmann Werke Berlin Gmbh\n4.0\n★,What gets you out of bed in the morning and ke...,https://www.glassdoor.de/job-listing/data-arch...,"[get, bed, morning, keep, motivated, throughou...",en
3,Data Scientist Bioinformatics (m/f/d),Centogene AG\n2.2\n★,"Welcome at CENTOGENE!\n\nWe, CENTOGENE GmbH (""...",https://www.glassdoor.de/job-listing/data-scie...,"[welcome, centogene, centogene, gmbh, centogen...",en
4,Data Scientist,Marley Spoon\n4.0\n★,Marley Spoon is the new way to cook. We bring ...,https://www.glassdoor.de/job-listing/data-scie...,"[marley, spoon, new, way, cook, bring, delight...",en
5,Data Scientist,Quantum Brains,Responsibilities (include but not limited to t...,https://www.glassdoor.de/job-listing/data-scie...,"[responsibility, include, limited, following, ...",en
...,...,...,...,...,...,...
297,Senior Data Engineer (m/f/d),stenon GmbH,stenon is the world’s first real-time soil ana...,https://www.glassdoor.de/job-listing/senior-da...,"[stenon, world, ’, first, realtime, soil, anal...",en
298,Data Architect (m/w/d) - Energy Management,00 - Viessmann Group,What gets you out of bed in the morning and ke...,https://www.glassdoor.de/job-listing/data-arch...,"[get, bed, morning, keep, motivated, throughou...",en
299,Senior scientist (m/f/d),Permacon GmbH\n1.9\n★,We are hiring\n\nAt PERMACON you will find the...,https://www.glassdoor.de/job-listing/senior-sc...,"[hiring, permacon, find, seriousness, security...",en
300,Software Engineer for Algorithmic Data Products,Zalando\n3.5\n★,Department: Offprice Data Science\n\nReports t...,https://www.glassdoor.de/job-listing/software-...,"[department, offprice, data, science, report, ...",en
