# Exploratory analysis for NLP models

Vlad scraped Glassdoor last year for job descriptions. Here is the raw data:

In [1]:
# packages needed
#!pip install langdetect
#!pip install nltk
#!pip install pyLDAvis
#!pip install gensim

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from time import sleep
from langdetect import detect
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#import pyLDAvis

In [2]:
# Raw Glassdoor scrape; going by the file name: "data scientist in Berlin", dated 2020-09-03.
RAW_DATA_PATH = "../../../glassdoor_scraper_output/data_scientist_in_berlin_2020-09-03.json"
df = pd.read_json(RAW_DATA_PATH)

In [3]:
df.head(5)

Unnamed: 0,position,company,description,url
0,Data Scientist (m/f/x),Deutsche Bank AG\n3.6\n★,Job Description:\n\n\nTG 8\n\nDetails of the r...,https://www.glassdoor.de/job-listing/data-scie...
1,Studentische Aushilfe (m/w/d) Data Science im ...,Federal Association of the AOK\n3.9\n★,Vielfältige Aufgaben warten auf Sie\nUnterstüt...,https://www.glassdoor.de/job-listing/studentis...
2,Data Architect (m/w/d) - Energy Management,Viessmann Werke Berlin Gmbh\n4.0\n★,What gets you out of bed in the morning and ke...,https://www.glassdoor.de/job-listing/data-arch...
3,Data Scientist Bioinformatics (m/f/d),Centogene AG\n2.2\n★,"Welcome at CENTOGENE!\n\nWe, CENTOGENE GmbH (""...",https://www.glassdoor.de/job-listing/data-scie...
4,Data Scientist,Marley Spoon\n4.0\n★,Marley Spoon is the new way to cook. We bring ...,https://www.glassdoor.de/job-listing/data-scie...


## 1. Preprocessing functions
done by Alex

In [4]:
df.shape

(302, 4)

In [32]:
## change case to lower
def to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered
# Start the "clean" column from the raw description text, lower-cased.
df["clean"] = df["description"].map(to_lower)

In [33]:
## remove numbers from the corpus
def remove_number(text):
    """Return ``text`` with every digit character stripped out.

    Filtering happens per character (``str.isdigit``), so digits embedded
    in a token are removed while the surrounding characters are kept.
    """
    kept_chars = [char for char in text if not char.isdigit()]
    return ''.join(kept_chars)

# Strip digit characters from the lower-cased descriptions.
df["clean"] = df["clean"].map(remove_number)

In [34]:
## remove punctuation characters from the text
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so tokens joined by
    punctuation are fused (``"m/f/x"`` becomes ``"mfx"``). Symbols that are
    not in ``string.punctuation`` are left untouched.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

# Delete punctuation characters from the descriptions.
df["clean"] = df["clean"].map(remove_punctuation)

In [84]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [35]:
## tag the language of the dataframe
from time import sleep
from langdetect import detect

def tag_language(text, fallback="unknown"):
    """Detect the language of ``text`` with ``langdetect``.

    Parameters
    ----------
    text : str
        Text to classify.
    fallback : str, optional
        Value returned when no language can be determined (empty or
        whitespace-only text, or text without usable features).
        Defaults to ``"unknown"``.

    Returns
    -------
    str
        The value returned by ``langdetect.detect`` (the notebook later
        filters on ``"en"``), or ``fallback`` when detection is not possible.

    Notes
    -----
    langdetect documents its algorithm as non-deterministic unless
    ``DetectorFactory.seed`` is set; the seed is deliberately not touched here.
    """
    # Nothing to detect in an empty / whitespace-only string; ``detect``
    # raises for such input instead of returning a code.
    if not text or not text.strip():
        return fallback

    # Imported lazily: only needed once detection is actually attempted.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Text without any usable features (e.g. only symbols): keep the
        # row and tag it with the fallback instead of aborting the column.
        return fallback

# Tag every row with the language detected from its cleaned description.
df["language"] = df["clean"].map(tag_language)

In [36]:
## remove stopwords and tokenize the text
def remove_stopwords(text, language='english'):
    """Tokenize ``text`` and drop the stop words of ``language``.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.tokenize.word_tokenize``.
    language : str, optional
        Name of the NLTK stop-word list to filter with. Defaults to
        ``'english'``, which is what the notebook originally hard-coded.
        Only the stop-word list depends on this value; tokenization uses
        ``word_tokenize``'s default.

    Returns
    -------
    list of str
        The tokens of ``text``, in order, that are not in the stop-word list.
    """
    # A set gives constant-time membership checks in the filter below.
    stop_words = set(stopwords.words(language))

    return [token for token in word_tokenize(text) if token not in stop_words]


# From here on "clean" holds a list of tokens per row instead of a string.
df["clean"] = df["clean"].map(remove_stopwords)

In [37]:
## lemmatize the output

def lemmatize_words(text):
    """Lemmatize each item of ``text`` with NLTK's ``WordNetLemmatizer``.

    ``text`` is iterated item by item and every item is passed to
    ``lemmatize`` with its default arguments; the results are returned as
    a new list in the same order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, text))

In [38]:
df['clean'][0][:10]

['job',
 'description',
 'tg',
 'details',
 'role',
 'fits',
 'team',
 'reporting',
 'team',
 'lead']

## 2. Exploration of LDA

Main aim: try out LDA combined with different tokenizing methods. LDA is an unsupervised method, so it requires no labelled input from a human.

Output: extract topics and keywords from job descriptions

### E_01
try approach from: https://medium.com/analytics-vidhya/data-science-job-search-using-nlp-and-lda-in-python-12ecbfac79f9

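steps (as outlined by the cells below):
- build a gensim dictionary and bag-of-words corpus from the cleaned English descriptions
- implement LDA
- calculate and cross-validate coherence scores
- visualise the topics with pyLDAvis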



In [39]:
# Lemmatize the per-row contents of the "clean" column.
df["clean"] = df["clean"].map(lemmatize_words)

In [113]:
# Select the English descriptions and renumber the rows from 0.
# reset_index(drop=True) discards the old index in one step, so there is no
# in-place mutation of the filtered frame and no temporary index column to
# rename and drop afterwards.
df_eng = df[df["language"] == "en"].reset_index(drop=True)
df_eng.head(2)

Unnamed: 0,position,company,description,url,clean,language
0,Data Scientist (m/f/x),Deutsche Bank AG\n3.6\n★,Job Description:\n\n\nTG 8\n\nDetails of the r...,https://www.glassdoor.de/job-listing/data-scie...,"[job, description, tg, detail, role, fit, team...",en
1,Data Architect (m/w/d) - Energy Management,Viessmann Werke Berlin Gmbh\n4.0\n★,What gets you out of bed in the morning and ke...,https://www.glassdoor.de/job-listing/data-arch...,"[get, bed, morning, keep, motivated, throughou...",en


In [115]:
df_eng.shape

(216, 6)

In [116]:
# Build a bag-of-words representation of the English descriptions with gensim.

# One entry per description, taken from the "clean" column
descriptions = df_eng['clean']

# Dictionary built from all descriptions; maps integer ids to words
id2word = corpora.Dictionary(descriptions)

# Term-document frequency: every description converted to its bag of words
corpus = list(map(id2word.doc2bow, descriptions))

In [117]:
# View encoded words
print(corpus[0][:10]) 

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 2), (5, 2), (6, 1), (7, 2), (8, 8), (9, 1)]


In [119]:
# access words
id2word[0]

'able'

In [143]:
len(corpus)

216

In [148]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 2),
 (4, 2),
 (5, 2),
 (6, 1),
 (7, 2),
 (8, 8),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 10),
 (15, 1),
 (16, 2),
 (17, 1),
 (18, 2),
 (19, 1),
 (20, 2),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 2),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 2),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 2),
 (43, 12),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 2),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 3),
 (57, 1),
 (58, 2),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 2),
 (65, 1),
 (66, 2),
 (67, 3),
 (68, 3),
 (69, 1),
 (70, 2),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 4),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 2),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 2),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 6),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 2),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 2),
 (99, 1),
 (100, 2

In [154]:
# Tokens of the first English description, sorted; show the first 50.
first_doc_tokens = pd.Series(descriptions[0])
tmp = first_doc_tokens.sort_values()
tmp.iloc[:50]

215           able
132     accredited
226         across
100       adoption
114       adoption
165       advanced
85        advanced
202          agile
71           agile
162             ai
15         analyst
40         analyst
23       analytics
31       analytics
87       analytics
59       analytics
76       analytics
70       analytics
168      analytics
11       analytics
297    application
115          apply
154       applying
283    arrangement
214     articulate
46           audit
33           audit
18           audit
29           audit
81           audit
237          audit
95           audit
109          audit
123          audit
105          audit
63         auditor
34      automation
90      automation
129       bachelor
112     background
264     background
151        banking
51          become
120         become
307         belief
13          berlin
177            big
218          broad
173          broad
267          build
dtype: object

In [153]:
def print_word_frequencies(corpus, id2word, n_docs=3):
    """Print the ``(word, count)`` pairs of the first ``n_docs`` documents.

    Parameters
    ----------
    corpus : sequence
        Bag-of-words documents; each one is iterated as ``(id, count)`` pairs.
    id2word : mapping
        Lookup from an id to its word (indexed as ``id2word[id]``).
    n_docs : int, optional
        Number of leading documents to print. Slicing is used, so a corpus
        shorter than ``n_docs`` is printed in full instead of raising.
    """
    for bow in corpus[:n_docs]:
        # ``token_id`` rather than ``id`` so the builtin ``id`` is not shadowed.
        for token_id, freq in bow:
            print((id2word[token_id], freq))


# Words and their counts for the first three descriptions (per document, not per topic)
print_word_frequencies(corpus, id2word, n_docs=3)

('able', 1)
('accredited', 1)
('across', 1)
('adoption', 2)
('advanced', 2)
('agile', 2)
('ai', 1)
('analyst', 2)
('analytics', 8)
('application', 1)
('apply', 1)
('applying', 1)
('arrangement', 1)
('articulate', 1)
('audit', 10)
('auditor', 1)
('automation', 2)
('bachelor', 1)
('background', 2)
('banking', 1)
('become', 2)
('belief', 1)
('berlin', 1)
('big', 1)
('broad', 2)
('build', 1)
('business', 1)
('case', 1)
('center', 1)
('chemistry', 1)
('click', 1)
('coe', 2)
('collaboratively', 1)
('colleague', 1)
('college', 1)
('committed', 1)
('computer', 1)
('conduct', 1)
('confluence', 1)
('country', 1)
('create', 1)
('creative', 1)
('culture', 2)
('data', 12)
('date', 1)
('define', 1)
('degree', 1)
('deliver', 1)
('description', 1)
('detail', 1)
('develop', 2)
('developer', 1)
('different', 1)
('directly', 1)
('disability', 1)
('discrimination', 1)
('diverse', 3)
('diversity', 1)
('drive', 2)
('dynamic', 1)
('economics', 1)
('effective', 1)
('efficient', 1)
('embrace', 1)
('encourage',

In [None]:
# implement LDA

In [None]:
# calculate coherence score

In [None]:
# crossvalidate with coherence scores

In [None]:
# visualise w/ pyLDAvis