# Analyse Job Advertisments
In this notebook a bunch of PDF files that contain job advertisments are analysed with the use of the NLTK library. 

The script reads in a list of PDF files, then tokenize and stem the raw text of the PDFs and finally computes the tf-idf values for a list of keywords with the PDF files as foundation for the idf computation.

In [1]:
import nltk
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import PyPDF2 

In [37]:
dataFolder = "./data"
#dataFolder = "/Users/schaer/sciebo/MALIS_Stellen/OpenBiblioJobs"
token_dict = {}

In [38]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [48]:
numFiles = 0
for dirpath, dirs, files in os.walk(dataFolder):
    for f in files:        
        fname = os.path.join(dirpath, f)
        #print("fname=", fname)
        with open(fname, 'rb') as pdfFileObj:            
            try: 
                numFiles=numFiles+1
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj) 
                numPages = pdfReader.getNumPages()
                for numPage in range(numPages):
                    pageObj = pdfReader.getPage(numPage) 
                    text = pageObj.extractText()                
                    token_dict[f] = text.lower().translate(str.maketrans('', '', string.punctuation))
            except:
                print("error in PDF file " + fname)
                numFiles=numFiles-1
            pdfFileObj.close()
print("Read " + str(numFiles) + " files")

Read 13 files


In [49]:
german_stopwords = ["a","ab","aber","ach","acht","achte","achten","achter","achtes","ag","alle","allein","allem","allen","aller","allerdings","alles","allgemeinen","als","also","am","an","ander","andere","anderem","anderen","anderer","anderes","anderm","andern","anderr","anders","au","auch","auf","aus","ausser","ausserdem","außer","außerdem","b","bald","bei","beide","beiden","beim","beispiel","bekannt","bereits","besonders","besser","besten","bin","bis","bisher","bist","c","d","d.h","da","dabei","dadurch","dafür","dagegen","daher","dahin","dahinter","damals","damit","danach","daneben","dank","dann","daran","darauf","daraus","darf","darfst","darin","darum","darunter","darüber","das","dasein","daselbst","dass","dasselbe","davon","davor","dazu","dazwischen","daß","dein","deine","deinem","deinen","deiner","deines","dem","dementsprechend","demgegenüber","demgemäss","demgemäß","demselben","demzufolge","den","denen","denn","denselben","der","deren","derer","derjenige","derjenigen","dermassen","dermaßen","derselbe","derselben","des","deshalb","desselben","dessen","deswegen","dich","die","diejenige","diejenigen","dies","diese","dieselbe","dieselben","diesem","diesen","dieser","dieses","dir","doch","dort","drei","drin","dritte","dritten","dritter","drittes","du","durch","durchaus","durfte","durften","dürfen","dürft","e","eben","ebenso","ehrlich","ei","ei,","eigen","eigene","eigenen","eigener","eigenes","ein","einander","eine","einem","einen","einer","eines","einig","einige","einigem","einigen","einiger","einiges","einmal","eins","elf","en","ende","endlich","entweder","er","ernst","erst","erste","ersten","erster","erstes","es","etwa","etwas","euch","euer","eure","eurem","euren","eurer","eures","f","folgende","früher","fünf","fünfte","fünften","fünfter","fünftes","für","g","gab","ganz","ganze","ganzen","ganzer","ganzes","gar","gedurft","gegen","gegenüber","gehabt","gehen","geht","gekannt","gekonnt","gemacht","gemocht","gemusst","genug","gerade","gern","gesagt","geschweige","gewesen","gewollt","geworden","gibt","ging","gleich","gott","gross","grosse","grossen","grosser","grosses","groß","große","großen","großer","großes","gut","gute","guter","gutes","h","hab","habe","haben","habt","hast","hat","hatte","hatten","hattest","hattet","heisst","her","heute","hier","hin","hinter","hoch","hätte","hätten","i","ich","ihm","ihn","ihnen","ihr","ihre","ihrem","ihren","ihrer","ihres","im","immer","in","indem","infolgedessen","ins","irgend","ist","j","ja","jahr","jahre","jahren","je","jede","jedem","jeden","jeder","jedermann","jedermanns","jedes","jedoch","jemand","jemandem","jemanden","jene","jenem","jenen","jener","jenes","jetzt","k","kam","kann","kannst","kaum","kein","keine","keinem","keinen","keiner","keines","kleine","kleinen","kleiner","kleines","kommen","kommt","konnte","konnten","kurz","können","könnt","könnte","l","lang","lange","leicht","leide","lieber","los","m","machen","macht","machte","mag","magst","mahn","mal","man","manche","manchem","manchen","mancher","manches","mann","mehr","mein","meine","meinem","meinen","meiner","meines","mensch","menschen","mich","mir","mit","mittel","mochte","mochten","morgen","muss","musst","musste","mussten","muß","mußt","möchte","mögen","möglich","mögt","müssen","müsst","müßt","n","na","nach","nachdem","nahm","natürlich","neben","nein","neue","neuen","neun","neunte","neunten","neunter","neuntes","nicht","nichts","nie","niemand","niemandem","niemanden","noch","nun","nur","o","ob","oben","oder","offen","oft","ohne","ordnung","p","q","r","recht","rechte","rechten","rechter","rechtes","richtig","rund","s","sa","sache","sagt","sagte","sah","satt","schlecht","schluss","schon","sechs","sechste","sechsten","sechster","sechstes","sehr","sei","seid","seien","sein","seine","seinem","seinen","seiner","seines","seit","seitdem","selbst","sich","sie","sieben","siebente","siebenten","siebenter","siebentes","sind","so","solang","solche","solchem","solchen","solcher","solches","soll","sollen","sollst","sollt","sollte","sollten","sondern","sonst","soweit","sowie","später","startseite","statt","steht","suche","t","tag","tage","tagen","tat","teil","tel","tritt","trotzdem","tun","u","uhr","um","und","und?","uns","unse","unsem","unsen","unser","unsere","unserer","unses","unter","v","vergangenen","viel","viele","vielem","vielen","vielleicht","vier","vierte","vierten","vierter","viertes","vom","von","vor","w","wahr?","wann","war","waren","warst","wart","warum","was","weg","wegen","weil","weit","weiter","weitere","weiteren","weiteres","welche","welchem","welchen","welcher","welches","wem","wen","wenig","wenige","weniger","weniges","wenigstens","wenn","wer","werde","werden","werdet","weshalb","wessen","wie","wieder","wieso","will","willst","wir","wird","wirklich","wirst","wissen","wo","woher","wohin","wohl","wollen","wollt","wollte","wollten","worden","wurde","wurden","während","währenddem","währenddessen","wäre","würde","würden","x","y","z","z.b","zehn","zehnte","zehnten","zehnter","zehntes","zeit","zu","zuerst","zugleich","zum","zunächst","zur","zurück","zusammen","zwanzig","zwar","zwei","zweite","zweiten","zweiter","zweites","zwischen","zwölf","über","überhaupt","übrigens"]
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=german_stopwords)
tfs = tfidf.fit_transform(token_dict.values())



In [63]:
token_list = []
for token in token_dict.values():
    t = tokenize(token)    
    token_list.extend(t)
#response = tfidf.transform([list(token_dict.values())])


In [68]:
token_dict.values()

dict_values(['3\n\n3\n \n \n \n \n \n \nprof dr lorenz lorenz\n\nmeyer\n \nonlinejournalismus  medienentwicklung\n \nqualifikationsprofil des \nbachelors\n \nor an hier li\n\n\n \n \ninsgesamt machen die\n \nzur re\n\nakkreditierung anstehenden programme laut u\nnterlagen einen \nsehr \n\ngen geschulten eindruck \ndie curricula sind einleuchtend und \numfassen sowohl wissenschaftliche als auch berufspraktische aspekte\n \n\n \n\nwerden\n \nin den profilen der lehrenden ebenso sichtbar wie im \nzusc\nhnitt der lehrprojekte und in besonderen veranstaltungsformen \n \n \n\n \n \n \n \n \n \nprof dr l lorenz\n\nmeyer\n \n', '', '\n˘\n\n˘ˇ˘ˆ˘˘\n˙˘˛ \n˘˘\n\n  ˚ ˘\nˇ\n˘˘˙˘\n˚ \n˘\n˘˘\n\n1˘\n\n˛˘\n\n\n˘\n\n˘\n˘˘\n˛\n˙\n˘3˘\n˘˘\nˇ˘\n˘˘\n˛\n4\n˘\n˘˘\n ˘\n˘7\n8\n\n99˚˜\n9˘\n˘\n\n ˚\n˘\n˘\n\n \n˛\n˛˘\n\n8˚\n0b9cc9d \n˘˘\n \n4\n\nˇ\n\n\nccc˘\n˙˘˘˘\nc9fg\n\n˘˘\n5\n4\n\n3\n\n\n˘\n3\n ˘˘\n\n4\nˆ\nˆ˙ˇ\n˙˙\n\n0˘\n\n\n˘\n\n˘˘i˘˘˚\n˚\nj˘\n˘\n\n˘\n˘\n˘˘3˘˘˘˚\n\n4\n\nˆ\nˇ\n\nˆ˘˘\n˘˘\n˝\n˘\n\n\n0\n497\n˘˘\n˘

In [65]:
term_list = 'fakultät data information science open access computer library'
response = tfidf.transform([term_list])
feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    if(response[0, col] > 0):
        print(feature_names[col], ' - ', response[0, col])

inform  -  0.5773502691896258
fakultät  -  0.5773502691896258
data  -  0.5773502691896258


In [70]:
print(tfs)

  (0, 431)	0.13794322168981463
  (0, 104)	0.13794322168981463
  (0, 261)	0.13794322168981463
  (0, 202)	0.13794322168981463
  (0, 465)	0.13794322168981463
  (0, 363)	0.13794322168981463
  (0, 260)	0.12232241854045538
  (0, 324)	0.13794322168981463
  (0, 72)	0.13794322168981463
  (0, 101)	0.13794322168981463
  (0, 61)	0.1102059822974133
  (0, 453)	0.13794322168981463
  (0, 366)	0.12232241854045538
  (0, 416)	0.13794322168981463
  (0, 143)	0.13794322168981463
  (0, 127)	0.13794322168981463
  (0, 140)	0.13794322168981463
  (0, 189)	0.13794322168981463
  (0, 184)	0.13794322168981463
  (0, 288)	0.13794322168981463
  (0, 258)	0.13794322168981463
  (0, 326)	0.13794322168981463
  (0, 68)	0.13794322168981463
  (0, 58)	0.13794322168981463
  (0, 337)	0.13794322168981463
  :	:
  (11, 1)	0.03522599897700464
  (11, 345)	0.03522599897700464
  (11, 272)	0.03522599897700464
  (11, 206)	0.07045199795400928
  (11, 301)	0.07045199795400928
  (11, 70)	0.03522599897700464
  (11, 2)	0.03522599897700464
  (11

In [66]:
import collections
tokens = token_list
for sw in stopwords:
    try:
        tokens = list(filter(lambda a: a != sw, tokens)) # remove stopwords from list
    except:
        print(sw + " not in list")
ctr = collections.Counter(tokens)
print("Frequency of the elements in the List : ",ctr)


Frequency of the elements in the List :  Counter({'˘': 25, 'done': 23, '˘˘': 13, '3': 8, 'data': 8, 'al': 7, '4': 7, 'recommend': 7, 'rate': 7, 'set': 7, 'dr': 6, 'lorenz': 6, 'de': 6, 'prof': 5, 'name': 5, 'tabl': 5, 'text': 5, 'key': 5, 'meyer': 4, '˚': 4, '8': 4, 'item': 4, 'and': 4, 'infotyp': 4, 'sowohl': 3, 'lehrenden': 3, 'ˇ': 3, '˛': 3, 'wert': 3, 'attribut': 3, '10': 3, 'movi': 3, 'music': 3, 'punkt': 3, '20': 3, 'stellt': 3, '2': 3, 'fakultät': 3, 'vorgelegt': 3, 'action': 3, 'db': 3, 'absenc': 3, 'onlinejournalismu': 2, 'medienentwicklung': 2, 'qualifikationsprofil': 2, 'bachelor': 2, 'or': 2, 'li': 2, 'insgesamt': 2, 're': 2, 'akkreditierung': 2, 'anstehenden': 2, 'programm': 2, 'laut': 2, 'nterlagen': 2, 'gen': 2, 'geschulten': 2, 'eindruck': 2, 'curricula': 2, 'einleuchtend': 2, 'umfassen': 2, 'wissenschaftlich': 2, 'berufspraktisch': 2, 'aspekt': 2, 'profilen': 2, 'sichtbar': 2, 'zusc': 2, 'hnitt': 2, 'lehrprojekt': 2, 'besonderen': 2, 'veranstaltungsformen': 2, '˛˘': 2,