# DreamJobber

---

**Process**
1. Clean text
2. Bag of Words
3. LDA model (Latent Dirichlet allocation) 
4. Classification model

---

**Import Necessary Libraries**

In [1]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
from functions import *

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

---

**Load Data**

In [3]:
df = pd.read_csv('data/combined.csv')

In [4]:
df.head()

Unnamed: 0,job_description,job_title
0,DOJ offers a range of opportunities for experi...,Attorney and Assistant United States Attorney
1,As an FBI Special Agent with a military or law...,Special Agent - Law Enforcement or Military Ve...
2,As an FBI Special Agent with expertise in educ...,Special Agent - Education/Teaching Background
3,As an FBI Special Agent with Accounting/Financ...,Special Agent - Accounting/Finance Background
4,"As an FBI Special Agent, your STEM background ...","Special Agent - Science, Technology, Engineeri..."


In [5]:
#check for missing values
df.isna().sum()

job_description    5
job_title          0
dtype: int64

In [6]:
#looks like there are 5 rows that have no job description
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
job_description    31995 non-null object
job_title          32000 non-null object
dtypes: object(2)
memory usage: 500.1+ KB


In [7]:
#drop rows with no job descriptions
df = df[pd.notnull(df['job_description'])]

In [8]:
#sanity check, looks good
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31995 entries, 0 to 31999
Data columns (total 2 columns):
job_description    31995 non-null object
job_title          31995 non-null object
dtypes: object(2)
memory usage: 749.9+ KB


---

## Text Cleaning

1. Tokenize
2. Remove words with fewer than 3 characters
3. Remove stop words
4. Normalize words (Lemmatize and Stem)

**Test the functions on one row of text**

In [10]:
stemmer = SnowballStemmer('english')

In [11]:
text_sample = df[df.index == 5000].values[0][0]

print('original text: ')
words = []
for word in text_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized text: ')
print(preprocess(text_sample))

original text: 
['Newly', 'appointed', 'employee(s)', 'or', 'employee(s)', 'converted', 'to', 'permanent', 'status,', 'selected', 'under', 'this', 'announcement,', 'may', 'be', 'eligible', 'to', 'apply', 'for', 'an', 'award', 'up', 'to', 'the', 'maximum', 'limitation', 'under', 'the', 'provisions', 'of', 'the', 'Education', 'Debt', 'Reduction', 'Program', '(EDRP).', 'Funding,', 'on', 'the', 'final', 'award', 'amount,', 'is', 'contingent', 'on', 'the', 'availability', 'of', 'EDRP', 'funds.', 'Employee(s)', 'must', 'apply', 'for', 'EDRP', 'within', 'four', '(4)', 'months', 'of', 'appointment', 'or', 'conversion.']


 tokenized and lemmatized text: 
['newli', 'appoint', 'employe', 'employe', 'convert', 'perman', 'status', 'select', 'announc', 'elig', 'appli', 'award', 'maximum', 'limit', 'provis', 'educ', 'debt', 'reduct', 'program', 'edrp', 'fund', 'final', 'award', 'conting', 'avail', 'edrp', 'fund', 'employe', 'appli', 'edrp', 'month', 'appoint', 'convers']


**Apply functions to job_description**

In [12]:
#apply function and display first 5 rows
processed_text = df['job_description'].map(preprocess)
processed_text[:5]

0    [offer, rang, opportun, experi, attorney, work...
1    [special, agent, militari, enforc, background,...
2    [special, agent, expertis, educ, gift, relat, ...
3    [special, agent, account, financi, expertis, e...
4    [special, agent, stem, background, provid, ski...
Name: job_description, dtype: object

---

## Bag of Words

In [13]:
#I'll use bag of words to extract features from text for use in modeling

In [14]:
dictionary = gensim.corpora.Dictionary(processed_text)

In [15]:
#check the length before I filter out the extremes
len(dictionary)

56095

In [16]:
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=100000)

In [17]:
#check length after filtering out extremes
len(dictionary)

24149

In [18]:
#bow2doc: counts the number of occurrences of each distinct word, 
#converts the word to its integer word id and returns the result as a sparse vector

bow2doc_corpus = [dictionary.doc2bow(text) for text in processed_text]

In [19]:
#let's take a look
bow_doc_5000 = bow2doc_corpus[5000]

for i in range(len(bow_doc_5000)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_5000[i][0], 
                                                     dictionary[bow_doc_5000[i][0]], 
                                                     bow_doc_5000[i][1]))

Word 47 ("militari") appears 2 time.
Word 86 ("area") appears 4 time.
Word 98 ("follow") appears 1 time.
Word 168 ("local") appears 1 time.
Word 193 ("announc") appears 2 time.
Word 196 ("day") appears 1 time.
Word 204 ("open") appears 1 time.
Word 255 ("includ") appears 1 time.
Word 285 ("defin") appears 1 time.
Word 308 ("close") appears 1 time.
Word 315 ("consider") appears 2 time.
Word 316 ("date") appears 1 time.
Word 413 ("vacanc") appears 2 time.
Word 638 ("claim") appears 1 time.
Word 642 ("prefer") appears 1 time.
Word 652 ("member") appears 1 time.
Word 659 ("commut") appears 1 time.
Word 691 ("separ") appears 1 time.
Word 712 ("armi") appears 1 time.
Word 905 ("counti") appears 1 time.
Word 1192 ("move") appears 1 time.
Word 1705 ("citizen") appears 1 time.
Word 2246 ("involuntarili") appears 1 time.
Word 2252 ("spous") appears 1 time.
Word 2290 ("monterey") appears 2 time.
Word 2968 ("garrison") appears 1 time.
Word 3887 ("cruz") appears 1 time.
Word 3888 ("presidio") appea

---

## LDA model with Bag of Words

In [22]:
lda_model = gensim.models.LdaMulticore(bow2doc_corpus, 
                                       num_topics=25, 
                                       id2word=dictionary, 
                                       passes=5, 
                                       workers=4)


CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.78 µs
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.06 µs


In [23]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"perform" + 0.019*"abil" + 0.012*"duti" + 0.011*"skill" + 0.010*"equip" + 0.010*"custom" + 0.009*"time" + 0.009*"essenti" + 0.009*"abl" + 0.008*"assign"
Topic: 1 
Words: 0.021*"sale" + 0.016*"claim" + 0.013*"busi" + 0.012*"account" + 0.012*"manag" + 0.010*"insur" + 0.009*"execut" + 0.009*"market" + 0.008*"benefit" + 0.008*"includ"
Topic: 2 
Words: 0.023*"develop" + 0.018*"test" + 0.018*"design" + 0.014*"data" + 0.011*"engin" + 0.010*"product" + 0.010*"team" + 0.010*"null" + 0.009*"project" + 0.008*"technic"
Topic: 3 
Words: 0.017*"support" + 0.016*"skill" + 0.014*"custom" + 0.011*"system" + 0.011*"applic" + 0.011*"network" + 0.011*"technic" + 0.010*"manag" + 0.010*"knowledg" + 0.009*"provid"
Topic: 4 
Words: 0.038*"applic" + 0.030*"locat" + 0.027*"offic" + 0.026*"announc" + 0.021*"nation" + 0.016*"manag" + 0.016*"guard" + 0.015*"vacanc" + 0.013*"open" + 0.013*"date"
Topic: 5 
Words: 0.031*"aflac" + 0.018*"compani" + 0.017*"busi" + 0.016*"develop" + 0.016*"year" +

In [None]:
#next-steps
#find optimal lda model parameters to get a good seperation or topics
#figure out the topics
#use word probability 


---

## LDA model with tf-idf

In [None]:
#from gensim import corpora, models
#tfidf = models.TfidfModel(bow2doc_corpus)
#corpus_tfidf = tfidf[bow2doc_corpus]
#from pprint import pprint
#for doc in corpus_tfidf:
#    pprint(doc)
#    break

In [None]:
#lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
 #                                            num_topics=25, 
  #                                           id2word=dictionary, 
   #                                          passes=10, 
    #                                         workers=4)
#for idx, topic in lda_model_tfidf.print_topics(-1):
#    print('Topic: {} Word: {}'.format(idx, topic))