In [3]:
# Turn off Jedi for IPython's tab completer (presumably to work around slow or failing completion).
%config Completer.use_jedi=False

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
import json
import string

import nltk
from nltk import word_tokenize, FreqDist  # For Tokenisation & Frequency Distribution after Tokenization 
from nltk.corpus import stopwords # Stopwords
import wordcloud

from PIL import Image
import re
import logging
import gensim
from gensim import corpora
import spacy

In [5]:
# Section headings and boilerplate words of the job posts that carry no topical signal.
# 'renumeration' is kept for backward compatibility; 'remuneration' is the correct spelling.
_CUSTOM_STOPWORDS = frozenset([
    'armenian', 'armenia', 'job', 'title', 'position', 'location', 'responsibilities', 'application', 'procedures',
    'deadline', 'required', 'qualifications', 'renumeration', 'remuneration', 'salary', 'date', 'company',
])
_STOPWORD_CACHE = None


def _get_stopwords():
    """Return the combined NLTK English + custom stopword set, built once and then reused."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = _CUSTOM_STOPWORDS.union(stopwords.words('english'))
    return _STOPWORD_CACHE


def preprocess(tokens):
    """Clean one tokenised document.

    Steps, applied to each token in order:
      1. drop tokens made up only of punctuation characters (e.g. ",", "...", "``");
      2. lower-case the token;
      3. drop English and job-post specific stopwords;
      4. drop tokens shorter than three characters.

    Lemmatisation is not done here.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The surviving lower-cased tokens, in their original order.
    """
    stop = _get_stopwords()  # a set: O(1) membership instead of scanning a list per token
    tokens_clean = []
    for token in tokens:
        # all() is also True for the empty string, so empty tokens are dropped as well.
        if all(ch in string.punctuation for ch in token):
            continue
        token = token.lower()
        if token in stop or len(token) < 3:
            continue
        tokens_clean.append(token)
    return tokens_clean

In [6]:
def make_bigrams(texts):
    """Run every tokenised document through the global ``bigram_mod`` model.

    ``bigram_mod`` must already exist in the notebook namespace when this is
    called. Returns a list holding ``bigram_mod[doc]`` for each document of
    ``texts``, in the same order.
    """
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

In [7]:
def make_trigrams(texts):
    """Run every tokenised document through ``bigram_mod`` and then ``trigram_mod``.

    Both models are read from the notebook namespace and must exist before this
    is called. Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each
    document of ``texts``, in the same order.
    """
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [8]:
# Load the small English spaCy pipeline without the parser and NER components;
# only POS tags and lemmas are used downstream.
# The bare 'en' shortcut link was removed in spaCy 3, so load the model by its
# package name first and fall back to the shortcut for older installs.
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en', disable=['parser', 'ner'])

In [10]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only tokens with an allowed POS tag.

    Each document is re-joined with single spaces, run through the global spaCy
    pipeline ``nlp`` and reduced to the lemmas of the tokens whose coarse
    part-of-speech tag (``token.pos_``) is in ``allowed_postags``.
    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        One list of tokens per document.
    allowed_postags : collection of str
        POS tags to keep. The default is an immutable tuple so it cannot be
        modified between calls.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents in batches and yields them in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [11]:
# Location of the job-posts CSV. Set the JOB_POSTS_CSV environment variable to
# point at the file on another machine; the original local path is the default.
DATA_PATH = os.environ.get(
    "JOB_POSTS_CSV",
    r"E:\DSP\DSP_22nd Nov\Machine Learning\FInal Class\Case Studies & FInal Projects\6\data job posts.csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\nJOB TITL...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\nc...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\nI...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\nJOB...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\nn...,,2004,1,False
3,Manoff Group\nJOB TITLE: BCC Specialist\nPOSI...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\nPe...,,23 January 2004\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\nJOB TITLE: Software D...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\n- CV; \n-...,,"20 January 2004, 18:00",,,,2004,1,True


In [12]:
# (rows, columns) of the frame as loaded from the CSV.
data.shape

(19001, 24)

In [13]:
# Keep only the first posting for each distinct RequiredQual text.
data = data.drop_duplicates(subset=["RequiredQual"], keep="first")

In [14]:
# (rows, columns) after removing rows with a repeated RequiredQual value.
data.shape

(16689, 24)

In [15]:
# Length of the RequiredQual column, i.e. the number of documents to model.
data.RequiredQual.shape

(16689,)

In [16]:
# Make sure every entry is a string before tokenising. Missing values are
# replaced with an empty string first: a plain astype(str) would turn NaN into
# the literal text "nan", which would then be tokenised as a real word.
data['RequiredQual'] = data['RequiredQual'].fillna('').astype(str)

In [17]:
# Split every qualification text into NLTK word tokens.
data['RequiredQual_tokenize'] = data['RequiredQual'].map(word_tokenize)

In [18]:
# Clean each token list with preprocess() and store the result in a new column.
data['RequiredQual_processed'] = data['RequiredQual_tokenize'].apply(preprocess)

In [None]:
# Build the bigram and trigram phrase models (currently disabled).
# make_bigrams() and make_trigrams() read bigram_mod / trigram_mod, so uncomment this cell before calling them.
# bigram = gensim.models.Phrases(data['RequiredQual_processed'], min_count=5, threshold=100)  # higher threshold -> fewer phrases
# trigram = gensim.models.Phrases(bigram[data['RequiredQual_processed']], threshold=100)
# bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

In [19]:
# Replace the cleaned tokens with their lemmas, keeping nouns and verbs only
# (adjectives and adverbs are deliberately left out here).
data['RequiredQual_processed'] = lemmatization(
    data['RequiredQual_processed'],
    allowed_postags=['NOUN', 'VERB'],
)

In [20]:
# Uncomment to see gensim's progress messages while building the dictionary and model:
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Map every distinct lemma to an integer id.
dictionary = corpora.Dictionary(data['RequiredQual_processed'])
print(dictionary)

# Prune the vocabulary: drop lemmas found in fewer than 3 documents
# or in more than 70% of all documents.
dictionary.filter_extremes(no_above=0.7, no_below=3)
print(dictionary)

Dictionary(7572 unique tokens: ['ability', 'accept', 'account', 'accounting', 'accounting/']...)
Dictionary(4101 unique tokens: ['ability', 'accept', 'account', 'accounting', 'accounting/']...)


In [21]:
# Number of topics passed to the LDA model as num_topics.
topic_num = 7

In [22]:
# Use the dictionary to prepare a DTM (using TF)
# Bag-of-words corpus (raw term frequencies): one doc2bow vector per posting.
dtm_train = [dictionary.doc2bow(doc) for doc in data['RequiredQual_processed']]

# Fit the LDA topic model; random_state is fixed so repeated runs give the same topics.
lda = gensim.models.ldamodel.LdaModel(
    corpus=dtm_train,
    id2word=dictionary,
    num_topics=topic_num,
    alpha='auto',
    chunksize=30,
    passes=20,
    random_state=432,
)

In [23]:
# Print the 20 highest-weighted terms of each topic, one topic per line.
# A bare lda.show_topics() call that is not the last expression of the cell
# displays nothing, so only the printed form is kept.
for topic_id, topic_terms in lda.show_topics(num_words=20):
    print(topic_id, topic_terms)

[(0, '0.047*"possession" + 0.046*"background" + 0.039*"drive" + 0.039*"license" + 0.036*"implementation" + 0.034*"security" + 0.027*"deliver" + 0.026*"passion" + 0.022*"equipment" + 0.022*"journalism" + 0.016*"analytic" + 0.014*"guideline" + 0.014*"fund" + 0.014*"learner" + 0.014*"driver" + 0.013*"custom" + 0.012*"driving" + 0.012*"grant" + 0.012*"diplomacy" + 0.011*"search"'), (1, '0.091*"finance" + 0.063*"accounting" + 0.050*"legislation" + 0.040*"standard" + 0.040*"banking" + 0.040*"bank" + 0.034*"account" + 0.026*"study" + 0.024*"software" + 0.022*"linux" + 0.021*"year" + 0.019*"qualification" + 0.019*"ifrs" + 0.017*"audit" + 0.016*"acca" + 0.015*"accept" + 0.015*"system" + 0.015*"reporting" + 0.014*"networking" + 0.013*"case"'), (2, '0.092*"ability" + 0.049*"year" + 0.044*"communication" + 0.039*"degree" + 0.036*"field" + 0.032*"team" + 0.030*"write" + 0.028*"computer" + 0.020*"education" + 0.020*"university" + 0.018*"relate" + 0.017*"office" + 0.014*"fluency" + 0.013*"excel" + 0.

In [24]:
# Topic mixture of every document in the training corpus.
dtopics_train = lda.get_document_topics(dtm_train)

# Show the topic distribution of the first five documents.
for i in range(5):
    print(dtopics_train[i])

[(0, 0.031260747), (1, 0.20658864), (2, 0.27483398), (3, 0.040645115), (4, 0.35231265), (5, 0.038210675), (6, 0.056148198)]
[(0, 0.017064119), (1, 0.0225012), (2, 0.76132786), (3, 0.09044776), (4, 0.06739678), (5, 0.012084629), (6, 0.029177617)]
[(0, 0.023425091), (1, 0.0313267), (2, 0.54872847), (3, 0.18610787), (4, 0.15373828), (5, 0.016583504), (6, 0.040090177)]
[(0, 0.026994148), (1, 0.010329636), (2, 0.392453), (3, 0.0865056), (4, 0.25024354), (6, 0.22825614)]
[(0, 0.05742499), (1, 0.11620826), (2, 0.27895254), (3, 0.44574165), (4, 0.06391717), (5, 0.011060745), (6, 0.026694685)]
