#### Pipeline to process user input and return possible future jobs
This notebook processes the current job description entered by a user and returns suggestions for possible future jobs. It consists of the following parts:
<br>1) Preprocess and vectorize the text of the user input.
<br>2) Extract the document-topic matrix for the user's current job (job i).
<br>3) Calculate the topic weights of the future job (job i+1) by multiplying the topic-weight matrix of job i by the transition matrix.
<br>4) Find the jobs in the dataset that are most similar to the predicted future job, and retrieve the title, top skills (i.e. the top topics), and keywords of each similar job.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import pickle
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.decomposition import NMF
from gensim.summarization import keywords

In [7]:
# Lift pandas' display truncation: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
def _load_pickle(path):
    """Unpickle one artifact from `path`, closing the file handle afterwards.

    WARNING: unpickling can execute arbitrary code. Only load files that were
    produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts consumed by the pipeline functions defined further down:
# the job dataset, the topic transition matrix, the text vectorizer and
# the NMF topic model.
df = _load_pickle('work_exp_train.pkl')
transition_mat = _load_pickle('transition_mat.pkl')
vectorizer = _load_pickle('vectorizer.pkl')
nmf = _load_pickle('nmf.pkl')

In [8]:
# Preview the first five rows of the loaded job dataset.
df.head(n=5)

Unnamed: 0,resume_id,job_id,job_title_processed,job_description_processed,title_and_desc,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,highest_topic1,highest_topic2,highest_topic3
0,1,0,data scientist,manager smart solution team information planni...,data scientist manager smart solution team inf...,0.374065,0.0,0.0,0.0,0.025764,0.0,0.08613,0.0,0.0,0.041831,0.0,0.060487,0.0,0.258667,0.030413,0.0,0.122643,0.0,0.0,0.0,Topic 1,Topic 14,Topic 17
1,1,1,business inteligence consultant,social medium analysis participate development...,business inteligence consultant social medium ...,0.0,0.0,0.094456,0.0,0.283788,0.0,0.019943,0.0,0.106125,0.0,0.121584,0.062855,0.0,0.0,0.124647,0.182327,0.004276,0.0,0.0,0.0,Topic 5,Topic 16,Topic 15
2,2,0,director business relation,gimme data scientist business solution present...,director business relation gimme data scientis...,0.043636,0.0,0.0,0.0,0.406016,0.002058,0.093207,0.240388,0.050646,0.0,0.0,0.0,0.0,0.009227,0.154823,0.0,0.0,0.0,0.0,0.0,Topic 5,Topic 8,Topic 15
3,2,1,senior data analyst customer interaction decis...,sa fulltime translated business objective data...,senior data analyst customer interaction decis...,0.010189,0.066499,0.004399,0.042836,0.21031,0.036869,0.356958,0.074922,0.055022,0.013159,0.0,0.0,0.0,0.0,0.103456,0.01533,0.010051,0.0,0.0,0.0,Topic 7,Topic 5,Topic 15
4,3,0,freelance developer,toronto canada completed freelancer project in...,freelance developer toronto canada completed f...,0.0,0.0,0.462758,0.0,0.0,0.0,0.26747,0.0,0.0,0.122256,0.020006,0.002502,0.0,0.0,0.057464,0.028745,0.038407,0.0,0.0,0.000393,Topic 3,Topic 7,Topic 10


In [5]:
def preprocess(text):
    """Clean a raw text string and return it as space-separated lemmas.

    Every character that is not an ASCII letter (digits, punctuation, ...)
    is replaced by a space and runs of whitespace are collapsed. The result
    is tokenized on word characters, lowercased, stripped of English
    stopwords, and the remaining tokens are lemmatized.

    text -- raw string to clean
    returns -- one string with the processed tokens joined by single spaces
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    squeezed = re.sub(r'\s+', ' ', letters_only)

    kept = []
    for raw_token in word_tokenizer.tokenize(squeezed):
        word = raw_token.lower()
        # stopword check happens on the lowercased form, before lemmatizing
        if word in english_stopwords:
            continue
        kept.append(wordnet.lemmatize(word))

    return ' '.join(kept)

In [15]:
def next_job_topics(description, vectorizer, model, transition_matrix):
    """Predict the topic weights of the job following the one described.

    The description is preprocessed, vectorized and projected onto the
    model's topics. Those weights are normalized to sum to 1 and multiplied
    by the transition matrix to obtain the topic weights of the next job.

    description -- new job description to be analyzed (raw text)
    vectorizer -- fitted vectorizer for text processing (must expose `transform`)
    model -- fitted nmf model (must expose `transform`)
    transition_matrix -- probabilities of transitioning from each of 20 topics
    at job i to each of 20 topics at job i+1

    returns -- result of `np.dot(normalized_weights, transition_matrix)`;
    presumably 1 x 20 for a single description -- confirm against the model.
    If the description yields no topic weight at all (e.g. it is empty or
    contains only stopwords / unknown words), the weights are left at zero
    instead of being divided by zero, so the result is all zeros rather
    than NaN.
    """
    desc = preprocess(description)
    desc_vec = vectorizer.transform([desc])

    # topic weights for the description of the current job
    current_topics = model.transform(desc_vec)

    # normalize topic weights so that they sum up to 1; skip when the sum is
    # zero because 0/0 would fill the vector with NaN
    total_weight = np.sum(current_topics)
    if total_weight > 0:
        current_topics = current_topics / total_weight

    # topic weights for the next job (job i+1)
    return np.dot(current_topics, transition_matrix)

In [16]:
def job_similarity(topic_weights1, topic_weights2):
    """Return cosine similarity between two jobs each represented by topic weights.

    topic_weights1 -- 1X20 matrix (20 topics)
    topic_weights2 -- 1X20 matrix (20 topics)

    returns -- `np.dot(topic_weights1, topic_weights2.T)` divided by the
    product of the two norms; the shape is whatever that dot product yields.
    When either input has zero norm the cosine is undefined; 0.0 is returned
    (in the same shape) instead of NaN from a division by zero.
    """
    dot_product = np.dot(topic_weights1, topic_weights2.T)
    norm_product = np.linalg.norm(topic_weights1) * np.linalg.norm(topic_weights2)

    if norm_product == 0:
        # a job with no topic weight is treated as similar to nothing
        return np.zeros_like(dot_product, dtype=float)

    return dot_product / norm_product

In [68]:
def next_job_output(description, vectorizer, model, transition_matrix, df, n_similar=6):
    """Find the jobs in the dataset most similar to the predicted future job.

    description -- current job description entered by the user (raw text)
    vectorizer -- fitted vectorizer, forwarded to `next_job_topics`
    model -- fitted nmf model, forwarded to `next_job_topics`
    transition_matrix -- topic transition matrix, forwarded to `next_job_topics`
    df -- job dataset; must contain the 'Topic ...' weight columns plus
    'job_title_processed', 'job_description_processed' and
    'highest_topic1' .. 'highest_topic3'
    n_similar -- number of similar jobs to return (default 6)

    returns -- list with one entry per similar job, most similar first:
    [job title, skill 1, skill 2, skill 3, keywords], where the skills are
    the labels of the job's three highest topics and keywords is a
    space-separated string extracted from the job description. Fewer than
    `n_similar` entries are returned when the dataset is smaller than that.

    Side effect: a 'similarity' column is written to (or overwritten on) `df`.
    """
    # topic weights of predicted future job
    next_job = next_job_topics(description, vectorizer, model, transition_matrix)

    # cosine similarity between the predicted job and every job in the dataset.
    # The whole column is assigned at once: this is positional, so it does not
    # depend on df having a 0..n-1 index (label-based `df.loc[i, ...]` would
    # append new rows on any other index) and avoids a slow per-row .loc write.
    topic_cols = [col for col in df.columns if 'Topic' in col]
    topics_all = df[topic_cols].values
    df['similarity'] = [
        float(np.ravel(job_similarity(row, next_job))[0]) for row in topics_all
    ]

    # most similar jobs (at most n_similar)
    top_jobs = df.sort_values(by=['similarity'], ascending=False)[:n_similar].reset_index(drop=True)

    # keywords from each job description; `keywords` returns them separated by
    # newlines, which are turned into spaces
    top_jobs['keyterms'] = top_jobs['job_description_processed'].apply(
        lambda x: keywords(x, ratio=0.18, words=None, split=False, scores=False,
                           pos_filter=None, lemmatize=True))
    top_jobs['keyterm_list'] = top_jobs['keyterms'].apply(lambda x: ' '.join(x.split('\n')))

    # human-readable skill label for each topic
    topic_dict = {'Topic 1':'data warehousing', 'Topic 2':'project management',
                  'Topic 3':'web development', 'Topic 4':'network management',
                  'Topic 5':'statistical analysis', 'Topic 6':'product testing',
                  'Topic 7':'machine learning', 'Topic 8': 'customer support',
                  'Topic 9':'business solutions', 'Topic 10':'management/leadership',
                  'Topic 11': 'systems management', 'Topic 12': 'enterprise data architecture',
                  'Topic 13': 'IT technical support', 'Topic 14': 'big data tools',
                  'Topic 15': 'marketing', 'Topic 16': 'data reporting',
                  'Topic 17': 'academic research', 'Topic 18': 'database management',
                  'Topic 19': 'SQL server tools', 'Topic 20': 'client relationship'}

    # iterate over the rows actually present rather than a fixed range(6),
    # which raised KeyError when fewer than 6 jobs were available
    example_list = []
    for job_title, topic1, topic2, topic3, keyterms in zip(
            top_jobs['job_title_processed'],
            top_jobs['highest_topic1'],
            top_jobs['highest_topic2'],
            top_jobs['highest_topic3'],
            top_jobs['keyterm_list']):
        example_list.append([job_title, topic_dict[topic1], topic_dict[topic2],
                             topic_dict[topic3], keyterms])

    return example_list

In [74]:
# Sample user input: free-text description of the user's current job.
description = (
    'Conceptualize the technology process workflow and manage a team of '
    'software developers. Improve the existing regression models (neural '
    'networks) written in R and suggest new ways to implement in Python and '
    'TensorFlow. Deploy the services on Google Cloud with SQL support and '
    'use C# for client/server programing and ASP pages.'
)

In [80]:
# Run the full pipeline on the sample description.
examples = next_job_output(
    description=description,
    vectorizer=vectorizer,
    model=nmf,
    transition_matrix=transition_mat,
    df=df,
)

In [81]:
# Each entry is [job title, skill 1, skill 2, skill 3, keywords] for one
# similar job, as assembled by next_job_output.
# NOTE(review): the saved output below contains skill labels such as
# 'data warehouse/governance' and 'network infrastructure' that are not values
# of topic_dict as currently defined, so it looks like it was produced by an
# earlier version of the code -- re-run the notebook to refresh it.
examples

[['consultant bi data scientist',
  'machine learning',
  'data warehouse/governance',
  'big data tools',
  'data management analysis solution energy development ssis package configuration include modeling support performance measurement failure detection report dashboard network visualization python script scale design database source equipment electrical topological web sparql rdf hadoop integrate time machine meter participated analytics process'],
 ['lead data scientist algorithm development',
  'machine learning',
  'client relationship',
  'SQL server tools',
  'way use predictive regression device given programing asp research phase'],
 ['kpi manager',
  'machine learning',
  'data reporting',
  'data warehouse/governance',
  'kpis developed modeling scorecard security metadata page data primarily definition analyze fix custom financial'],
 ['junior software developer',
  'machine learning',
  'project management',
  'network infrastructure',
  'domain update internet malicious