In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.externals import joblib

In [4]:
description = 'Developed data mining applications to scrape data directly from html and plain text. Performed complex SQL queries to aggregate data from disparate sources. Generated databases that exceed - in terms of size and quality - those provided by commercial vendors, giving Odette Faculty a competitive advantage with respect to the originality of their research. Performed ad hod analysis (linear/logistic/multivariate regressions, survival analysis and classification. Communicated findings with visualizations produced using Python data visualization library matplotlib.'

In [12]:
vectorizer = joblib.load('vectorizer.pkl')

In [8]:
model = joblib.load('nmf_topic_model.pkl')

In [2]:
df = pd.read_pickle('work_exp.pickle')

In [3]:
df.head()

Unnamed: 0,resume_id,job_type,job_id,job_title,job_duration,job_description,lang,job_title_processed,job_description_processed,Topic 1,...,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,highest_topic1,highest_topic2,highest_topic3
0,1,data+scientist,0,Data Scientist,February 2013 to Present,"2016 ~ Manager, Smart Solution Team in Informa...",en,data scientist,manager smart solution team information planni...,0.86474,...,0.0,0.058644,0.0,0.035346,0.236468,0.179737,0.0,Topic 1,Topic 7,Topic 13
1,1,data+scientist,1,Business Inteligence Consultant,January 2006 to February 2013,2011 Social Media Analysis- Participate in the...,en,business inteligence consultant,social medium analysis participate development...,0.01317,...,0.380385,0.0,0.022767,0.604392,0.03565,0.220245,0.0,Topic 12,Topic 5,Topic 9
2,2,data+scientist,0,Director of Business Relations,May 2018 to Present,GIMME360 Data Scientist-Business Solution 20...,en,director business relation,gimme data scientist business solution present...,0.192328,...,0.199249,0.0,0.0,0.637112,0.0,0.0,0.0,Topic 12,Topic 8,Topic 7
3,2,data+scientist,1,Senior Data Analyst-Customer Interaction Decis...,May 2014 to July 2016,SAS Fulltime• Translated business objectives ...,en,senior data analyst customer interaction decis...,sa fulltime translated business objective data...,0.038701,...,0.169296,0.058586,0.0,0.400573,0.07471,0.0,0.0,Topic 7,Topic 12,Topic 9
4,3,data+scientist,0,FREELANCE DEVELOPER,March 2016 to Present,"Toronto, Canada• Completed 30+ freelancer Proj...",en,freelance developer,toronto canada completed freelancer project in...,0.0,...,0.0,0.253182,0.547395,0.134815,0.033566,0.091773,0.0,Topic 7,Topic 11,Topic 10


In [2]:
def preprocess(text):
    """ preprocess text: remove special characters, remove digits, tokenize,
    lowercase, remove stopwords, lemmatize
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stopwords_en = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
        
    text = re.sub('[^a-zA-Z]', ' ', text )
    text = re.sub(r'\s+', ' ', text)
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords_en]
    return ' '.join(tokens)

In [13]:
def similarity(description, vectorizer, model, df):
    """ calculate cosine similarity between new job description and job descriptions in dataset
    
    description -- new job description to be analyzed
    vectorizer -- vectorizer for text processing
    model -- nmf model
    df -- df containing resume and job ids and topic weights
    """
    desc = preprocess(description)
    desc_vec = vectorizer.transform([desc])
    # get normalized topic weights for description
    desc_nmf = normalize(model.transform(desc_vec))
    
    # get topic weights from df
    topic_cols = [col for col in df.columns if 'Topic' in col]
    norm_features = df.as_matrix(columns=topic_cols)
    
    # calculate cosine similarity between input description and training descriptions
    cos_sim = norm_features.dot(desc_nmf.T)
    return cos_sim

In [15]:
cos_sim = similarity(description, vectorizer, model, df)

In [29]:
def transitions(description, vectorizer, model, df):
    """ get descriptive info about topic changes at job transitions. 
    
    description -- new job description to be analyzed
    vectorizer -- vectorizer for text processing
    model -- nmf model
    df -- df containing resume and job ids and topic weights
    """
    # find 100 most similar non current job descriptions
    df['cos_sim'] = similarity(description, vectorizer, model, df)
    df100 = df.loc[df.job_id!=0].sort_values(by=['cos_sim'], ascending=False)[:100]
    
    # dataframe containing 100 most similar non current jobs and their corresponding subsequent jobs
    idx_prev = df100.index.tolist()
    idx_current = [i-1 for i in df100.index]
    transitions = df[df.index.isin(idx_prev + idx_current)]
            
    # calculate topic weight difference between job i and job i-1 of same resume
    cols = [col for col in transitions.columns if 'Topic' in col]
    diff = transitions[cols].diff()
    diff = pd.concat([transitions[['resume_id', 'job_id', 'cos_sim']], diff], axis=1, sort=False)
    diff['same_resume'] = diff.resume_id.diff() # calculate diff beteen adjacent resume_id, should be 0 if from same resume
    diff = diff[diff.same_resume==0]
    diff[cols] = diff[cols].apply(lambda x: x*(-1)) # multiply by -1 to get topic change from recent to past job
    
    # find topic with largest difference as one moves from job i to job i-1,
    # compute percentage of people who have highest change in each of the 15 topics
    diff1 = diff[cols]
    diff1['max_weight_change'] = diff1.idxmax(axis=1)
    weight_change = diff1['max_weight_change'].value_counts(normalize=True).rename_axis('topic').reset_index(name='counts')
    weight_change['counts'] = round(weight_change['counts'] * 100)
    
    topic_dict = {'Topic 1':'data warehouse/governance', 'Topic 2':'software development',
                  'Topic 3':'database development', 'Topic 4':'network infrastructure',
                  'Topic 5':'data analytics and reporting', 'Topic 6':'product testing',
                  'Topic 7':'machine learning', 'Topic 8': 'customer support',
                  'Topic 9':'business solutions', 'Topic 10':'management/leadership',
                  'Topic 11': 'SQL server tools', 'Topic 12': 'marketing',
                  'Topic 13': 'academic research', 'Topic 14': 'enterprise data architecture',
                  'Topic 15': 'client relationship'}
    
    largest_change_list = []
    for index, row in weight_change.iterrows():
        largest_change = [row['counts'], topic_dict[row['topic']]]
        largest_change_list.append(largest_change)
        
    return largest_change_list

In [30]:
transitions(description, vectorizer, model, df)

[[12.0, 'management/leadership'],
 [11.0, 'academic research'],
 [11.0, 'machine learning'],
 [10.0, 'data analytics and reporting'],
 [9.0, 'marketing'],
 [7.0, 'customer support'],
 [7.0, 'data warehouse/governance'],
 [6.0, 'client relationship'],
 [6.0, 'software development'],
 [5.0, 'business solutions'],
 [4.0, 'database development'],
 [4.0, 'product testing'],
 [4.0, 'enterprise data architecture'],
 [3.0, 'network infrastructure'],
 [1.0, 'SQL server tools']]

In [35]:
def next_job_examples(description, vectorizer, model, df):
    """ give examples of next jobs and their top 3 topics
    
    description -- new job description to be analyzed
    vectorizer -- vectorizer for text processing
    model -- nmf model
    df -- df containing resume and job ids and topic weights
    """
    # out of jobs that are not current jobs, 6 jobs with highest cosine similarity
    df['cos_sim'] = similarity(description, vectorizer, model, df)
    df6 = df.loc[df.job_id!=0].sort_values(by=['cos_sim'], ascending=False)[:6]
    
    topic_dict = {'Topic 1':'data warehouse/governance', 'Topic 2':'software development',
                  'Topic 3':'database development', 'Topic 4':'network infrastructure',
                  'Topic 5':'data analytics and reporting', 'Topic 6':'product testing',
                  'Topic 7':'machine learning', 'Topic 8': 'customer support',
                  'Topic 9':'business solutions', 'Topic 10':'management/leadership',
                  'Topic 11': 'SQL server tools', 'Topic 12': 'marketing',
                  'Topic 13': 'academic research', 'Topic 14': 'enterprise data architecture',
                  'Topic 15': 'client relationship'}
    
    # for each of 6 most similar jobs, find their subsequent job and top 3 topics
    example_list = []
    for i in range(6):
        resume_id, job_id = df6.iloc[i, :][['resume_id', 'job_id']]
        next_job = df.loc[(df['resume_id']==resume_id)&(df['job_id']==job_id-1), 'job_title_processed'].values[0]
        highest_topic1 = df.loc[(df['resume_id']==resume_id)&(df['job_id']==job_id-1), 'highest_topic1'].values[0]
        highest_topic2 = df.loc[(df['resume_id']==resume_id)&(df['job_id']==job_id-1), 'highest_topic2'].values[0]
        highest_topic3 = df.loc[(df['resume_id']==resume_id)&(df['job_id']==job_id-1), 'highest_topic3'].values[0]
        example = [next_job, topic_dict[highest_topic1], topic_dict[highest_topic2], topic_dict[highest_topic3]]
        example_list.append(example)
    
    return example_list

In [36]:
next_job_examples(description, vectorizer, model, df)

[['independent data science consultant',
  'academic research',
  'management/leadership',
  'data warehouse/governance'],
 ['fibreline process engineer',
  'management/leadership',
  'business solutions',
  'software development'],
 ['data scientist',
  'data warehouse/governance',
  'client relationship',
  'data analytics and reporting'],
 ['software product engineer',
  'marketing',
  'client relationship',
  'customer support'],
 ['operation engineer', 'machine learning', 'business solutions', 'marketing'],
 ['project head',
  'management/leadership',
  'data analytics and reporting',
  'customer support']]