#### Transition matrix computation
To look at how topics (i.e., the skills involved in data jobs) change as people transition from one job (job i) to the subsequent job (job i+1), I model these changes as a Markov process. In this notebook, I compute a transition matrix, which contains the probability of transitioning from each of the 20 topics at job i to each of the 20 topics at job i+1 (Pandas is really great for doing this!).

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import os
import numpy as np
import pandas as pd
import re

In [2]:
# Load the pickled training work-experience data.
# The context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle can execute arbitrary code while loading - only open files from a trusted source.
with open('work_exp_train.pkl', 'rb') as pkl_file:
    df_work_train = pickle.load(pkl_file)

In [3]:
# Keep only the columns the transition matrix needs, renaming the top-weighted
# topic of each job to dominant_topic_job_i (topic with highest weight for job i).
topics_train = (
    df_work_train[['resume_id', 'job_id', 'highest_topic1']]
    .rename(columns={'highest_topic1': 'dominant_topic_job_i'})
)
topics_train.head()

Unnamed: 0,resume_id,job_id,dominant_topic_job_i
0,1,0,Topic 1
1,1,1,Topic 5
2,2,0,Topic 5
3,2,1,Topic 7
4,3,0,Topic 3


In [4]:
# Turn labels such as 'Topic 5' into the number 5.
# The vectorised .str accessor passes missing values through as NaN, whereas
# calling re.sub on each element raises TypeError on anything that is not a string.
# The dtype guard makes the cell safe to re-run once the column is already numeric.
topic_labels = topics_train['dominant_topic_job_i']
if not pd.api.types.is_numeric_dtype(topic_labels):
    topic_labels = topic_labels.str.replace('Topic ', '')
topics_train['dominant_topic_job_i'] = pd.to_numeric(topic_labels)

In [5]:
# Look up the dominant topic of job i+1 (the subsequent job) for every job i.
# job_id 0 is the most recent job, so within one resume the row just above a job is job i+1.
# Shifting per resume_id keeps a resume from inheriting the last topic of the previous
# resume when its first row is not job_id 0.
# NOTE(review): assumes each resume's rows are ordered by ascending job_id - confirm.
topics_train['dominant_topic_job_i+1'] = (
    topics_train.groupby('resume_id')['dominant_topic_job_i'].shift(1)
)
# dominant_topic_job_i+1 is nan when job_id=0 (most recent job) - no data on job i+1
topics_train.loc[topics_train['job_id'] == 0, 'dominant_topic_job_i+1'] = np.nan
topics_train.head()

Unnamed: 0,resume_id,job_id,dominant_topic_job_i,dominant_topic_job_i+1
0,1,0,1,
1,1,1,5,1.0
2,2,0,5,
3,2,1,7,5.0
4,3,0,3,


In [6]:
# Compute the transition matrix: entry (a, b) is the probability that job i+1 has
# dominant topic b, given that job i has dominant topic a.
# crosstab counts every observed (job i, job i+1) topic pair, skipping rows where
# job i+1 is missing, and normalize='index' divides each row by its total so it sums to 1.
# Pairs that never occur get probability 0 rather than the NaN a pivot would leave.
# NOTE(review): a topic that never appears as job i (or as job i+1) still has no
# row (or column) - check the shape before relying on positional indexing downstream.
transition_mat = pd.crosstab(
    topics_train['dominant_topic_job_i'],
    topics_train['dominant_topic_job_i+1'],
    normalize='index',
)
transition_mat

dominant_topic_job_i+1,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0
dominant_topic_job_i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.30102,0.075255,0.020408,0.002551,0.048469,0.022959,0.038265,0.020408,0.061224,0.05102,0.022959,0.052296,0.020408,0.033163,0.035714,0.043367,0.026786,0.058673,0.040816,0.024235
2,0.055286,0.343356,0.016489,0.026188,0.045587,0.020369,0.018429,0.076625,0.030068,0.054316,0.030068,0.016489,0.045587,0.007759,0.069835,0.030068,0.034918,0.009699,0.012609,0.056256
3,0.041118,0.029605,0.274671,0.014803,0.018092,0.055921,0.072368,0.016447,0.036184,0.042763,0.039474,0.049342,0.039474,0.027961,0.024671,0.024671,0.034539,0.034539,0.080592,0.042763
4,0.0134,0.041876,0.018425,0.510888,0.005025,0.020101,0.021776,0.050251,0.01005,0.053601,0.048576,0.050251,0.033501,0.0134,0.033501,0.01005,0.020101,0.005025,0.020101,0.020101
5,0.075342,0.065068,0.020548,0.003425,0.251712,0.020548,0.063356,0.054795,0.037671,0.056507,0.027397,0.015411,0.030822,0.018836,0.070205,0.039384,0.070205,0.010274,0.017123,0.05137
6,0.062657,0.06015,0.047619,0.0401,0.025063,0.310777,0.0401,0.032581,0.050125,0.035088,0.052632,0.02005,0.035088,0.035088,0.042607,0.022556,0.025063,0.025063,0.02005,0.017544
7,0.056723,0.035714,0.048319,0.010504,0.092437,0.016807,0.405462,0.018908,0.018908,0.018908,0.023109,0.016807,0.046218,0.042017,0.031513,0.018908,0.067227,0.002101,0.012605,0.016807
8,0.047109,0.184154,0.014989,0.06424,0.057816,0.017131,0.042827,0.14561,0.034261,0.049251,0.044968,0.03212,0.042827,0.023555,0.074946,0.036403,0.042827,0.004283,0.008565,0.03212
9,0.109015,0.060797,0.048218,0.008386,0.037736,0.027254,0.027254,0.025157,0.245283,0.069182,0.056604,0.090147,0.014675,0.014675,0.041929,0.035639,0.012579,0.020964,0.027254,0.027254
10,0.078712,0.093023,0.030411,0.039356,0.035778,0.028623,0.033989,0.039356,0.067979,0.214669,0.051878,0.057245,0.028623,0.025045,0.044723,0.0161,0.021467,0.0161,0.026834,0.050089


In [7]:
# Convert the transition table to a plain NumPy array and pickle it.
# DataFrame.as_matrix() was removed in pandas 1.0; np.asarray gives the same array
# on every pandas version and leaves an existing ndarray untouched, so re-running
# this cell no longer fails.
transition_mat = np.asarray(transition_mat)
with open('transition_mat.pkl', 'wb') as f:
    pickle.dump(transition_mat, f)