In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Load the skill table from a path relative to the notebook's working directory.
# index_col=False asks pandas not to promote the first column to the index.
df = pd.read_csv('data/skill_cluster_df.csv', index_col=False)

In [3]:
# Remove the leftover 'Unnamed: 0' column. Re-binding `df` with
# errors='ignore' (rather than an in-place drop) keeps this cell re-runnable:
# a second execution no longer fails because the column is already gone.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')
# Normalise case with the vectorised string accessor instead of a per-element
# lambda. Note: missing values pass through as NaN rather than raising.
df['name'] = df['name'].str.lower()

In [4]:
# Quick look at the first rows of the frame.
df.head()

Unnamed: 0,id,roles,trend_id,name,kind
0,550246,['engineering'],81,iis,Software systems
1,550246,['engineering'],134814,c #sh%,Software systems
2,550246,['engineering'],41,matlab,Software systems
3,550246,['engineering'],323,jquery,Software systems
4,550246,['engineering'],35,javascript,Software systems


In [5]:
# Distinct values of the 'name' column; reused below for length statistics
# and to build the ordinal vocabulary mapping.
word_list = df['name'].unique()

In [6]:
# Peek at the first ten distinct names.
word_list[:10]

array(['iis', 'c #sh%', 'matlab', 'jquery', 'javascript', 'php', 'python',
       'unix', 'assistant', 'java'], dtype=object)

In [7]:
# Number of whitespace-separated tokens in each distinct name.
word_len = [len(word.split()) for word in word_list]

In [8]:
# Largest token count found among the distinct names.
max(word_len)

28

In [9]:
# Tally how many rows carry each name. This walks the full column, so
# repeated names are counted once per occurrence.
word_dict_cnt = {}
for word in df['name']:
    word_dict_cnt[word] = word_dict_cnt.get(word, 0) + 1

In [10]:
# Disabled inspection: when uncommented, lists the ten (name, count) pairs
# from word_dict_cnt with the highest counts.
# sorted(word_dict_cnt.items(), key=lambda x: x[1], reverse=True)[:10]

In [12]:
# Map every distinct name to its position in sorted order; this mapping is
# later handed to both vectorizers as their fixed vocabulary.
# enumerate() replaces zip(..., xrange(...)): same pairs, and it also runs on
# Python 3 where xrange does not exist.
word_dict_ord = {word: ind for ind, word in enumerate(sorted(word_list))}

In [13]:
# Show ten arbitrary entries of the mapping. list(d) yields the keys on both
# Python 2 and 3, whereas d.keys()[:10] raises TypeError on Python 3 because
# a keys view cannot be sliced.
{k: word_dict_ord[k] for k in list(word_dict_ord)[:10]}

{'collegearizona state university': 8941,
 'construction intern': 10027,
 'coo and marketing d epartment': 10296,
 'electronics and communications': 14921,
 'financial services assurance senior associate': 16917,
 'product management club': 35146,
 'programmer analyst/': 35443,
 'software developer intern toronto': 41435,
 'technology university of changsha': 45784,
 'ui automation tester': 47400}

In [14]:
# Build one text document per id by joining all of that id's names with
# single spaces, then turn the grouped result back into a two-column frame.
names_by_id = df.groupby('id')['name']
df_txt = names_by_id.apply(' '.join).reset_index()

In [15]:
# Preview the per-id concatenated text.
df_txt.head()

Unnamed: 0,id,name
0,531968,postgres java jquery javascript ios linux tech...
1,531982,objective-c mysql computer science developer f...
2,532069,c++ ios stanford university catlin gabel schoo...
3,532082,php gni
4,532092,php mysql css python java jquery javascript co...


In [16]:
# Both vectorizers share one configuration: the fixed name -> index vocabulary
# and word n-grams of length 1 through 10.
# NOTE(review): the longest name has 28 tokens (see max(word_len)), so names
# longer than 10 tokens can never be matched with this n-gram cap.
# NOTE(review): presumably the default tokenizer drops punctuation and
# one-character tokens, so entries such as 'c++' or 'c #sh%' would never be
# counted; confirm against the scikit-learn token_pattern default.
vectorizer_kwargs = dict(vocabulary=word_dict_ord, ngram_range=(1, 10))
cv = CountVectorizer(**vectorizer_kwargs)
tf = TfidfVectorizer(**vectorizer_kwargs)

In [17]:
# Vectorise the same per-id documents twice: raw term counts and tf-idf weights.
documents = df_txt['name']
count = cv.fit_transform(documents)
tfidf = tf.fit_transform(documents)

In [19]:
# Compare the dimensions of the two document-term matrices.
count.shape, tfidf.shape

((20234, 52497), (20234, 52497))

In [28]:
# Fit two 15-component NMF models with identical hyper-parameters, one on the
# count matrix and one on the tf-idf matrix.
# NOTE(review): newer scikit-learn releases reportedly replace `alpha` with
# `alpha_W` / `alpha_H`; confirm against the installed version.
nmf_params = dict(n_components=15, random_state=1, alpha=.1, l1_ratio=.5)
nmf_cv = NMF(**nmf_params).fit(count)
nmf_tf = NMF(**nmf_params).fit(tfidf)

In [34]:
# Fit two 15-topic LDA models with identical settings (online learning,
# 5 iterations, fixed seed), one per document-term matrix.
# NOTE(review): `n_topics` was renamed to `n_components` in later scikit-learn
# releases; confirm which spelling the installed version accepts.
lda_params = dict(n_topics=15, max_iter=5,
                  learning_method='online', learning_offset=50.,
                  random_state=0)
lda_cv = LatentDirichletAllocation(**lda_params).fit(count)
lda_tf = LatentDirichletAllocation(**lda_params).fit(tfidf)

In [29]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row must support ``argsort()`` (e.g. a
        NumPy array of per-feature weights).
    feature_names : sequence of str
        Label for each column index of ``model.components_``.
    n_top_words : int
        How many features to list per topic.

    Returns
    -------
    None
        Output is written to stdout: a ``Topic #k:`` header, the
        comma-separated feature names in descending weight order, then two
        blank lines.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so walk it backwards from the end to get
        # the n_top_words largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(", ".join(feature_names[i] for i in top_indices))
        # Parenthesised call form emits the same text as the former
        # `print "\n"` statement on Python 2 and is also valid on Python 3.
        print("\n")

In [31]:
# Report the ten strongest terms of each NMF topic learned from raw counts.
count_feature_names = cv.get_feature_names()
print("\nTopics in NMF model using count vectorizer:")
print_top_words(model=nmf_cv, feature_names=count_feature_names, n_top_words=10)


Topics in NMF model using count vectorizer:
Topic #0:
university, matlab, python, java, stanford, mellon, mellon university, carnegie mellon university, department, stanford university


Topic #1:
engineer, software engineer, senior software, senior software engineer, software development, engineer software, systems, development engineer, principal, consultant


Topic #2:
developer, software developer, web developer, ios, web applications, developer intern, team, jquery, lead developer, php


Topic #3:
ruby, rails, ruby on rails, javascript, css, jquery, postgresql, python, ios, founder


Topic #4:
assistant, research assistant, teaching assistant, department, matlab, computer science, python, department of, java, ieee


Topic #5:
sql, sql server, asp, xml, analyst, microsoft sql server, java, ms sql server, web services, data mining


Topic #6:
institute, institute of, institute of technology, indian institute, indian institute of, indian institute of technology, matlab, python, comp

In [35]:
# Report the ten strongest terms of each NMF topic learned from tf-idf weights.
tfidf_feature_names = tf.get_feature_names()
print("\nTopics in NMF model using tfidf:")
print_top_words(model=nmf_tf, feature_names=tfidf_feature_names, n_top_words=10)


Topics in NMF model using tfidf:
Topic #0:
jquery, css, javascript, mysql, php, postgresql, sql, photoshop, xml, jsp


Topic #1:
engineer, software engineer, senior software, senior software engineer, engineer software, software development, development engineer, principal, software development engineer, engineer intern


Topic #2:
ruby, rails, ruby on rails, postgresql, javascript, sql, css, jquery, founder, web applications


Topic #3:
university of, university, intern, university of california, software engineering, engineering intern, computer science, software engineering intern, data structures, engineer intern


Topic #4:
developer, software developer, web developer, developer intern, software developer intern, web applications, ios, php, lead developer, intern


Topic #5:
sql server, sql, asp, microsoft sql server, xml, ms sql server, analyst, university, web services, jsp


Topic #6:
institute of, institute, institute of technology, indian institute, indian institute of, indi

In [36]:
# Report the ten strongest terms of each LDA topic learned from raw counts.
cv_feature_names = cv.get_feature_names()
print("\nTopics in LDA model using count vectorizer:")
print_top_words(model=lda_cv, feature_names=cv_feature_names, n_top_words=10)


Topics in LDA model using count vectorizer:
Topic #0:
university of hawaii, epa, integrated system, clark university, domain controller, national competition, constant contact, service management, foreman, central michigan university


Topic #1:
public school, public, university of management, consultant developer, mca, master of computer applications, major league, independent consultant, maharishi university, maharishi university of


Topic #2:
harvard, school, high, harvard university, high school, college, national merit, corps, network engineer, ocaml


Topic #3:
indian institute, indian institute of, indian institute of technology, cornell, cornell university, fudan, pennsylvania state, pennsylvania state university, fudan university, state college


Topic #4:
berkeley, university of california, uc, alpha, softwaredeveloper, illinois institute, illinois institute of technology, intern university, engineering intern university of california, university of california berkeley


To

In [37]:
# Report the ten strongest terms of each LDA topic learned from tf-idf weights.
# The heading previously said "count vectorizer" (copied from the cell that
# reports lda_cv), which mislabelled this output; it now names the tf-idf input.
print("\nTopics in LDA model using tfidf:")
tf_feature_names = tf.get_feature_names()
print_top_words(lda_tf, tf_feature_names, 10)


Topics in LDA model using count vectorizer:
Topic #0:
harvard, university of electronic science, university of electronic, national olympiad, college of computer and information science, harvard university, university of electronic science and technology, electronic science and technology of china, cal poly, university of electronic science and technology of china


Topic #1:
vit university, bilkent university, american express, minnesota state university, technical design authority, sous chef, information engineering university, dca, senior systems architect, brain sciences


Topic #2:
colgate university, international science, university of adelaide, indian student organization, security agency, international academy, ms office word, pi honor society, car buyers, national security agency


Topic #3:
servlets, artificial neural network, crude oil, mutual fund, university of quebec, mma, peace corps, unix os, bank customers, regional contest


Topic #4:
senior statistical scientist, s