In [98]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [75]:
# Load the table from CSV. index_col=False asks pandas not to use any CSV
# column as the DataFrame index.
df = pd.read_csv('data/skill_cluster_df.csv', index_col=False)

In [76]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an earlier
# export). Rebinding `df` with errors='ignore' instead of inplace=True keeps the
# cell re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')
# Lower-case every name so differently-cased spellings collapse into one entry.
# The vectorised .str accessor replaces the per-row Python lambda; missing
# values stay missing instead of raising AttributeError.
df['name'] = df['name'].str.lower()

In [77]:
# Preview the first rows after the clean-up above.
df.head()

Unnamed: 0,id,roles,trend_id,name,kind
0,550246,['engineering'],81,iis,Software systems
1,550246,['engineering'],134814,c #sh%,Software systems
2,550246,['engineering'],41,matlab,Software systems
3,550246,['engineering'],323,jquery,Software systems
4,550246,['engineering'],35,javascript,Software systems


In [78]:
# Distinct values of the `name` column, in order of first appearance.
word_list = df['name'].unique()

In [79]:
# Number of whitespace-separated tokens in each distinct name.
word_len = [len(word.split()) for word in word_list]

In [80]:
# Largest token count found in any single name.
max(word_len)

28

In [81]:
# Tally how many rows of `df` carry each name.
word_dict_cnt = {}
for word in df['name']:
    # .get supplies 0 the first time a name is seen.
    word_dict_cnt[word] = word_dict_cnt.get(word, 0) + 1

In [82]:
# Ten most frequent names. Sorting ascending on the negated count yields the
# same descending order, and ties keep their original relative order.
sorted(word_dict_cnt.items(), key=lambda pair: -pair[1])[:10]

[('java', 14585),
 ('javascript', 10981),
 ('python', 10678),
 ('c++', 10323),
 ('css', 8400),
 ('computer science', 8366),
 ('mysql', 8130),
 ('linux', 7549),
 ('sql', 6881),
 ('android', 6389)]

In [83]:
# Map every distinct name to a column index, numbered 0..N-1 in sorted order.
# enumerate replaces zip(..., xrange(len(...))): it is the idiomatic form and
# runs on both Python 2 and Python 3 (xrange does not exist on Python 3).
word_dict_ord = {word: ind for ind, word in enumerate(sorted(word_list))}

In [84]:
# Number of distinct `name` values versus total number of rows in `df`.
len(word_list), len(df)

(52497, 457816)

In [85]:
# One row per id: all of that id's names concatenated into a single
# space-separated string.
df_txt = (
    df.groupby(['id'])['name']
      .apply(' '.join)
      .reset_index()
)

In [86]:
# Preview the per-id concatenated text.
df_txt.head()

Unnamed: 0,id,name
0,531968,postgres java jquery javascript ios linux tech...
1,531982,objective-c mysql computer science developer f...
2,532069,c++ ios stanford university catlin gabel schoo...
3,532082,php gni
4,532092,php mysql css python java jquery javascript co...


In [104]:
# Count and TF-IDF vectorizers sharing one fixed vocabulary (word_dict_ord), so
# both matrices use the same column for the same name. ngram_range=(1,10) lets
# names of up to ten tokens be matched.
# NOTE(review): max(word_len) is 28, so names longer than ten tokens can never
# be matched. With scikit-learn's default token pattern, names that contain
# punctuation or single-character tokens (e.g. 'c++') presumably cannot match
# either — confirm, and consider a custom tokenizer/analyzer.
cv = CountVectorizer(vocabulary=word_dict_ord, ngram_range=(1,10))
tf = TfidfVectorizer(vocabulary=word_dict_ord, ngram_range=(1,10))

In [105]:
# Build the document-term matrices from the per-id text: raw counts from `cv`
# and TF-IDF weights from `tf`.
count = cv.fit_transform(df_txt['name'])
tfidf = tf.fit_transform(df_txt['name'])

In [107]:
# Dimensions of the raw-count matrix.
count.shape

(20234, 52497)

In [108]:
# Dimensions of the TF-IDF matrix.
tfidf.shape

(20234, 52497)

In [109]:
# Fit two 10-component NMF models with identical settings and a fixed
# random_state: nmf1 on the raw-count matrix, nmf2 on the TF-IDF matrix.
# NOTE(review): recent scikit-learn releases no longer accept the `alpha`
# keyword (alpha_W / alpha_H replace it) — confirm against the installed version.
nmf1 = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(count)
nmf2 = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)

In [106]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every topic in a model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is read as one topic's per-feature weights.
    feature_names : sequence of str
        Indexed by feature position to look up the name behind a weight.
    n_top_words : int
        How many names to print per topic, largest weight first.

    Prints a ``Topic #<i>:`` header followed by one space-joined line of names
    for each topic, then a single blank line after the last topic.
    Returns None.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print(" ".join([feature_names[i] for i in top_indices]))
    # print("") rather than a bare print(): without
    # `from __future__ import print_function`, Python 2 renders print() as "()".
    print("")

In [114]:
# Show the ten top-weighted names per topic for nmf1 (fitted on raw counts).
# NOTE(review): newer scikit-learn exposes get_feature_names_out() in place of
# get_feature_names() — confirm against the installed version.
print("\nTopics in NMF model:")
count_feature_names = cv.get_feature_names()
print_top_words(nmf1, count_feature_names, 10)


Topics in NMF model:
Topic #0:
university university of university of california matlab python university of texas computer science java university of southern university of southern california
Topic #1:
engineer software engineer senior software senior software engineer software development engineer software systems development engineer principal consultant
Topic #2:
sql sql server xml asp java javascript analyst microsoft sql server mysql jsp
Topic #3:
institute institute of institute of technology indian institute indian institute of indian institute of technology matlab java computer science python
Topic #4:
developer software developer web developer jquery javascript php web applications ios developer intern team
Topic #5:
assistant research assistant teaching assistant department matlab python computer science department of java systems
Topic #6:
windows linux system unix systems os ubuntu mac os java operating system
Topic #7:
university state state university java matlab syste

In [111]:
# Show the ten top-weighted names per topic for nmf2 (fitted on TF-IDF).
# NOTE(review): newer scikit-learn exposes get_feature_names_out() in place of
# get_feature_names() — confirm against the installed version.
print("\nTopics in NMF model:")
tfidf_feature_names = tf.get_feature_names()
print_top_words(nmf2, tfidf_feature_names, 10)


Topics in NMF model:
Topic #0:
javascript jquery css mysql python php java sql postgresql xml
Topic #1:
engineer software engineer senior software senior software engineer engineer software software development development engineer principal software development engineer engineer intern
Topic #2:
institute of institute institute of technology indian institute indian institute of indian institute of technology georgia institute georgia institute of georgia institute of technology national institute
Topic #3:
assistant research assistant teaching assistant university department matlab department of assistant computer science computer science graduate teaching assistant
Topic #4:
ruby rails ruby on rails postgresql jquery css javascript sql web applications ec2
Topic #5:
state state university jose state university san jose state san jose state university university state university of new state university of new york university of new york web services
Topic #6:
university of university