In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
df = pd.read_csv('data/skill_cluster_df.csv', index_col=False)

In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df['name'] = df['name'].apply(lambda x: x.lower())

In [4]:
df.head()

Unnamed: 0,id,roles,trend_id,name,kind
0,550246,['engineering'],81,iis,Software systems
1,550246,['engineering'],134814,c #sh%,Software systems
2,550246,['engineering'],41,matlab,Software systems
3,550246,['engineering'],323,jquery,Software systems
4,550246,['engineering'],35,javascript,Software systems


In [5]:
word_list = df['name'].unique()

In [6]:
word_list[:10]

array(['iis', 'c #sh%', 'matlab', 'jquery', 'javascript', 'php', 'python',
       'unix', 'assistant', 'java'], dtype=object)

In [7]:
word_len = []
for word in word_list:
    word_len.append(len(word.split()))

In [8]:
max(word_len)

28

In [9]:
word_dict_cnt = {}
for word in df['name']:
    if word in word_dict_cnt:
        word_dict_cnt[word] = word_dict_cnt[word] + 1
    else:
        word_dict_cnt[word] = 1

In [10]:
# sorted(word_dict_cnt.items(), key=lambda x: x[1], reverse=True)[:10]

In [11]:
word_dict_ord = {}
for word, ind in zip(sorted(word_list), xrange(len(word_list))):
    word_dict_ord[word] = ind

In [12]:
word_dict_nord = {}
for word, ind in zip(word_list, xrange(len(word_list))):
    word_dict_nord[word] = ind

In [26]:
{k: word_dict_nord[k] for k in word_dict_nord.keys()[:10]}

{'collegearizona state university': 31154,
 'construction intern': 15529,
 'electronics and communications': 15265,
 'financial services assurance senior associate': 7213,
 'outdoors club': 27829,
 'product management club': 38574,
 'programmer analyst/': 8466,
 'software developer intern toronto': 20570,
 'ui automation tester': 26243,
 'world academy platform': 2236}

In [27]:
{k: word_dict_ord[k] for k in word_dict_ord.keys()[:10]}

{'collegearizona state university': 8941,
 'construction intern': 10027,
 'coo and marketing d epartment': 10296,
 'electronics and communications': 14921,
 'financial services assurance senior associate': 16917,
 'product management club': 35146,
 'programmer analyst/': 35443,
 'software developer intern toronto': 41435,
 'technology university of changsha': 45784,
 'ui automation tester': 47400}

In [15]:
df_txt = df.groupby(['id'])['name'].apply(lambda x: ' '.join(x)).reset_index()

In [16]:
df_txt.head()

Unnamed: 0,id,name
0,531968,postgres java jquery javascript ios linux tech...
1,531982,objective-c mysql computer science developer f...
2,532069,c++ ios stanford university catlin gabel schoo...
3,532082,php gni
4,532092,php mysql css python java jquery javascript co...


In [17]:
cv = CountVectorizer(vocabulary=word_dict_ord, ngram_range=(1,10))
tf = TfidfVectorizer(vocabulary=word_dict_ord, ngram_range=(1,10))

In [18]:
cvn = CountVectorizer(vocabulary=word_dict_nord, ngram_range=(1,10))
tfn = TfidfVectorizer(vocabulary=word_dict_nord, ngram_range=(1,10))

In [21]:
count = cv.fit_transform(df_txt['name'])
tfidf = tf.fit_transform(df_txt['name'])

In [22]:
countn = cvn.fit_transform(df_txt['name'])
tfidfn = tfn.fit_transform(df_txt['name'])

In [41]:
cv.vocabulary

{'coo and marketing d epartment': 10296,
 'programmer analyst/': 35443,
 'technology university of changsha': 45784,
 'financial services assurance senior associate': 16917,
 'ui automation tester': 47400,
 'software developer intern toronto': 41435,
 'electronics and communications': 14921,
 'collegearizona state university': 8941,
 'construction intern': 10027,
 'product management club': 35146,
 'ballet folklorico de stanford': 4124,
 'software engineering engineering of concurrent and distributed': 41749,
 'spinal surgery': 42334,
 'assistant chief of police': 2892,
 'world academy platform': 51956,
 'development managers': 12966,
 'black hole': 5013,
 'technical organizations': 45417,
 'ngospolytechnic institute of nyu': 31227,
 'information technology analyst': 22075,
 'sigma phi epsilon fraternity': 40747,
 'rhodes university': 37513,
 'procurement. development': 35070,
 'lecturer of statistics and statistical consultant': 25993,
 'college of natural science,ut austin': 8881,
 '

In [51]:
cvn.vocabulary

{'outdoors club': 27829,
 'programmer analyst/': 8466,
 'world academy platform': 2236,
 'financial services assurance senior associate': 7213,
 'ui automation tester': 26243,
 'software developer intern toronto': 20570,
 'electronics and communications': 15265,
 'collegearizona state university': 31154,
 'construction intern': 15529,
 'product management club': 38574,
 'ballet folklorico de stanford': 4834,
 'software engineering engineering of concurrent and distributed': 34821,
 'spinal surgery': 21101,
 'assistant chief of police': 11350,
 'technology university of changsha': 42086,
 'preparatory institute for engineering studies of nabeul': 49274,
 'development managers': 5097,
 'lead software engineer front': 39685,
 'freeman spogli institute rural education action project': 21778,
 'technical organizations': 41578,
 'ngospolytechnic institute of nyu': 33999,
 'information technology analyst': 11184,
 'application engineer & development': 22195,
 'cmrit': 47956,
 'procurement. de

In [29]:
count.shape, countn.shape

((20234, 52497), (20234, 52497))

In [32]:
nmf1 = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(count)
nmf2 = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(countn)

In [36]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [37]:
print("\nTopics in NMF model:")
count_feature_names = cv.get_feature_names()
print_top_words(nmf1, count_feature_names, 10)


Topics in NMF model:
Topic #0:
university, university of, university of california, matlab, python, university of texas, computer science, java, university of southern, university of southern california
Topic #1:
engineer, software engineer, senior software, senior software engineer, software development, engineer software, systems, development engineer, principal, consultant
Topic #2:
sql, sql server, xml, asp, java, javascript, analyst, microsoft sql server, mysql, jsp
Topic #3:
institute, institute of, institute of technology, indian institute, indian institute of, indian institute of technology, matlab, java, computer science, python
Topic #4:
developer, software developer, web developer, jquery, javascript, php, web applications, ios, developer intern, team
Topic #5:
assistant, research assistant, teaching assistant, department, matlab, python, computer science, department of, java, systems
Topic #6:
windows, linux, system, unix, systems, os, ubuntu, mac os, java, operating syste

In [53]:
print("\nTopics in NMF model:")
count_feature_names = cvn.get_feature_names()
print_top_words(nmf2, count_feature_names, 10)


Topics in NMF model:
Topic #0:
university, university of, university of california, matlab, python, university of texas, computer science, java, university of southern, university of southern california
Topic #1:
engineer, software engineer, senior software, senior software engineer, software development, engineer software, systems, development engineer, principal, consultant
Topic #2:
sql, sql server, xml, asp, java, javascript, analyst, microsoft sql server, mysql, jsp
Topic #3:
institute, institute of, institute of technology, indian institute, indian institute of, indian institute of technology, matlab, java, computer science, python
Topic #4:
developer, software developer, web developer, jquery, javascript, php, web applications, ios, developer intern, team
Topic #5:
assistant, research assistant, teaching assistant, department, matlab, python, computer science, department of, java, systems
Topic #6:
windows, linux, system, unix, systems, os, ubuntu, mac os, java, operating syste

In [50]:
nmf1.components_.

TypeError: 'buffer' object is not callable