In [1]:
import pandas as pd,numpy as np

In [2]:
import os

from sqlalchemy import create_engine

# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook.
# Expected form: postgresql://USER:PASSWORD@HOST:PORT/DBNAME
db_url = os.environ.get('PIPECANDY_DB_URL')
if not db_url:
    # Fail early with an actionable message rather than a confusing driver error.
    raise RuntimeError("Set the PIPECANDY_DB_URL environment variable "
                       "(postgresql://USER:PASSWORD@HOST:PORT/DBNAME) before running this notebook.")
engine = create_engine(db_url, echo=False)

In [3]:
# Load company rows for the five industries of interest. The inner query caps
# the result at 10000 rows; the outer query then shuffles those rows.
df = pd.read_sql_query(
    sql=(
        "select * from ("
        "SELECT * FROM linkedin_company_base where industry in ('Marketing and Advertising' "
        ",'Construction','Financial Services','Education Management','Hospital & Health Care') limit 10000"
        ")a order by random()"
    ),
    con=engine,
)

In [4]:
# Number of loaded rows per industry value.
industry_column = df['industry']
industry_column.value_counts()

Marketing and Advertising    3738
Construction                 2190
Financial Services           2123
Hospital & Health Care       1393
Education Management          556
Name: industry, dtype: int64

In [5]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.corpus import stopwords

In [8]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    model         -- object exposing ``components_``; each row is iterated as
                     one topic and must support ``argsort()``.
    feature_names -- sequence indexed by the positions returned by ``argsort()``.
    n_top_words   -- how many of the largest-weight words to print per topic.

    For each topic a "Topic #<index>:" header is printed, followed by one line
    with the selected words joined by spaces. A final bare ``print()`` is
    issued after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it and keep the leading entries.
        best = weights.argsort()[::-1][:n_top_words]
        heading = "Topic #%d:" % topic_no
        print(heading)
        print(" ".join(feature_names[j] for j in best))
    print()

In [7]:
# Bag-of-words counts over the company descriptions (missing descriptions are
# treated as empty strings). Terms are filtered with min_df=20 and max_df=0.9.
# NOTE(review): stopwords.words() is called without a language argument, which
# presumably pulls in the stop words of every language NLTK ships - confirm
# that this is intended rather than stopwords.words('english').
vectorizer = CountVectorizer(
    stop_words=stopwords.words(),
    min_df=20,
    max_df=0.9,
)
descriptions = df['description'].fillna('')
X = vectorizer.fit_transform(descriptions)
X.shape

(10000, 3331)

In [11]:
# Fit a 40-topic online LDA on the count matrix and print 20 words per topic.
# NOTE(review): newer scikit-learn releases reportedly rename `n_topics` to
# `n_components` and `get_feature_names` to `get_feature_names_out` - confirm
# against the installed version before upgrading.
n_topics = 40
n_top_words = 20
lda = LatentDirichletAllocation(
    n_topics=n_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,  # fixed seed for this fit
)
lda.fit(X)
tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
education training students school community members programs program schools association educational provide support services professional development organization public mission provides
Topic #1:
many training rewards materials within global featured senior loyalty days developer sales customer users year help georgia companies new patients
Topic #2:
platform market trading global investors investment world international markets exchange access first investments india real bitcoin investor research funds network
Topic #3:
equipment fleet sale signs champion test rental vehicle leasing automotive plastic vehicles hills hire absolute middle parts sponsors source nationwide
Topic #4:
risk construction largest one focus brokerage positive selection based managers estate executive best maintain owned experienced llc bottom providing staff
Topic #5:
marketing media digital social mobile agency online advertising content brands brand campaigns technology business solutions based 

### Using gensim, try HDP-LDA (so that the number of topics is inferred automatically)

In [12]:
import gensim

Couldn't import dot_parser, loading of dot files will not be possible.


In [29]:
# Tokenise each description: lower-case, split on whitespace, and drop stop
# words and bare '.' tokens. Missing descriptions become empty token lists.
stoplist = stopwords.words()
# `in` on a list scans every entry for every token of every document; a
# frozenset gives constant-time membership with the same result.
_stopset = frozenset(stoplist)
# str.split() with no arguments never yields tokens with surrounding
# whitespace, so no extra strip() is needed on each word.
texts = [[word for word in document.lower().split()
          if word not in _stopset and word != '.']
         for document in df['description'].fillna('')]

In [30]:
from collections import defaultdict

# Count how often each token occurs across all documents ...
frequency = defaultdict(int)
for token in (tok for doc in texts for tok in doc):
    frequency[token] += 1

# ... and keep only tokens that occur more than 20 times overall.
texts = [
    [token for token in text if not frequency[token] <= 20]
    for text in texts
]

In [31]:
# Build the gensim token dictionary from the filtered documents, then convert
# every document with doc2bow.
dictionary = gensim.corpora.Dictionary(documents=texts)
corpus = list(map(dictionary.doc2bow, texts))

In [32]:
# gensim LDA with 20 topics: a single pass over the corpus, processed in
# chunks of 1000 documents with update_every=1.
# Note: this rebinds `lda`, which previously held the scikit-learn model.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    passes=1,
    chunksize=1000,
    update_every=1,
)

In [33]:
# Display the word weights of up to 20 topics of the gensim LDA model held in `lda`.
lda.print_topics(20)

[(0,
  u'0.059*marketing + 0.030*media + 0.029*digital + 0.015*social + 0.014*agency + 0.014*advertising + 0.013*brand + 0.012*web + 0.012*creative + 0.011*marketing,'),
 (1,
  u'0.035*construction + 0.020*project + 0.014*building + 0.014*projects + 0.011*services + 0.011*general + 0.010*work + 0.009*design + 0.009*quality + 0.009*commercial'),
 (2,
  u'0.029*service + 0.021*quality + 0.016*residential + 0.016*roofing + 0.015*commercial + 0.014*& + 0.013*company + 0.013*customer + 0.011*provide + 0.011*services'),
 (3,
  u"0.026*business + 0.012*& + 0.012*get + 0.009*web + 0.009*we're + 0.009*design + 0.008*us + 0.008*website + 0.007*need + 0.006*one"),
 (4,
  u'0.025*marketing + 0.016*& + 0.012*digital + 0.011*\u2022 + 0.009*data + 0.009*/ + 0.009*business + 0.007*technology + 0.007*management + 0.007*sales'),
 (5,
  u'0.009*industry + 0.008*business + 0.008*new + 0.007*one + 0.006*years + 0.005*companies + 0.005*company + 0.005*team + 0.005*experience + 0.004*service'),
 (6,
  u'0.10

In [35]:
# HDP-LDA: fit a hierarchical Dirichlet process model on the same corpus and dictionary.
hdp = gensim.models.HdpModel(corpus=corpus, id2word=dictionary)

In [37]:
# Print 10 words per topic; num_topics=-1 presumably requests every topic.
# The number of topics is 150 by default, so this output does not help in
# choosing how many topics the data actually supports.
hdp.print_topics(num_topics=-1, num_words=10)

[u'topic 0: 0.010*marketing + 0.008*business + 0.008*services + 0.007*- + 0.006*clients + 0.006*& + 0.006*financial + 0.005*provide + 0.004*company + 0.004*help',
 u'topic 1: 0.013*marketing + 0.013*- + 0.008*\u2022 + 0.008*& + 0.007*business + 0.006*services + 0.005*digital + 0.005*media + 0.005*social + 0.004*service',
 u'topic 2: 0.013*company + 0.012*united + 0.011*states. + 0.010*located + 0.008*construction + 0.005*care + 0.005*health + 0.004*hospital + 0.004*services + 0.003*marketing',
 u'topic 3: 0.003*marketing + 0.003*services + 0.003*company + 0.002*monitoring + 0.002*quality + 0.002*financial + 0.002*united + 0.002*- + 0.002*clients + 0.002*business',
 u'topic 4: 0.002*instant + 0.002*company + 0.002*marketing + 0.002*indoor + 0.002*offices + 0.002*& + 0.001*benefits + 0.001*best + 0.001*services + 0.001*essential',
 u'topic 5: 0.004*marketing + 0.004*- + 0.003*services + 0.003*company + 0.002*\u2022 + 0.002*culture + 0.002*colleges + 0.002*act + 0.002*& + 0.002*digital',
