In [1]:
import pandas as pd,numpy as np
from sqlalchemy import create_engine

In [2]:
import pickle
# Load the company DataFrame from its pickle.
# Pickle payloads are binary, so the file is opened in 'rb' mode: text mode
# ('r') breaks on Python 3 and can corrupt the stream on Windows.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('saas_companies_data.pkl', 'rb') as f:
    df = pickle.load(f)

In [3]:
import nltk,re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from text_processing import extract_phrases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [4]:
# Chunk grammar handed to nltk.RegexpParser below. Reading the patterns:
#   NP1 - one adjective tag (JJ) followed by one or more noun tags (NN*)
#   NP2 - one or more noun tags followed by one adjective tag
#   NP3 - one or more noun tags
#   VP  - one verb tag (VB*) followed by one or more noun tags
# The '#' remarks inside the raw string are part of the string literal itself.
grammar = r"""
  NP1: {<JJ><NN.*>+}          # Chunk sequences of JJ, NN
  NP2: {<NN.*>+<JJ>}          # Chunk sequences of JJ, NN
  NP3: {<NN.*>+}                  #Noun phrases
  VP: {<VB.*><NN.*>+} # Chunk verbs and their arguments
  """
# Every chunk label declared in the grammar. Not referenced by the visible
# cells: tokenizer() passes its own hard-coded label list (without 'NP3').
phr_list = ['NP1','NP2','NP3','VP']
# POS tags (noun and verb variants) whose words tokenizer() keeps as
# single-word tokens.
tag_list = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
# stopwords.words() is called without a language argument.
# NOTE(review): presumably this returns the stop words of every language in
# the NLTK corpus - confirm. A few URL fragments are appended to the list.
stop_words = stopwords.words()+['http','https','goo']
# Chunk parser built from the grammar above.
cp = nltk.RegexpParser(grammar)
# Project-local phrase extractor; its behaviour is not visible in this file.
pe = extract_phrases.PhraseExtractor()
snowball_stemmer = SnowballStemmer('english')
# Matches any character that is neither an ASCII letter nor a space.
# NOTE(review): re.IGNORECASE looks redundant because the class already lists
# both a-z and A-Z.
reg_exp = re.compile('[^a-zA-Z ]',re.IGNORECASE)
def tokenizer(text):
    """Split ``text`` into stemmed word tokens followed by phrase tokens.

    Word tokens: every POS-tagged token whose tag is in ``tag_list`` is
    stemmed, dropped if the stem is in ``stop_words``, and then stripped of
    every character matched by ``reg_exp``.

    Phrase tokens: the chunk tree produced by ``cp`` is passed to
    ``pe.extract_phrase_treeinput`` with the labels ``NP1``, ``NP2`` and
    ``VP``; each returned phrase is re-tokenized, stemmed word by word and
    joined with underscores.
    NOTE(review): presumably the extractor returns plain phrase strings -
    confirm against ``text_processing.extract_phrases``.

    Returns a list: word tokens first, phrase tokens after.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    chunk_tree = cp.parse(tagged_tokens)
    raw_phrases = pe.extract_phrase_treeinput(chunk_tree, ['NP1','NP2','VP'])

    word_tokens = []
    for token, tag in tagged_tokens:
        if tag not in tag_list:
            continue
        stem = snowball_stemmer.stem(token)
        # The stop-word test is applied to the stem, not the raw token.
        if stem in stop_words:
            continue
        word_tokens.append(reg_exp.sub('', stem))

    phrase_tokens = []
    for phrase in raw_phrases:
        stemmed_parts = [snowball_stemmer.stem(part) for part in nltk.word_tokenize(phrase)]
        phrase_tokens.append('_'.join(stemmed_parts))

    return word_tokens + phrase_tokens

In [5]:
# Load the cached TF-IDF matrix together with the vectorizer that produced it.
# Opened in binary mode ('rb'): pickle data is binary, and text mode fails on
# Python 3 and can corrupt the stream on Windows.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# NOTE(review): presumably the pickled vectorizer references tokenizer(), so
# that function must already be defined when this cell runs - confirm.
with open('tfidf_saas_companies_phrases.pkl', 'rb') as f:
    tmp = pickle.load(f)

tfidf,tfidf_vectorizer = tmp['tfidf'],tmp['vectorizer']
del tmp

In [6]:
# Load the previously fitted NMF topic model.
# Opened in binary mode ('rb'): pickle data is binary, and text mode fails on
# Python 3 and can corrupt the stream on Windows.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('nmf_saas.pkl', 'rb') as f:
    nmf = pickle.load(f)

In [7]:
# Human-readable label for each topic id (0-19) of the loaded NMF model.
# NOTE(review): 'infrastructre' in label 8 looks like a typo for
# 'infrastructure'; it is left untouched because the labels are written to
# the exported CSV.
topic_map_dic = {
    0: 'business process/growth strategy',
    1: 'contact details',
    2: 'marketing/media/branding',
    3: 'application development/web design',
    4: 'analytics/big data',
    5: 'project management/compliance',
    6: 'investment/venture capital',
    7: 'branding/customer engagement platform',
    8: 'cloud computing/infrastructre/salesforce',
    9: 'recruitment/hire talent',
    10: 'software/saas product/erp/crm',
    11: 'sales/crm/prospecting',
    12: 'healthcare',
    13: 'supply chain/logistics',
    14: 'mobile/web development',
    15: 'other language',
    16: 'saas solution provider',
    17: 'technology/product firm',
    18: 'consulting service/outsourcing',
    19: 'security/compliance',
}

In [8]:
import numpy as np,pandas as pd
# Topic weights for every row of the TF-IDF matrix.
preds = nmf.transform(tfidf)
# Divide each row by its own total so the weights of a row sum to 1.
# keepdims=True keeps the totals as a column so they broadcast across a row.
# A row whose weights sum to 0 becomes NaN and never passes the threshold.
row_totals = np.sum(preds, axis=1, keepdims=True)
preds_probs = preds / row_totals
# (row, col) positions of every normalised weight above 0.1.
preds_ind = np.where(preds_probs > 0.1)
preds_df = pd.DataFrame({'row': preds_ind[0], 'col': preds_ind[1]})

Now look at all the topics, find the topic ids that match the requirement, and remove the ones that do not. In this case we are looking only for SaaS products; the rest will be removed.

In [9]:
# Collect, for each row id, the list of topic ids that passed the threshold.
topic_ids_per_row = preds_df.groupby('row')['col']
preds_topics = topic_ids_per_row.apply(lambda topic_ids: topic_ids.tolist())
preds_topics.head()

row
0     [0, 2, 7, 14]
1    [5, 7, 10, 18]
2    [5, 7, 10, 18]
3        [1, 7, 10]
4        [1, 7, 10]
Name: col, dtype: object

In [10]:
# Keep a row when it has at least one required topic and no excluded topic.
# Per topic_map_dic: required 10 = 'software/saas product/erp/crm' and
# 17 = 'technology/product firm'; excluded 3 = 'application development/web
# design', 14 = 'mobile/web development', 18 = 'consulting service/outsourcing'.
req_topic_ids = {10,17}
rm_topic_ids = {3,14,18}
# Walk the index and the topic-id lists together instead of looking each row
# up through the deprecated `.ix` indexer (removed from current pandas), and
# use isdisjoint() so no temporary set is built per row.
# The set wrapper is kept so the resulting order matches the original cell.
match_inds = list({
    row for row, topic_ids in zip(preds_topics.index, preds_topics)
    if not req_topic_ids.isdisjoint(topic_ids)
    and rm_topic_ids.isdisjoint(topic_ids)
})
match_inds[:10]

[8192, 3, 4, 5, 6, 7, 8, 9, 10, 8203]

In [11]:
# Rebuild preds_topics with readable labels in place of numeric topic ids.
# This rebinds the name: from here on preds_topics holds label lists, so the
# id-based filtering cell must run before this one.
labels_per_row = preds_df.groupby('row')['col']
preds_topics = labels_per_row.apply(
    lambda topic_ids: [topic_map_dic[topic_id] for topic_id in topic_ids.tolist()]
)
preds_topics.head()

row
0    [business process/growth strategy, marketing/m...
1    [project management/compliance, branding/custo...
2    [project management/compliance, branding/custo...
3    [contact details, branding/customer engagement...
4    [contact details, branding/customer engagement...
Name: col, dtype: object

In [12]:
# Spot-check the description of the first matched company.
# `.loc` replaces the deprecated `.ix` indexer (removed from current pandas);
# 8192 is used as an index label, as `.ix` treated it on an integer index.
df.loc[8192, 'description']

u'ScheduleSoft to Workloud, we have been in business for nearly 20 years providing workforce management solutions.  Workloud, our latest end-to-end SaaS product is aimed at disrupting the space of workforce management software. '

In [13]:
# Select the matched companies and attach their topic labels.
# `.loc` replaces the deprecated `.ix` indexer (removed from current pandas).
# The joined Series is named 'col', so the column is renamed by name rather
# than by overwriting the last position of the column list.
final_df = (
    df.loc[match_inds, :]
    .join(preds_topics)
    .rename(columns={'col': 'topics'})
)
final_df.head()

Unnamed: 0,linkedin_url,description,specialties,website,topics
8192,https://www.linkedin.com/company/workloud,"ScheduleSoft to Workloud, we have been in busi...",,http://workloud.com,"[business process/growth strategy, project man..."
3,https://www.linkedin.com/company/11giraffes,11Giraffes is a Digital Signage software compa...,Digital Signage Software as a Service,http://www.11giraffes.com,"[contact details, branding/customer engagement..."
4,https://www.linkedin.com/company/122822,11Giraffes is a Digital Signage software compa...,Digital Signage Software as a Service,http://www.11giraffes.com,"[contact details, branding/customer engagement..."
5,https://www.linkedin.com/company/121nexus,121nexus developed a patented cloud-based SaaS...,"personal media, packaging, technology, micro-t...",http://www.121nexus.com,"[healthcare, technology/product firm]"
6,https://www.linkedin.com/company/121nexus,121nexus developed a patented cloud-based SaaS...,"personal media, packaging, technology, micro-t...",http://www.121nexus.com,"[healthcare, technology/product firm]"


In [14]:
# Spot-check one matched company's description.
# `.loc` replaces the deprecated `.ix` indexer (removed from current pandas);
# 5 is used as an index label.
final_df.loc[5, 'description']

u"121nexus developed a patented cloud-based SaaS micro-targeting and analytics platform that tracks physical asset performance.  121nexus' customers include large and small enterprises alike in industries like healthcare, education, and cosmetics. 121nexus' technology solves a problem so critical to public health that it was awarded a National Science Foundation grant to accelerate commercialization of the technology particularly in the healthcare industry.  The core technology has been claimed in issued US Patent 8,533,075."

In [75]:
# Export the matched companies. The index is not written, and quoting=1 is
# the numeric value of csv.QUOTE_ALL (every field is quoted).
final_df.to_csv(
    'saas_product_companies.csv',
    index=False,
    quoting=1,
    encoding='utf-8'
)

In [18]:
# Keep only the TF-IDF rows of the matched companies (all columns).
tfidf1 = tfidf[match_inds, :]

In [19]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic #<index>:`` header is
    printed, followed by one line holding the ``n_top_words`` entries of
    ``feature_names`` with the largest weights in that row, in descending
    weight order and separated by single spaces. One bare ``print()`` call
    follows the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        header = "Topic #%d:" % topic_idx
        print(header)
        # argsort is ascending, so reverse it and keep the leading entries.
        best_first = weights.argsort()[::-1][:n_top_words]
        line = " ".join(feature_names[j] for j in best_first)
        print(line)
    print()

In [21]:
# Refit a 20-topic NMF model on the matched (SaaS-only) rows and print the
# 20 strongest terms of each topic.
# NOTE(review): the `alpha` argument and `get_feature_names()` belong to the
# older scikit-learn API - confirm they exist in the installed version.
n_topics = 20
n_top_words = 20
nmf1 = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
nmf1.fit(tfidf1)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf1, tfidf_feature_names, n_top_words)

Topic #0:
solut manag provid client technolog industri system complianc process support deliv cost includ inform risk organ servic need offer oper
Topic #1:
softwar saa compani servic manag licens enterpris crm model use erp special account offer custom onlin vendor creat provid system
Topic #2:
market sale client strategi generat lead help media bb brand campaign revenu channel develop crm sell search content go agenc
Topic #3:
servicio empresa solucion client tecnologa sistema desd desarrollo consultora aplicacion ao herramienta producto modelo nuestro_client plataforma servio tecnolgica mercado especializada
Topic #4:
data analyt intellig big_data decis insight make inform analysi compani collect integr analyz visual research sourc tool inc big_data_analyt report
Topic #5:
cloud comput technolog salesforc solut oracl partner base migrat applic work paa environ ibm implement consult cloud_solut infrastructur iaa focus
Topic #6:
product saa_product build custom offer manufactur price 