In [2]:
import pandas as pd
df = pd.read_csv('company_data_for_clustering.csv',sep=';')

In [3]:
df = df.dropna()
df.index = range(df.shape[0])

In [None]:
import nltk,re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from text_processing import extract_phrases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [6]:
grammar = r"""
  NP1: {<JJ><NN.*>+}          # Chunk sequences of JJ, NN
  NP2: {<NN.*>+<JJ>}          # Chunk sequences of JJ, NN
  NP3: {<NN.*>+}                  #Noun phrases
  VP: {<VB.*><NN.*>+} # Chunk verbs and their arguments
  """
phr_list = ['NP1','NP2','NP3','VP']
tag_list = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
stop_words = stopwords.words()+['http','https','goo']

In [7]:
cp = nltk.RegexpParser(grammar)
pe = extract_phrases.PhraseExtractor()
snowball_stemmer = SnowballStemmer('english')
reg_exp = re.compile('[^a-zA-Z ]',re.IGNORECASE)
def tokenizer(text):
    pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    phrs = pe.extract_phrase_treeinput(cp.parse(pos_tags),['NP1','NP2','VP'])
    wrds = [snowball_stemmer.stem(i[0]) for i in pos_tags if i[1] in tag_list]
    wrds = [wrd for wrd in wrds if wrd not in stop_words]
    phrs = ['_'.join([snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    wrds = [reg_exp.sub('',i) for i in wrds]
    return wrds+phrs

In [11]:
data_samples = df['description']
data_samples = data_samples.dropna()
print df.shape,data_samples.shape

(23430, 3) (23430,)


In [12]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10,tokenizer=tokenizer)
tfidf = tfidf_vectorizer.fit_transform(data_samples)



In [15]:
import pickle
with open('tfidf_allcompanies_phrases.pkl','w') as f:
    pickle.dump({'tfidf':tfidf,'vectorizer':tfidf_vectorizer},f)

In [9]:
import pickle
with open('tfidf_allcompanies_phrases.pkl','r') as f:
    tmp = pickle.load(f)

tfidf,tfidf_vectorizer = tmp['tfidf'],tmp['vectorizer']
del tmp

In [10]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
n_topics,n_top_words=100,20
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topic #0:
solut provid deliv enterpris implement integr innov_solut offer expertis infrastructur challeng requir lead mobil deploy problem enabl custom_solut suit busi
Topic #1:
 info email empresa contact mai rate follow produto integr solu servio net melhor call a servic mercado projeto tel
Topic #2:
market campaign digit_market internet agenc email seo target result analyt social_media_market generat lead digit roi onlin_market digit_market_agenc social_media promot increas
Topic #3:
design graphic_design architectur web_design websit_design creat interior_design architect logo idea concept ident special creativ base graphic interior space fashion furnitur
Topic #4:
product line offer produc rang distributor distribut consum includ new_product film innov accessori supplier innov_product materi sell item select retail
Topic #5:
student school educ univers colleg teacher institut cours graduat campus teach faculti career studi provid prepar degre children scienc curriculum
Topic #6:
p

In [11]:
def gen_topic_names(model, feature_names, n_top_words):
    topic_dic = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dic[topic_idx] = " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
    return topic_dic

In [12]:
topic_map_dic = gen_topic_names(nmf, tfidf_feature_names, 4)

In [13]:
import numpy as np
preds = nmf.transform(tfidf)
preds_probs = preds.transpose()/np.sum(preds,1)
preds_probs = preds_probs.transpose()
preds_ind = np.where(preds_probs>0.1)
preds_df = pd.DataFrame({'row':preds_ind[0],'col':preds_ind[1]})
preds_df.head()

Unnamed: 0,col,row
0,0,0
1,30,0
2,34,0
3,2,1
4,59,2


In [14]:
preds_topics = preds_df.groupby('row')['col'].apply(lambda x: [topic_map_dic[i] for i in x.tolist()])
preds_topics.head()

row
0    [solut provid deliv enterpris, busi small_busi...
1              [market campaign digit_market internet]
2                               [india ltd pvt mumbai]
4    [engin optim civil_engin gas, sale sell genera...
5                      [technolog inform innov integr]
Name: col, dtype: object

In [15]:
df_topics = df.join(preds_topics)
df_topics.head()

Unnamed: 0,industry,description,specialties,col
0,Information Technology and Services,Crystal Approach is Canberra's premier busines...,"Software Solutions, Business-to-ICT Consulting...","[solut provid deliv enterpris, busi small_busi..."
1,Internet,"Omnicity, Inc. provides wireless broadband se...","rural broadband, internet, wireless",[market campaign digit_market internet]
2,Information Technology and Services,A & J Global Solutions is a leading provider o...,"Information Technology and services, Customer ...",[india ltd pvt mumbai]
3,Publishing,"Editorial independiente afincada en Barcelona,...",libros para colorear infantiles y para adultos...,
4,Information Technology and Services,Leading Provider of Managed Services. Cisco Ce...,"Cloud Services, Managed Services, IT Technical...","[engin optim civil_engin gas, sale sell genera..."


In [16]:
df_topics.columns = [u'industry', u'description', u'specialties', u'col']

In [17]:
df_topics.to_csv('all_companies_topics.csv',index=False,quoting=1)