In [30]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd,numpy as np
from sqlalchemy import create_engine
import pickle

In [106]:
engine = create_engine('postgresql://pipecandy_user:pipecandy@192.168.1.142:5432/pipecandy_db1')
df = pd.read_sql_query("select distinct on (linkedin_url) linkedin_url,industry,description,specialties,website "\
                       " from (select  b.* from linkedin_company_domains a join "\
                       " linkedin_company_base b using(linkedin_url) where "\
                       " a.country='UNITED STATES' and "\
                       " b.company_size in ('5001-10,000 employees','10,001+ employees') )a",con=engine)

In [107]:
df = df.dropna()
df.index = range(df.shape[0])
df.shape

(8333, 5)

In [108]:
with open('us_large_companies_data.pkl','w') as f:
    pickle.dump(df,f)

In [2]:
with open('us_large_companies_data.pkl','r') as f:
    df = pickle.load(f)

In [3]:
import nltk,re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from text_processing import extract_phrases
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from text_processing.tagging_methods import get_postag_listinput

In [4]:
grammar = r"""
  NP1: {<JJ><NN.*>+}          # Chunk sequences of JJ, NN
  NP2: {<NN.*>+<JJ>}          # Chunk sequences of JJ, NN
  NP3: {<NN.*>+}                  #Noun phrases
  VP: {<VB.*><NN.*>+} # Chunk verbs and their arguments
  """
phr_list = ['NP1','NP2','NP3','VP']
tag_list = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
stop_words = stopwords.words()+['http','https','goo','null']
cp = nltk.RegexpParser(grammar)
pe = extract_phrases.PhraseExtractor()
snowball_stemmer = SnowballStemmer('english')
reg_exp = re.compile('[^a-zA-Z ]',re.IGNORECASE)

In [5]:
def tokenizer(text):
    pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    phrs = pe.extract_phrase_treeinput(cp.parse(pos_tags),['NP1','NP2','VP'])
    wrds = [snowball_stemmer.stem(i[0]) for i in pos_tags if i[1] in tag_list]
    wrds = [wrd for wrd in wrds if wrd not in stop_words]
    phrs = ['_'.join([snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    wrds = [reg_exp.sub('',i) for i in wrds]
    return wrds+phrs

In [6]:
df_descr = df['description']
df_descr[df_descr==''] = 'NULL'
df_descr_tags = get_postag_listinput(list(df_descr))

In [7]:
df_spec = df['specialties']
#df_spec = [ ' '.join(['_'.join(spec.split(' ')) for spec in specs.split(',')]) for specs in df_spec]
df_spec =  [ ' '.join(['_'.join([snowball_stemmer.stem(wrd) 
                                 for wrd in spec.strip().split(' ') if wrd.lower() not in stop_words]) 
                       for spec in specs.split(',') ]) 
            for specs in df_spec]
df_spec[:2]

['', u'brand_leadership product_leadership oper_excel peopl_excel']

In [8]:
import warnings
warnings.filterwarnings('ignore')
df_descr_phrs = [pe.extract_phrase_treeinput(cp.parse(pos_tags),['NP1','NP2','VP']) for pos_tags in df_descr_tags]
df_descr_phrs[0]

[u'diversified media',
 u'leading businesses',
 u'real estate services',
 u'digital education',
 u'pay-TV distribution']

In [45]:
df_descr_wrds = [[snowball_stemmer.stem(i[0]) for i in pos_tags if i[1] in tag_list] for pos_tags in df_descr_tags]

In [46]:
df_descr_wrds = [[wrd for wrd in wrds if wrd not in stop_words] for wrds in df_descr_wrds]

In [48]:
df_descr_phrs_stem = [['_'.join([snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr)]) 
                      for phr in phrs] for phrs in df_descr_phrs]

In [51]:
df_descr_phrs_stem[0]

[u'diversifi_media',
 u'lead_busi',
 u'real_estat_servic',
 u'digit_educ',
 u'pay-tv_distribut']

In [52]:
df_descr_wrds = [[reg_exp.sub('',i) for i in wrds] for wrds in df_descr_wrds]

In [79]:
df_descr_wrds = [[wrd for wrd in wrds if wrd not in stop_words] for wrds in df_descr_wrds]

In [93]:
[i for i in range(len(df_descr_wrds)) if 'null' in df_descr_wrds[i]]

[]

In [80]:
len(df_descr_wrds),len(df_descr_phrs_stem)

(8333, 8333)

In [161]:
df_industry = [ re.sub('_+','_',re.sub('[^a-z_]','','_'.join(i.lower().split(' ')))) 
                       for i in df['industry']]
pd.Series(df_industry).value_counts()

hospital_health_care                    772
information_technology_and_services     455
retail                                  444
higher_education                        302
financial_services                      285
government_administration               254
computer_software                       242
automotive                              194
real_estate                             193
restaurants                             163
insurance                               162
consumer_goods                          161
education_management                    160
marketing_and_advertising               152
telecommunications                      150
oil_energy                              139
hospitality                             138
electricalelectronic_manufacturing      136
transportationtruckingrailroad          133
construction                            129
staffing_and_recruiting                 122
pharmaceuticals                         116
internet                        

In [162]:
data_samples = pd.Series([' '.join(df_descr_wrds[i])+' '+' '.join(df_descr_phrs_stem[i])+' '+ df_spec[i]
                          +' '+df_industry[i]
                          for i in range(len(df_descr_wrds))])

In [163]:
data_samples[20]

u'danon compani unifi govern mission bring health food peopl danon compani share goal grow busi custom area premium water cultur dairi danon north compani recogn leader danon compani subsidiari danon fortun compani food compani world lead posit dairi product water babi nutrit nutrit danon core valu foster growth reward entrepreneuri innov someth danon north compani see reflect peopl product cultur like learn encourag appli learn american_compani mani_peopl danon_north american_compani respect_custom fast-growth_area bottl_water american_compani global_leader danon_north american_compani health-focus_food_compani fresh_dairi_product medic_nutrit profession_growth american_compani cultur_dairi_product_manufactur_market premium_water food_beverages'

In [177]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10,tokenizer = lambda x: x.split(' '))
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [158]:
tfidf.shape

(8333, 3960)

In [130]:
with open('tfidf_us_large_companies_phrases.pkl','w') as f:
    pickle.dump({'tfidf':tfidf,'vectorizer':tfidf_vectorizer},f)

In [103]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [180]:
n_topics,n_top_words=30,20
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0:
 computer_software hospital_health_care entertainment pharmaceuticals publishing government_administration retail consumer_goods insurance real_estate broadcast_media electricalelectronic_manufacturing marketing_and_advertising transportationtruckingrailroad oil_energy construction defense_space education_management newspapers
Topic #1:
servic provid custom manag client support offer industri mainten facil qualiti need program serv includ valu offic deliv train focus
Topic #2:
health care hospit center hospital_health_care healthcar patient system medic physician communiti provid nurs clinic cancer medicin children facil home organ
Topic #3:
school student district educ education_management public teacher graduat high_school learn primarysecondary_education system achiev nation colleg children serv counti program prepar
Topic #4:
retail store merchandis fashion shop apparel custom accessori offer price sport groceri home oper specialti associ open style brand select
Topic #5:

In [183]:
#getting all food companies
import numpy as np,pandas as pd
preds = nmf.transform(tfidf)
preds_probs = preds.transpose()/np.sum(preds,1)
preds_probs = preds_probs.transpose()
preds_ind = np.where(preds_probs>0.1)
preds_df = pd.DataFrame({'row':preds_ind[0],'col':preds_ind[1]})

In [190]:
df_food1 = df.ix[preds_df[preds_df['col']==29]['row'],:]

In [191]:
data_samples_lda = pd.Series([' '.join(df_descr_wrds[i])+' '+' '.join(df_descr_phrs_stem[i])+' '+ df_spec[i]
                          for i in range(len(df_descr_wrds))])
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10,tokenizer = lambda x: x.split(' '))
tf = tf_vectorizer.fit_transform(data_samples_lda)

In [192]:
n_topics,n_top_words=30,20
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
www twitter inform facebook visit data cloud softwar follow san youtub applic technolog mobil solut storag busi platform connect run
Topic #1:
state citi secur depart govern servic nation agenc unit mission offic titl forc system coast defens law member provid general
Topic #2:
insur bank servic busi financ financi compani mortgag credit account payment offer financi_servic institut asset card invest loan citizen group
Topic #3:
market compani busi world brand network technolog solut product includ custom consum group countri servic media lead industri provid help
Topic #4:
cambridg lane staf admir import_role beacon spread global_manufactur work prove label sheet singapor assign fill carpet inject methodolog wireless entrust
Topic #5:
health care hospit center healthcar patient system provid medic servic includ medicin physician communiti cancer facil children nurs research clinic
Topic #6:
protect safeti fire emerg correct investig polic enforc scott law administ rescu supe

Trying without stemming

In [9]:
df_spec = df['specialties']
#df_spec = [ ' '.join(['_'.join(spec.split(' ')) for spec in specs.split(',')]) for specs in df_spec]
df_spec =  [ ' '.join(['_'.join([wrd 
                                 for wrd in spec.strip().split(' ') if wrd.lower() not in stop_words]) 
                       for spec in specs.split(',') ]) 
            for specs in df_spec]
df_spec[:2]

['',
 u'Brand_Leadership Product_Leadership Operational_Excellence People_Excellence']

In [10]:
df_descr_wrds = [[i[0].lower() for i in pos_tags if i[1] in tag_list] for pos_tags in df_descr_tags]

In [11]:
df_descr_wrds = [[wrd for wrd in wrds if wrd not in stop_words] for wrds in df_descr_wrds]

In [12]:
df_descr_phrs_stem = [['_'.join([wrd for wrd in nltk.word_tokenize(phr)]) 
                      for phr in phrs] for phrs in df_descr_phrs]

In [13]:
df_descr_phrs_stem[0]

[u'diversified_media',
 u'leading_businesses',
 u'real_estate_services',
 u'digital_education',
 u'pay-TV_distribution']

In [14]:
df_descr_wrds = [[reg_exp.sub('',i) for i in wrds] for wrds in df_descr_wrds]

In [15]:
df_descr_wrds = [[wrd for wrd in wrds if wrd not in stop_words] for wrds in df_descr_wrds]

In [16]:
len(df_descr_wrds),len(df_descr_phrs_stem)

(8333, 8333)

In [17]:
df_industry = [ re.sub('_+','_',re.sub('[^a-z_]','','_'.join(i.lower().split(' ')))) 
                       for i in df['industry']]

In [19]:
df_industry[:2],df_descr_wrds[:2],df_descr_phrs_stem[:2],df_spec[:2]

([u'online_media', u'consumer_goods'],
 [[u'news',
   u'corp',
   u'media',
   u'information',
   u'services',
   u'company',
   u'focused',
   u'creating',
   u'distributing',
   u'engaging',
   u'content',
   u'consumers',
   u'world',
   u'company',
   u'comprises',
   u'businesses',
   u'range',
   u'media',
   u'including',
   u'news',
   u'information',
   u'services',
   u'estate',
   u'services',
   u'book',
   u'publishing',
   u'education',
   u'sports',
   u'programming',
   u'distribution',
   u'headquartered',
   u'new',
   u'york',
   u'activities',
   u'news',
   u'corp',
   u'conducted',
   u'united',
   u'states',
   u'australia',
   u'united',
   u'kingdom'],
  [u'generations',
   u'whirlpool',
   u'corporation',
   u'helping',
   u'people',
   u'make',
   u'time',
   u'focus',
   u'matters',
   u'',
   u'families',
   u'lives',
   u'way',
   u'thing',
   u'guided',
   u'everything',
   u'generations',
   u'sustain',
   u'commitment',
   u'make',
   u'experience',
   

In [20]:
data_samples = pd.Series([' '.join(df_descr_wrds[i])+' '+' '.join(df_descr_phrs_stem[i])+' '+ df_spec[i]
                          +' '+df_industry[i]
                          for i in range(len(df_descr_wrds))])

In [22]:
data_samples[10]

u'land olakes inc americas cooperatives offer cooperatives producers nation line supplies production business services marketer food products consumers foodservice professionals food manufacturers career land olakes connects loved brands america youre becoming part something today employees cooperative nation fortune sales exceeding business states countries allows share solutions people world member-owned_cooperatives local_cooperatives agricultural_producers extensive_line agricultural_supplies state-of-the-art_production leading_marketer dairy-based_food_products agricultural_solutions Agricultural_Products Food_Manufacturer Foodservice Animal_Feed_Nutrition food_production'

In [227]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10,tokenizer = lambda x: x.split(' '))
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [228]:
tfidf.shape

(8333, 5193)

In [207]:
n_topics,n_top_words=30,20
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0:
 computer_software hospital_health_care pharmaceuticals publishing retail oil_energy government_administration broadcast_media electricalelectronic_manufacturing transportationtruckingrailroad marketing_and_advertising real_estate defense_space education_management airlinesaviation newspapers military medical_devices nonprofit_organization_management
Topic #1:
solutions technology data software business computer_software cloud information customers enterprise applications management help systems analytics infrastructure technologies provider organizations healthcare
Topic #2:
health care hospital hospital_health_care hospitals healthcare center system medical patients physicians centers cancer medicine community patient clinics facilities home nursing
Topic #3:
school schools district students education education_management student public teachers learning primarysecondary_education system districts children high_schools achievement college graduate nation elementary_schools


In [224]:
data_samples_lda = pd.Series([' '.join(df_descr_wrds[i])+' '+' '.join(df_descr_phrs_stem[i])+' '+ df_spec[i]
                          for i in range(len(df_descr_wrds))])

In [231]:
tf_vectorizer_lda = CountVectorizer(max_df=0.95, min_df=10,tokenizer = lambda x: x.split(' '))
tf_lda = tf_vectorizer_lda.fit_transform(data_samples_lda)
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10,tokenizer = lambda x: x.split(' '))
tf = tf_vectorizer.fit_transform(data_samples)

In [223]:
n_topics,n_top_words=30,20
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tf_lda)
tf_lda_feature_names = tf_vectorizer_lda.get_feature_names()
print_top_words(lda, tf_lda_feature_names, n_top_words)

Topic #0:
company service states  customers united inc stores customer business years locations america year operates associates chain quality supply distribution
Topic #1:
standard electronic fluid cooper arlington phones fuel emissions june fix established smart screen transfer samsung automotive_industry iphone delivery foundation brake
Topic #2:
entertainment sports media television events music content radio golf event group disney universal clubs publishing stations station include studios owns
Topic #3:
list fortune florida ranked state half recognized sun company percent bill employer admired ethics parent average customer worldwide efforts employees
Topic #4:
people  life employees work company help make provide service business mission team success needs benefits customers communities values opportunity
Topic #5:
 hospital national saint childrens commission nationwide news firstservice va magnet jefferson best corning org valley children center heart certification
Topic #6:


In [215]:
#try lsa
n_topics,n_top_words=30,20
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=n_topics,random_state=0)
lsa.fit(tf)
print_top_words(lsa, tf_feature_names, n_top_words)

Topic #0:
 services health company care solutions business products service customers world management www people technology employees including center companies states
Topic #1:
 novartis spine diseases facts chemistry install red biology joint_commission html surgical charlotte commission computer days jefferson montgomery theater guys
Topic #2:
health care hospital center system hospitals medical university healthcare medicine research physicians patients community cancer centers school education nursing rehabilitation
Topic #3:
services management solutions clients provider  data provides consulting care healthcare government outsourcing title support security property maintenance risk staffing
Topic #4:
university education students school research schools county state programs district community college center city learning service business faculty nation medicine
Topic #5:
solutions technology products customers business systems data software technologies center help information

NMF was good at finding food related topics, but could not identify electronics. LDA could not identify food, but is better at identifying electronics. LSA was not giving good results.
Start with low number of topics and enlarge it.

In [232]:
n_topics,n_top_words=10,20
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
 company stores www retail states food brands restaurants facebook store home products inc twitter brand customers united restaurant visit
Topic #1:
transportation energy states construction united service services gas company  water logistics state oil america supply safety power north operations
Topic #2:
estate services real_estate management insurance property investment company  agents title home investors mortgage professionals clients service offices ibm properties
Topic #3:
people business work  help world make service team company clients provide employees customers opportunities services success mission needs experience
Topic #4:
county city banking  san district bank department financial_services including government_administration citizens state services california area government finance community public
Topic #5:
staffing accounting tax networking staffing_and_recruiting personnel semiconductors professionals devices payroll finance semiconductor computer recrui

In [244]:
n_topics,n_top_words=10,20
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0:
 financial_services information_technology_and_services telecommunications retail computer_software entertainment hospital_health_care automotive government_administration pharmaceuticals publishing consumer_goods oil_energy broadcast_media electricalelectronic_manufacturing marketing_and_advertising transportationtruckingrailroad real_estate construction
Topic #1:
services solutions management technology business clients data security systems information customers provider software information_technology_and_services companies consulting customer support industry service
Topic #2:
health care hospital hospital_health_care hospitals healthcare center system medical patients physicians services centers community cancer medicine communities patient home quality
Topic #3:
university research higher_education education students state programs universities faculty campus college medicine graduate center colleges institution teaching campuses learning world
Topic #4:
school schools

Small topics approach do not seem to work. Food is nowhere in the topics. NMF is clearly not suited here.
Try clustering and then topic extraction from each cluster?
Another option is to look at interesting words and their distribution in topics. Take the top topics and do topic extraction on them.