In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [57]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/crawler_service_test')
df = pd.read_sql_query("select distinct linkedin_url,company_name,description,specialties from crawler.linkedin_company_base "\
                       " where list_id in ('549e8696-92b1-11e6-9f5b-7778f52b0d6b','8571854e-92b5-11e6-a864-8b705e5bfa79')",con=engine)

In [58]:
df = df.dropna()
df.index = range(df.shape[0])
df.shape

(402, 4)

In [59]:
import nltk,re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from text_processing import extract_phrases
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from text_processing.tagging_methods import get_postag_listinput

In [60]:
grammar = r"""
  NP1: {<JJ><NN.*>+}          # Chunk sequences of JJ, NN
  NP2: {<NN.*>+<JJ>}          # Chunk sequences of JJ, NN
  NP3: {<NN.*>+}                  #Noun phrases
  VP: {<VB.*><NN.*>+} # Chunk verbs and their arguments
  """
phr_list = ['NP1','NP2','NP3','VP']
tag_list = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
stop_words = stopwords.words()+['http','https','goo','null']
cp = nltk.RegexpParser(grammar)
pe = extract_phrases.PhraseExtractor()
snowball_stemmer = SnowballStemmer('english')
reg_exp = re.compile('[^a-zA-Z ]',re.IGNORECASE)

In [7]:
def tokenizer(text):
    pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    phrs = pe.extract_phrase_treeinput(cp.parse(pos_tags),['NP1','NP2','VP'])
    wrds = [snowball_stemmer.stem(i[0]) for i in pos_tags if i[1] in tag_list]
    wrds = [wrd for wrd in wrds if wrd not in stop_words]
    phrs = ['_'.join([snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    wrds = [reg_exp.sub('',i) for i in wrds]
    return wrds+phrs

In [61]:
df_descr = df['description']
df_descr[df_descr==''] = 'NULL'
df_descr_tags = get_postag_listinput(list(df_descr))

In [62]:
df_spec = df['specialties']
#df_spec = [ ' '.join(['_'.join(spec.split(' ')) for spec in specs.split(',')]) for specs in df_spec]
df_spec =  [ ' '.join(['_'.join([snowball_stemmer.stem(wrd) 
                                 for wrd in spec.strip().split(' ') if wrd.lower() not in stop_words]) 
                       for spec in specs.split(',') ]) 
            for specs in df_spec]
df_spec[:2]

['', u'hospit_cater']

In [63]:
import warnings
warnings.filterwarnings('ignore')
df_descr_phrs = [pe.extract_phrase_treeinput(cp.parse(pos_tags),['NP1','NP2','VP']) for pos_tags in df_descr_tags]
df_descr_phrs[0]

[u'local travel information',
 u'retail store',
 u'representative offices',
 u'overseas airline tickets',
 u'broad range',
 u'tour trips']

In [64]:
df_descr_wrds = [[snowball_stemmer.stem(i[0]) for i in pos_tags if i[1] in tag_list] for pos_tags in df_descr_tags]

In [65]:
df_descr_wrds = [[wrd for wrd in wrds if wrd not in stop_words] for wrds in df_descr_wrds]

In [66]:
df_descr_phrs_stem = [['_'.join([snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr)]) 
                      for phr in phrs] for phrs in df_descr_phrs]

In [67]:
df_descr_phrs_stem[0]

[u'local_travel_inform',
 u'retail_store',
 u'repres_offic',
 u'oversea_airlin_ticket',
 u'broad_rang',
 u'tour_trip']

In [68]:
df_descr_wrds = [[reg_exp.sub('',i) for i in wrds] for wrds in df_descr_wrds]

In [69]:
df_descr_wrds = [[wrd for wrd in wrds if wrd not in stop_words] for wrds in df_descr_wrds]

In [70]:
[i for i in range(len(df_descr_wrds)) if 'null' in df_descr_wrds[i]]

[]

In [71]:
len(df_descr_wrds),len(df_descr_phrs_stem)

(402, 402)

In [72]:
data_samples = pd.Series([' '.join(df_descr_wrds[i])+' '+' '.join(df_descr_phrs_stem[i])+' '+ df_spec[i]
                          for i in range(len(df_descr_wrds))])

In [73]:
data_samples[20]

u'combin communic network subsidiari cabcharg australia limit employ profession headquart oriordan street alexandria ccn year experi deliv communic dispatch servic taxi industri vehicl driver fleet includ  taxi combin servic tcs silver servic sydney premium taxi servic south cab  servic passeng wheelchair newcastl taxi  abc cab specialis sydney lower north shore adelaid  comput cab cab sydney  lime taxi  appl taxi addit oper taxi fleet ccn provid associ specialist servic owner oper driver includ  taxi train australia  uniform shop  call centr dispatch  radio frequenc spectrum  stratacom  smash repair cabsur insur  regul complianc own_subsidiari skill_profession purpose-design_headquart high-tech_communic logist_servic western_cab \u2022_newcastl_taxi yellow_adelaid yellow_cab \u2022_cabsur_insur contact_centr transport vehicl_dispatch taxi'

In [74]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10,tokenizer = lambda x: x.split(' '))
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [75]:
tfidf.shape

(402, 461)

In [76]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [77]:
n_topics,n_top_words=10,20
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0:
servic provid custom busi freight australia oper manag client deliveri includ compani team melbourn solut distribut express commit industri support
Topic #1:
travel group world compani holiday busi manag agenc tour experi leisur adventur technolog brand market offic help australia corpor flight
Topic #2:
 australia compani www base leisur lead sydney state unit locat hospit adelaid oper budget taxi road flight level truck
Topic #3:
cruis line ship holiday tour island offer day part repres option destin oper brand airfar world america perth select luxuri
Topic #4:
rail train south network track freight passeng infrastructur queensland wale oper kilometr mainten australia pacif access servic journey aim sydney
Topic #5:
airport airlin aviat pacif fli group aircraft asia australia destin air airway flight peopl oper passeng singapor melbourn network qanta
Topic #6:
tourism event coast visitor promot govern destin attract queensland gold world park develop australia nsw state ope

In [78]:
data_samples_lda = pd.Series([' '.join(df_descr_wrds[i])+' '+' '.join(df_descr_phrs_stem[i])+' '+ df_spec[i]
                          for i in range(len(df_descr_wrds))])
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10,tokenizer = lambda x: x.split(' '))
tf = tf_vectorizer.fit_transform(data_samples_lda)

In [79]:
n_topics,n_top_words=10,20
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
 go queensland brand travel network compani peopl oper transport group get truck leas budget provid airlin list includ rout
Topic #1:
travel australia compani  servic group oper airport busi car provid world holiday offer experi year industri new custom airlin
Topic #2:
rail australia experi oper servic world provid locat passeng tour attract  tourism cairn includ citi transport coast network railway
Topic #3:
entertain sport re transport media airfar industri perform home compani properti accommod travel connect servic australia day rang get www
Topic #4:
servic transport  custom australia provid freight busi compani oper manag deliveri road express includ industri deliv solut vehicl logist
Topic #5:
cruis line repres world market australia reput ship event tourism america industri island  brand achiev sydney growth excel strive
Topic #6:
asia pacif resort south hotel travel offic east region coast queensland gold famili group brand accommod australia worldwid compani part
T

NMF seems to give a topic related to hotels (topic 7). Remove them.

In [80]:
import numpy as np,pandas as pd
preds = nmf.transform(tfidf)
preds_probs = preds.transpose()/np.sum(preds,1)
preds_probs = preds_probs.transpose()
preds_ind = np.where(preds_probs>0.2)
preds_df = pd.DataFrame({'row':preds_ind[0],'col':preds_ind[1]})

In [81]:
hotel_inds = set(preds_df[preds_df['col']==8]['row'])
other_inds = list(set(range(df.shape[0]))-hotel_inds)

In [82]:
df_target = df.ix[other_inds,:]

In [83]:
df_target.head()

Unnamed: 0,linkedin_url,company_name,description,specialties
0,https://www.linkedin.com/company/3549182,H.I.S. Oceania,"H. I. S. was first established in Tokyo, Japa...",
2,https://www.linkedin.com/company/flightradar24-ab,Flightradar24 AB,Flightradar24 tracks positions of aircraft in ...,"Flight tracking, App development, Aviation"
3,https://www.linkedin.com/company/scoot,Scoot,Scoot™ is a wholly owned subsidiary of Singapo...,"Low-cost mid to long haul air travel, Empoweri..."
4,https://www.linkedin.com/company/2443027,Monson Agencies Australia Pty Ltd,"As a leading shipping agent in Australia, our ...","Full Agency Appointments, Protective Agency Ap..."
5,https://www.linkedin.com/company/3285436,Raw Hire,"Raw Hire, previously McLaren, is a vehicle ren...","Vehicle Rental, Fleet Management, Ute, Wagon a..."


In [85]:
df_target.to_sql('tmp_freshdesk_aus_companies',engine,index=False,schema='crawler')