Text processing functions

In [50]:
grammar = r"""
  NP1: {<JJ><NN.*>+}          # Chunk sequences of JJ, NN
  NP2: {<NN.*>+<JJ>}          # Chunk sequences of NN and JJ
  NP3: {<NN.*>+}                  #Noun phrases
  VP: {<VB.*><NN.*>+} # Chunk verbs and their arguments
  """
phr_list = ['NP1','NP2','NP3','VP']
tag_list = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
import nltk,re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from text_processing import extract_phrases
stop_words = stopwords.words()+['http','https','goo','isnt']
cp = nltk.RegexpParser(grammar)
pe = extract_phrases.PhraseExtractor()
snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()
reg_exp = re.compile('[^a-zA-Z ]',re.IGNORECASE)
def tokenizer(text,stem_type='lemmatize'):
    pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    phrs = pe.extract_phrase_treeinput(cp.parse(pos_tags),['NP1','NP2','VP'])
    if stem_type == 'stem':
        wrds = [snowball_stemmer.stem(i[0]) for i in pos_tags if i[1] in tag_list]
    elif stem_type == 'lemmatize':
        wrds = [wordnet_lemmatizer.lemmatize(i[0]) for i in pos_tags if i[1] in tag_list]
    wrds = [wrd for wrd in wrds if wrd not in stop_words]
    if stem_type == 'stem':
        phrs = ['_'.join([snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    elif stem_type == 'lemmatize':
        phrs = ['_'.join([wordnet_lemmatizer.lemmatize(wrd) for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    else:
        phrs = ['_'.join([wrd for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    wrds = [reg_exp.sub('',i) for i in wrds]
    return wrds+phrs


Reading and cleaning data

In [3]:
import pandas as pd
df = pd.read_csv('ecommerce_analysis/ecommerce_companies.csv',sep=None)
df = df.dropna()
df.index = range(df.shape[0])

  from ipykernel import kernelapp as app


In [4]:
df['specialties1'] = [' '.join(set(' '.join([wrds.lower().strip()+' '+'_'.join([wrd.lower().strip() 
                                     for wrd in wrds.split(' ') if wrd])
      for wrds in spec.split(',')]).split(' ')))
        if spec != 'NULL' else '' for spec in df['specialties']]
df['specialties1'].head()

0    and digital_marketing_agency_specialised_in_of...
1    e-marketing experience e-commerce user user_ex...
2    shop and photo flexible_iterations iterations ...
3    shop and photo flexible_iterations iterations ...
4    e-commerce_solutions design&development experi...
Name: specialties1, dtype: object

In [6]:
df['description_phr'] = df['description'].apply(lambda x: ' '.join(tokenizer(unicode(x,'ascii','ignore'))))

In [7]:
df.to_csv('ecommerce_analysis/ecommerce_companies_with_phrases.csv',index=False,quoting=1)

Creating Matrix from 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10,tokenizer=tokenizer)
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [None]:
import pickle
with open('tfidf_ecommerce_phrases.pkl','w') as f:
    pickle.dump({'tfidf':tfidf,'vectorizer':tfidf_vectorizer},f)

In [None]:
with open('tfidf_ecommerce_phrases.pkl','r') as f:
    tmp = pickle.load(f)

In [6]:
tfidf,tfidf_vectorizer = tmp['tfidf'],tmp['vectorizer']

In [7]:
del tmp

In [10]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
n_topics,n_top_words=20,20
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topic #0:
solut provid custom deliv client e-commerc_solut web_solut need offer requir busi integr enterpris implement meet cost ecommerc_solut base innov_solut rang
Topic #1:
 contact email info mai sale mobil pleas call phone partner support produto twitter rate follow servio data php expertis
Topic #2:
design websit_design web_design graphic_design develop print logo ident studio offer graphic base web includ brochur engin seo packag special creativ_design
Topic #3:
technolog compani consult inform industri enterpris innov use partner expertis focus experi firm lead data organ world leverag applic invest
Topic #4:
product shop custom onlin sell store retail price buy consum fashion offer marketplac platform sale make home purchas world order
Topic #5:
client work team project experi deliv make agenc creat year result believ peopl time understand success need take build idea
Topic #6:
empresa client solucion servicio diseo somo internet mai mercado desarrollo proyecto onlin resultado

In [45]:
import numpy as np,pandas as pd
preds = nmf.transform(tfidf)
preds_probs = preds.transpose()/np.sum(preds,1)
preds_probs = preds_probs.transpose()

In [47]:
preds_ind = np.where(preds_probs>0.1)
preds_df = pd.DataFrame({'row':preds_ind[0],'col':preds_ind[1]})
preds_df.head()

Unnamed: 0,col,row
0,1,2
1,8,2
2,12,2
3,18,2
4,1,3


In [16]:
# create mappings for topics 
topic_map_dic = {
    0:'e-commerce solution',1:'contact info',2:'design (web,graphic)',3:'consulting',
    4:'e-commerce store',5:'project delivery',6:'description not in english',7:'application development',
    8:'outsourcing firm',9:'software developing',10:'payment processing',11:'help business grow',
    12:'e-commerce platform',13:'brand strategy/marketing',14:'SEO',15:'build website/SEO',16:'social media links',
    17:'SEO/social media marketing',18:'content management system/CRM',19:'mobile app development'
}

In [48]:
# group by row to get all topics for a row
preds_topics = preds_df.groupby('row')['col'].apply(lambda x: [topic_map_dic[i] for i in x.tolist()])
preds_topics.head()

row
2    [contact info, outsourcing firm, e-commerce pl...
3    [contact info, outsourcing firm, e-commerce pl...
4    [contact info, design (web,graphic), applicati...
5    [e-commerce platform, SEO/social media marketing]
6    [consulting, application development, software...
Name: col, dtype: object

In [20]:
df = pd.read_csv('ecommerce_companies.csv')
df = df.dropna()
df.index = range(df.shape[0])
data_samples = df['description']
data_samples = data_samples.dropna()

In [22]:
data_samples.head()

0                                                   . 
1    .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  ....
2    004 Arabia is a privately owned small to mediu...
3    004 Arabia is a privately owned small to mediu...
4    01 Technosys is a web & mobile(iOS/Android) ap...
Name: description, dtype: object

In [50]:
df_topics = df.join(preds_topics)
df_topics.to_csv('ecommerce_topics.csv',index=False,quoting=1)

trying clustering

In [6]:
import pickle
with open('ecommerce_analysis/tfidf_ecommerce_with_phrases.pkl','r') as f:
    tmp = pickle.load(f)

In [3]:
tfidf,tfidf_vectorizer = tmp['tfidf'],tmp['vectorizer']

IndexError: invalid index

In [7]:
type(tmp)

scipy.sparse.csr.csr_matrix

Company classification : Dec 17

In [1]:
import pandas as pd

In [65]:
df = pd.read_excel('ecommerce_analysis/ecommerce_sample_data_for_ui_with_people1.xls')
df = df.fillna('')

In [86]:
cols = ['description','description_phr_lemma','description_phr']
data_samples = df[cols[0]]
if len(cols)>1:
    for col in cols[1:]:
        data_samples = data_samples + df[col]
data_samples[0]

u'Article (formerly Bryght) delivers beautifully designed modern furniture, with outstanding attention to detail, at fair prices. Find us at our new page: https://www. linkedin. com/company/7950401Article Bryght delivers designed furniture attention detail price Find page www linkedin comcompany modern_furniture outstanding_attention fair_price new_pagearticl bryght deliv design furnitur attent detail price find page www linkedin comcompany modern_furnitur outstand_attent fair_price new_page'

In [66]:
df = df.drop_duplicates('company_name')
df.shape

(1941, 43)

In [67]:
df['specialties1'] = [' '.join(set(' '.join([wrds.lower().strip()+' '+'_'.join([wrd.lower().strip() 
                                     for wrd in wrds.split(' ') if wrd])
      for wrds in spec.split(',')]).split(' ')))
        if spec != 'NULL' else '' for spec in df['specialties']]

In [68]:
df['description_phr'] = df['description'].apply(lambda x: ' '.join(tokenizer(x,stem_type='stem')))



In [69]:
df['description_phr_lemma'] = df['description'].apply(lambda x: ' '.join(tokenizer(x,stem_type='lemmatize')))



In [47]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=5)
tfidf = tfidf_vectorizer.fit_transform(df['description_phr_lemma']+df['specialties1'])
tfidf.shape

(1941, 3314)

In [79]:
n_topics,n_top_words=40,20
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0:
customer people service work business team make value world quality day offer year need employee life community experience part career
Topic #1:
fashion woman designer style trend look accessory shopping collection wear inspired boutique styling world clothes stylist destination size women label
Topic #2:
platform solution commerce ecommerce technology consumer retailer experience customer marketing sale business data content shopping mobile user software channel analytics
Topic #3:
www visit information commerce retail co please english read founded brands germany com internet career site website job europe learn
Topic #4:
group market leading leader operates portfolio distribution retail groups employee segment textile country player label good electronics dag outlet entertainment
Topic #5:
design quality collection style custom designer manufacturing made production create accessory world fabric new material craftsmanship york italy shirt leather
Topic #6:
luxury lingerie 

In [80]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=5)
tf = tf_vectorizer.fit_transform(df['description_phr_lemma']+df['specialties1'])
tf.shape

(1941, 3314)

In [81]:
n_topics,n_top_words=40,20
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
kitchen bathroom reader elevate writing publishing sustainability rewards loyalty impact content universal appliances perk allows portal resources performance engagement insight
Topic #1:
extra rent rental costume upload anyone rentals collaborative peer doorstep consumption appliances hotel books booking camping tools product date toys
Topic #2:
laboratory lab fisher science healthier innovative_technology programs patient nyse productivity solve services enable combination purchasing world customer improve research serving
Topic #3:
illinois king store part job www find lawn power center virginia know chemical year camping product including missouri pennsylvania feed
Topic #4:
king illinois chemical enjoying medication store location trip power michigan fishing missouri part including year virginia product record ohio cup
Topic #5:
shirts graphic age theme blend living clothes technology story option art lifestyle_brand delivery selection affordable_price platform closet co

In [89]:
def get_top_words_df(model, feature_names, n_top_words):
    '''
    :param model:
    :param feature_names:
    :param n_top_words:
    :return:
    '''
    topic_list = []
    for topic_idx, topic in enumerate(model.components_):
        topic_list.append((topic_idx," ".join([feature_names[i]
                                        for i in topic.argsort()[:-n_top_words - 1:-1]])))
    return pd.DataFrame.from_records(topic_list,columns=['topic_id','top_words'])


In [164]:
topics_df = get_top_words_df(lda, tf_feature_names, n_top_words)
nmf_topics_df = get_top_words_df(nmf,tfidf_feature_names,n_top_words)
topics_df['topic_name'] = topics_df['top_words'].apply(lambda x: '_'.join(x.split()[:3]))
list(topics_df.ix[topics_df['topic_id']==1]['topic_name'])[0]

u'extra_rent_rental'

In [161]:
import numpy as np,pandas as pd
data_samples = df['description_phr_lemma']
data_samples.index = range(data_samples.shape[0])
data_samples.shape,tf.shape

((1941,), (1941, 3314))

In [103]:
preds = lda.transform(tf)
preds_probs = preds.transpose()/np.sum(preds,1)
preds_probs = preds_probs.transpose()
preds_ind = np.where(preds_probs>0.1)
preds_df = pd.DataFrame({'row':preds_ind[0],'col':preds_ind[1]})
preds_df.head()

Unnamed: 0,col,row
0,18,0
1,30,0
2,30,1
3,18,2
4,14,3


In [146]:
pd.merge(preds_df,topics_df,how='left',left_on='col',right_on='topic_id').groupby('row')['topic_name'].\
                            apply(lambda x: '|'.join([i for i in x.tolist() if i !='no_topic'])).head()

row
0          brand_fashion_design|product_customer_store
1                               product_customer_store
2                                 brand_fashion_design
3    art_artist_artwork|brand_fashion_design|produc...
4    kitchen_bathroom_reader|brand_fashion_design|p...
Name: topic_name, dtype: object

In [154]:
#below is slow implementation
#preds_topics = preds_df.groupby('row')['col'].apply(lambda x: 
#                                    {list(topics_df.ix[(topics_df['topic_id']==i)]['topic_name'])[0] for i in x.tolist()})
#preds_topics.head()
pred_topics = pd.merge(preds_df,topics_df,how='left',left_on='col',right_on='topic_id').groupby('row')['topic_name'].\
                            apply(lambda x: '|'.join([i for i in x.tolist() if i !='no_topic'])).head()
pred_topics.head()

row
0          brand_fashion_design|product_customer_store
1                               product_customer_store
2                                 brand_fashion_design
3    art_artist_artwork|brand_fashion_design|produc...
4    kitchen_bathroom_reader|brand_fashion_design|p...
Name: topic_name, dtype: object

In [168]:
topic_names_df = pd.concat([data_samples,pred_topics,pred_topics],axis=1)
topic_names_df.columns = ['text','nmf_topics','lda_topics']
topic_names_df.head()

Unnamed: 0,text,nmf_topics,lda_topics
0,Article Bryght delivers designed furniture att...,brand_fashion_design|product_customer_store,brand_fashion_design|product_customer_store
1,something buy Want price Demand offer Flubit s...,product_customer_store,product_customer_store
2,friend Mark Talucci Todd Elliott found inspira...,brand_fashion_design,brand_fashion_design
3,BucketFeet BucketFeets mission people art emer...,art_artist_artwork|brand_fashion_design|produc...,art_artist_artwork|brand_fashion_design|produc...
4,founding Basic Resources held mission elevate ...,kitchen_bathroom_reader|brand_fashion_design|p...,kitchen_bathroom_reader|brand_fashion_design|p...


In [134]:
topics_df

Unnamed: 0,topic_id,top_words,topic_name
0,0,kitchen bathroom reader elevate writing publis...,kitchen_bathroom_reader
1,1,extra rent rental costume upload anyone rental...,extra_rent_rental
2,2,laboratory lab fisher science healthier innova...,laboratory_lab_fisher
3,3,illinois king store part job www find lawn pow...,illinois_king_store
4,4,king illinois chemical enjoying medication sto...,king_illinois_chemical
5,5,shirts graphic age theme blend living clothes ...,shirts_graphic_age
6,6,book photography photo camera analysis site we...,book_photography_photo
7,7,bike india times business product news grocery...,bike_india_times
8,8,life employee retail_location program brand ga...,life_employee_retail_location
9,9,food group retailer business customer india on...,food_group_retailer
