In [4]:
# Chunk grammar given to nltk.RegexpParser further down. The "#..." text at the
# end of each rule line lives inside the string literal, so it is part of the
# grammar text itself rather than a Python comment.
grammar = r"""
  NP1: {<JJ><NN.*>+}          # Chunk sequences of JJ, NN
  NP2: {<NN.*>+<JJ>}          # Chunk sequences of NN and JJ
  NP3: {<NN.*>+}                  #Noun phrases
  VP: {<VB.*><NN.*>+} # Chunk verbs and their arguments
  """

# Labels of the chunk rules declared in `grammar`.
phr_list = [
    'NP1',
    'NP2',
    'NP3',
    'VP',
]

# POS tags (noun and verb variants) that tokenizer() keeps as single words.
tag_list = 'NN NNS NNP NNPS VB VBD VBG VBN VBP VBZ'.split()
import nltk,re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from text_processing import extract_phrases
# Stop words: stopwords.words() is called without a language, so the list is
# not restricted to English; a few URL fragments are appended. Stored as a set
# because tokenizer() performs one membership test per token, which is a
# linear scan on a list.
stop_words = set(stopwords.words() + ['http', 'https', 'goo'])
# Chunk parser built from the grammar defined above.
cp = nltk.RegexpParser(grammar)
# Project-local phrase extractor (text_processing.extract_phrases).
pe = extract_phrases.PhraseExtractor()
snowball_stemmer = SnowballStemmer('english')
# Matches every character that is neither an ASCII letter nor a space.
reg_exp = re.compile('[^a-zA-Z ]',re.IGNORECASE)
def tokenizer(text):
    """Turn raw text into stemmed single-word tokens plus stemmed phrases.

    Args:
        text: The document string to tokenize.

    Returns:
        A list of strings: first the cleaned, stemmed nouns/verbs (tokens whose
        POS tag is in ``tag_list``), then one ``'_'``-joined stemmed token per
        phrase extracted from the NP1, NP2 and VP chunks.
    """
    pos_tags = nltk.pos_tag(nltk.word_tokenize(text))

    # Phrase extraction is delegated to the project helper; each returned
    # phrase is re-tokenized below, so it is expected to be a plain string.
    phrs = pe.extract_phrase_treeinput(cp.parse(pos_tags),['NP1','NP2','VP'])
    phrs = ['_'.join(snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr))
            for phr in phrs]

    wrds = []
    for word, tag in pos_tags:
        if tag not in tag_list:
            continue
        stem = snowball_stemmer.stem(word)
        # Stop-word lists hold surface forms, so test the raw token as well as
        # its stem; stemming alone lets forms such as "does" -> "doe" through.
        if word.lower() in stop_words or stem in stop_words:
            continue
        cleaned = reg_exp.sub('', stem)
        # Tokens made only of digits/punctuation become '' after cleaning;
        # dropping them keeps an empty-string feature out of the vocabulary.
        if cleaned:
            wrds.append(cleaned)
    return wrds + phrs


In [None]:
import pandas as pd
# Load the company table and keep only rows without any missing value.
# NOTE(review): this read uses sep=';' while the later read of the same file
# uses the default separator - confirm which one matches the CSV.
df = pd.read_csv('ecommerce_companies.csv', sep=';').dropna()
# Renumber rows 0..n-1 so positions in the TF-IDF matrix match the index.
df.index = range(len(df))
# Text column fed to the vectorizer.
data_samples = df['description'].dropna()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the custom word+phrase tokens: terms appearing in more than 95%
# of the documents, or in fewer than 10 documents, are ignored.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=10,
    tokenizer=tokenizer,
)
# Learn the vocabulary and produce the document-term matrix in one pass.
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [None]:
import pickle
# Cache the fitted vectorizer and the TF-IDF matrix so the slow fit can be
# skipped later. Pickle data is bytes, so the file must be opened in binary
# mode; text mode fails on Python 3 and can corrupt the stream on Windows.
with open('tfidf_ecommerce_phrases.pkl','wb') as pkl_file:
    pickle.dump({'tfidf':tfidf,'vectorizer':tfidf_vectorizer},pkl_file)

In [None]:
# Reload the cached TF-IDF matrix and vectorizer. The file is opened in binary
# mode because pickle data is bytes.
# NOTE: unpickling can execute arbitrary code - only load a file you produced
# yourself. The vectorizer was built with tokenizer=tokenizer and functions
# are pickled by name, so the cell defining tokenizer must have been run first.
with open('tfidf_ecommerce_phrases.pkl','rb') as pkl_file:
    tmp = pickle.load(pkl_file)

In [6]:
# Bind the two cached objects from the loaded dict to their working names.
tfidf = tmp['tfidf']
tfidf_vectorizer = tmp['vectorizer']

In [7]:
# Remove the temporary dict name; its contents stay reachable through
# tfidf and tfidf_vectorizer.
del tmp

In [10]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
# Number of NMF components to fit and number of terms printed per topic.
n_topics = 20
n_top_words = 20
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated as one weight array
            per topic.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: How many terms to list for each topic.

    For each topic a ``Topic #<idx>:`` header line is printed, followed by the
    terms in descending weight order separated by spaces. A single blank line
    is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in top_indices]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_terms))
    print()

# Factorise the TF-IDF matrix into n_topics components; random_state is fixed
# so repeated runs give the same topics.
# NOTE(review): the `alpha` argument and `get_feature_names()` are spelled
# differently in recent scikit-learn releases - confirm the installed version
# before re-running.
nmf = NMF(
    n_components=n_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)
# Column index -> term lookup used to label each topic.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topic #0:
solut provid custom deliv client e-commerc_solut web_solut need offer requir busi integr enterpris implement meet cost ecommerc_solut base innov_solut rang
Topic #1:
 contact email info mai sale mobil pleas call phone partner support produto twitter rate follow servio data php expertis
Topic #2:
design websit_design web_design graphic_design develop print logo ident studio offer graphic base web includ brochur engin seo packag special creativ_design
Topic #3:
technolog compani consult inform industri enterpris innov use partner expertis focus experi firm lead data organ world leverag applic invest
Topic #4:
product shop custom onlin sell store retail price buy consum fashion offer marketplac platform sale make home purchas world order
Topic #5:
client work team project experi deliv make agenc creat year result believ peopl time understand success need take build idea
Topic #6:
empresa client solucion servicio diseo somo internet mai mercado desarrollo proyecto onlin resultado

In [45]:
import numpy as np,pandas as pd
# Per-document topic weights from the fitted NMF model (one row per document).
preds = nmf.transform(tfidf)
# Normalise every row so its weights sum to 1. A document whose weights are
# all zero (e.g. a description that yields no vocabulary terms) would give
# 0/0 = NaN and a runtime warning; such rows are left as zeros instead, which
# any later "weight > threshold" test treats the same way as NaN.
row_totals = preds.sum(axis=1, keepdims=True)
preds_probs = np.divide(preds, row_totals, out=np.zeros_like(preds), where=row_totals > 0)

In [47]:
# Keep every (document, topic) pair whose normalised weight exceeds 0.1.
doc_topic_mask = preds_probs > 0.1
# Tuple of (row indices, column indices) of the True entries.
preds_ind = np.nonzero(doc_topic_mask)
# Long format: one record per retained (row, col) pair.
preds_df = pd.DataFrame({'row': preds_ind[0], 'col': preds_ind[1]})
preds_df.head()

Unnamed: 0,col,row
0,1,2
1,8,2
2,12,2
3,18,2
4,1,3


In [16]:
# Human-readable label for each NMF topic; the position in the list below is
# the topic index.
topic_map_dic = dict(enumerate([
    'e-commerce solution',
    'contact info',
    'design (web,graphic)',
    'consulting',
    'e-commerce store',
    'project delivery',
    'description not in english',
    'application development',
    'outsourcing firm',
    'software developing',
    'payment processing',
    'help business grow',
    'e-commerce platform',
    'brand strategy/marketing',
    'SEO',
    'build website/SEO',
    'social media links',
    'SEO/social media marketing',
    'content management system/CRM',
    'mobile app development',
]))

In [48]:
def _topic_labels(topic_ids):
    """Translate one document's topic indices into their readable labels."""
    return [topic_map_dic[topic_id] for topic_id in topic_ids.tolist()]

# Collapse the long (row, col) table into one list of topic labels per row.
preds_topics = preds_df.groupby('row')['col'].apply(_topic_labels)
preds_topics.head()

row
2    [contact info, outsourcing firm, e-commerce pl...
3    [contact info, outsourcing firm, e-commerce pl...
4    [contact info, design (web,graphic), applicati...
5    [e-commerce platform, SEO/social media marketing]
6    [consulting, application development, software...
Name: col, dtype: object

In [20]:
# Re-read the company table so the topic labels can be joined onto it.
# NOTE(review): the earlier read of this file passes sep=';' but this one uses
# the default separator. The join below matches rows by position, so both
# reads must yield the same rows in the same order - confirm the separator.
df = pd.read_csv('ecommerce_companies.csv').dropna()
# Renumber rows 0..n-1 so the index lines up with the 'row' ids in preds_df.
df.index = range(len(df))
data_samples = df['description'].dropna()

In [22]:
# Preview the first few descriptions as a sanity check on the load.
data_samples.head()

0                                                   . 
1    .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  ....
2    004 Arabia is a privately owned small to mediu...
3    004 Arabia is a privately owned small to mediu...
4    01 Technosys is a web & mobile(iOS/Android) ap...
Name: description, dtype: object

In [50]:
# Attach the per-row topic lists to the company table by index. Rows with no
# topic above the threshold get a missing value. The new column takes the
# Series name, 'col'.
df_topics = df.join(preds_topics, how='left')
# quoting=1 is csv.QUOTE_ALL: every field is written inside quotes.
df_topics.to_csv('ecommerce_topics.csv', index=False, quoting=1)