# Initial attempt at creating Naive Bayes and SVM models to perform categorization of companies based on test descriptions and categories.
### Note: Most of the sci-kit learn code comes from this tutorial: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html


In [2]:
import pandas as pd
# Have a look at the category labeled training data 
full_categories_df = pd.read_csv("data/category_training_labeled_fixed.csv",  encoding = "ISO-8859-1")
print(full_categories_df.shape)
full_categories_df.info()

(3000, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 9 columns):
domain         3000 non-null object
rs_category    3000 non-null object
tx_industry    1669 non-null object
cb_category    1580 non-null object
tx_category    1669 non-null object
pb_desc        2336 non-null object
cb_desc        2288 non-null object
pb_industry    1390 non-null object
pb_category    2335 non-null object
dtypes: object(9)
memory usage: 211.0+ KB


In [3]:
full_categories_df.head()

Unnamed: 0,domain,rs_category,tx_industry,cb_category,tx_category,pb_desc,cb_desc,pb_industry,pb_category
0,conferencecloud.co,Business Communication Application,,information technology,,Provider of an online conferencing platform. T...,ConferenceCloud provides state-of-the-art live...,,Social/Platform Software
1,terminus.com,Marketing Software and Service,"AdTech,AdTech,Enterprise Applications",advertising|advertising platforms|b2b,"AdTech,MarketingTech,SaaS",Developer of a B2B advertising platform. The c...,Terminus is a platform that seamlessly integra...,"AdTech, Marketing Tech",Business/Productivity Software
2,galileoprocessing.com,Payment Application,Fintech,,Payment Cards,Provider of payment processing services. The c...,Next generation card processing platform,FinTech,Other Financial Services
3,pubble.co,Business Communication Application,,digital media|education|software,,Operator of a community engagement platform. T...,Pubble is a messaging platform that simplifies...,,Software Development Applications
4,cajo.fi,Printing Technology,,,,Developer of a stainless steel colour patterni...,,Manufacturing,Machinery (B2B)


In [4]:
# get the distinct categories as labeled by rocketship
labels = full_categories_df['rs_category'].unique()
print(labels)


['Business Communication Application' 'Marketing Software and Service'
 'Payment Application' 'Printing Technology' 'Alternative Lending'
 'Fashion' 'EdTech' 'Biotech & Pharmaceuticals'
 'Business Equipment and Supplies' 'Social Content'
 'Business Intelligence Software' 'Industrial Hardware, Supplies and Parts'
 'Consumer Finance' 'Human Capital Services' 'Food Product'
 'Personal Care Products and Service' 'Health and Fitness Application'
 'Medical Equipment' 'Healthcare Information Technology' 'Online Gifting'
 'Broadcasting, Radio and Television' 'Social Network' 'Data Analytics'
 'E-commerce Application' 'Travel' 'Software Development Applications'
 'Hospital and Clinical Service' 'Home Improvements' 'Music and Audio'
 'Transportation Tech' 'Real Estate' 'Alternative Energy' 'Agriculture'
 'Construction' 'Environmental Services' 'insurance' 'General Commerce'
 'Connectivity Products' 'Productivity Applications' 'Database Service'
 'Electronic Equipment and Hardware' 'Mining' 'Auto

In [5]:
# Create a dictionary where key = category, integer = unique integer

category_dict = {category: n for n, category in enumerate(labels)}
print(category_dict)

{'Business Communication Application': 0, 'Marketing Software and Service': 1, 'Payment Application': 2, 'Printing Technology': 3, 'Alternative Lending': 4, 'Fashion': 5, 'EdTech': 6, 'Biotech & Pharmaceuticals': 7, 'Business Equipment and Supplies': 8, 'Social Content': 9, 'Business Intelligence Software': 10, 'Industrial Hardware, Supplies and Parts': 11, 'Consumer Finance': 12, 'Human Capital Services': 13, 'Food Product': 14, 'Personal Care Products and Service': 15, 'Health and Fitness Application': 16, 'Medical Equipment': 17, 'Healthcare Information Technology': 18, 'Online Gifting': 19, 'Broadcasting, Radio and Television': 20, 'Social Network': 21, 'Data Analytics': 22, 'E-commerce Application': 23, 'Travel': 24, 'Software Development Applications': 25, 'Hospital and Clinical Service': 26, 'Home Improvements': 27, 'Music and Audio': 28, 'Transportation Tech': 29, 'Real Estate': 30, 'Alternative Energy': 31, 'Agriculture': 32, 'Construction': 33, 'Environmental Services': 34, '

In [6]:
# Take the training dataset and perform some operations to combine both descriptions into one column

subset_categories_df = full_categories_df.loc[:,('domain','rs_category')]
subset_categories_df['description'] = full_categories_df[['pb_desc','cb_desc']].fillna('').sum(axis=1)
print(subset_categories_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
domain         3000 non-null object
rs_category    3000 non-null object
description    3000 non-null object
dtypes: object(3)
memory usage: 70.4+ KB
None


In [7]:
# Now enumerate the categories, so that they can be used in a classification algorithm
subset_categories_df['labels'] = subset_categories_df['rs_category'].map(category_dict).astype(int)

In [8]:
subset_categories_df.head()


Unnamed: 0,domain,rs_category,description,labels
0,conferencecloud.co,Business Communication Application,Provider of an online conferencing platform. T...,0
1,terminus.com,Marketing Software and Service,Developer of a B2B advertising platform. The c...,1
2,galileoprocessing.com,Payment Application,Provider of payment processing services. The c...,2
3,pubble.co,Business Communication Application,Operator of a community engagement platform. T...,0
4,cajo.fi,Printing Technology,Developer of a stainless steel colour patterni...,3


In [9]:
# Now get the company name (assuming it is the domain name) by splitting the domain on '.' 
# store this in a new column called 'company'

subset_categories_df['company'] = subset_categories_df['domain'].str.split('.').str[0]
subset_categories_df.head()

Unnamed: 0,domain,rs_category,description,labels,company
0,conferencecloud.co,Business Communication Application,Provider of an online conferencing platform. T...,0,conferencecloud
1,terminus.com,Marketing Software and Service,Developer of a B2B advertising platform. The c...,1,terminus
2,galileoprocessing.com,Payment Application,Provider of payment processing services. The c...,2,galileoprocessing
3,pubble.co,Business Communication Application,Operator of a community engagement platform. T...,0,pubble
4,cajo.fi,Printing Technology,Developer of a stainless steel colour patterni...,3,cajo


In [10]:
### Write the company names to a txt file to be used by web scraper script
### Only do this once
import numpy as np
company_name_df = subset_categories_df.loc[:,('domain','company')]
np.savetxt('training_companies.txt', subset_categories_df.domain.values, fmt='%s')


In [11]:
# An example of where my theory breaks: searching 2020london returns nothing about the company
subset_categories_df.loc[subset_categories_df['domain'] == '2020london.com']

Unnamed: 0,domain,rs_category,description,labels,company
37,2020london.com,Marketing Software and Service,Provider of mobile marketing services. The com...,1,2020london


In [12]:
# Now split into training and testing data
num_samp = subset_categories_df.shape[0]
train_index = int(num_samp*2/3 - 1)
test_index = num_samp - 1

# Use these indexes found above to slice the necessary columns into text and training data (70/30 split)
features_train = subset_categories_df["description"][0:train_index].as_matrix()
features_test = subset_categories_df["description"][train_index+1:test_index].as_matrix()

labels_train = subset_categories_df["labels"][0:train_index].as_matrix()
labels_test = subset_categories_df["labels"][train_index+1:test_index].as_matrix()


# Note: If the index is needed, remove the as_matrix function at the end and the data will be stored as a pandas Series
# whcih retians the index (if you shuffle randomly)

In [13]:
features_train

array([ 'Provider of an online conferencing platform. The company provides an online platform which allows conducting of video conferences.ConferenceCloud provides state-of-the-art live communications and interactivity for hybrid conferences, lectures and meetings.',
       "Developer of a B2B advertising platform. The company enables B2B marketers to simplify account-based marketing to reach and engage targeted accounts across all stages of the buyer's journey.Terminus is a platform that seamlessly integrate salesforce CRM and build segments of best fit accounts.",
       'Provider of payment processing services. The company provides pre-paid visa, mastercard and bill payment processing services and also offers its clients business intelligence, analytics and program management services.Next generation card processing platform',
       ...,
       'Provider of a localized accounting platform. The company offers a localized industry-specific accounting platform for small and medium ent

In [14]:
## Now go through all the description columns and get count of each word --> used in other notebook and needed 
# a place to copy it to
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english')
temp_df = full_categories_df['pb_desc'].fillna('the')
X_train_counts = count_vect.fit_transform(temp_df)
print(X_train_counts.shape)
count_vect.vocabulary_

(3000, 6868)


{'provider': 4864,
 'online': 4200,
 'conferencing': 1363,
 'platform': 4588,
 'company': 1278,
 'provides': 4866,
 'allows': 261,
 'conducting': 1357,
 'video': 6607,
 'conferences': 1362,
 'developer': 1797,
 'b2b': 523,
 'advertising': 165,
 'enables': 2105,
 'marketers': 3702,
 'simplify': 5675,
 'account': 78,
 'based': 569,
 'marketing': 3703,
 'reach': 4972,
 'engage': 2130,
 'targeted': 6135,
 'accounts': 82,
 'stages': 5877,
 'buyer': 851,
 'journey': 3352,
 'payment': 4423,
 'processing': 4788,
 'services': 5575,
 'pre': 4687,
 'paid': 4332,
 'visa': 6633,
 'mastercard': 3714,
 'offers': 4170,
 'clients': 1158,
 'business': 845,
 'intelligence': 3209,
 'analytics': 291,
 'program': 4815,
 'management': 3670,
 'operator': 4214,
 'community': 1273,
 'engagement': 2132,
 'software': 5748,
 'new': 4067,
 'question': 4925,
 'answer': 316,
 'websites': 6733,
 'changes': 1040,
 'way': 6718,
 'interaction': 3217,
 'owner': 4312,
 'users': 6528,
 'continually': 1432,
 'enhancing': 214

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])
text_clf = text_clf.fit(features_train, labels_train)


In [16]:
# Evaluation of the model
import numpy as np
predicted = text_clf.predict(features_test)
#print(predicted)
np.mean(predicted == labels_test) 

0.24224224224224225

In [17]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english',ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=0.001, n_iter=5, random_state=42)),
])
text_clf_svm = text_clf_svm.fit(features_train, labels_train)

In [18]:
predicted = text_clf_svm.predict(features_test)
np.mean(predicted == labels_test)

0.52452452452452447

In [19]:
from sklearn import metrics
print(metrics.classification_report(labels_test, predicted,
    target_names=labels))

                                         precision    recall  f1-score   support

     Business Communication Application       0.50      0.14      0.22        21
         Marketing Software and Service       0.55      0.80      0.65        44
                    Payment Application       0.60      0.82      0.69        11
                    Printing Technology       0.55      0.86      0.67         7
                    Alternative Lending       0.47      0.94      0.62        16
                                Fashion       0.62      0.75      0.68        20
                                 EdTech       0.64      0.70      0.67        33
              Biotech & Pharmaceuticals       0.44      0.84      0.58        63
        Business Equipment and Supplies       0.00      0.00      0.00         1
                         Social Content       0.50      0.27      0.35        11
         Business Intelligence Software       0.71      0.48      0.57        21
Industrial Hardware, Suppli

  'precision', 'predicted', average, warn_for)


In [20]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [21]:
gs_clf = gs_clf.fit(features_train, labels_train)
gs_clf.predict(features_test)
#twenty_train.target_names[gs_clf.predict(['God is love'])[0]]



array([ 6,  8,  4,  9, 18, 21, 42, 27,  7, 12,  7, 43, 24, 16, 59, 43, 40,
       43, 48, 54, 41, 16, 14, 43, 21, 23, 22, 69, 54, 24,  1, 10,  7,  1,
        7,  7, 40, 27, 15,  3, 53, 12, 21,  7,  1, 45, 24, 55,  2, 27,  1,
       11, 12,  7, 22,  1,  1, 32,  7, 21, 59, 22, 29, 38, 10, 21,  4, 60,
       62, 34,  4, 62, 61, 18,  2, 21,  4, 18,  0,  9, 29,  7, 21, 71,  7,
       27,  1, 40,  3, 51,  7,  7, 40, 39, 62, 22, 51, 53, 21,  4, 44, 24,
       21, 22, 18, 15, 26,  6, 21, 45, 13, 16, 21, 21, 22, 25, 13,  1,  1,
       26, 12, 14, 24, 57,  1, 49, 49,  6, 39,  5, 38, 21, 16,  1, 74, 22,
       22, 43, 24, 11, 40,  7,  9, 13, 21,  7, 43,  7, 13,  7,  7, 23,  1,
       10,  1, 49, 21,  6, 45,  7, 18, 40, 14,  6, 16, 29,  1, 25, 21,  7,
        7,  0, 33, 46,  4, 13, 22, 21, 40,  1,  1,  7,  7, 23, 18, 22, 22,
       21, 15, 29, 25, 51,  4, 63, 68, 60,  7, 40,  8,  7,  5, 18,  5, 24,
       27, 22, 13,  9, 10, 61, 28, 53, 21, 15, 69, 17, 15,  7, 40, 15, 22,
       25, 55, 10, 14, 15

In [22]:
print(gs_clf.best_score_)                                 

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.473236618309
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [23]:
import pandas as pd
# Have a look at the train_pairs data
# Not going to use this data right away, will need later
# If needed, here is code to look at the raw data
#raw_df = pd.read_csv("raw_data_fixed.csv", encoding = "ISO-8859-1",\
#                usecols = ["domain","tx_industry", "cb_category", "tx_category", "cb_desc", "pb_desc", "pb_category"])
#raw_df.head()
train_pairs_df = pd.read_csv("data/company_pairs_training.csv")
print(train_pairs_df.shape)
train_pairs_df.head()

(3000, 17)


Unnamed: 0,domain1,domain2,rating,tx_industry1,cb_category1,tx_category1,pb_desc1,cb_desc1,pb_industry1,pb_category1,tx_industry2,cb_category2,tx_category2,pb_desc2,cb_desc2,pb_industry2,pb_category2
0,stockflare.com,sigfig.com,2,"Fintech,Fintech,Enterprise Applications",analytics|big data|personal finance,"Mobile Finance,Investment Tech,Data as a Service",Operator of an online platform for stock manag...,Stockflare helps anyone find new stock ideas. ...,"FinTech, SaaS",Information Services (B2C),"Fintech,Fintech",accounting|finance|financial services,"Consumer Finance,Robo Advisors",Provider of an investment services and wealth ...,SigFig is the easiest way to manage your money...,"FinTech, SaaS",Financial Software
1,2trg.com,2u.com,3,,Provider of waste recycling services. The comp...,Environmental Services (B2B),CleanTech,,,Environmental Services,"Edtech,Enterprise Applications,Edtech",Provider of online higher education services. ...,Educational Software,SaaS,2U is a cloud-based Software-as-a-Service plat...,cloud computing|edtech|education|software,EdTech
2,ntsretail.com,iqmetrix.com,1,,,,,NTS Retail provides high-end retail management...,,,Enterprise Applications,customer service|digital signage|point of sale...,RetailTech,,iQmetrix is an IT and service company that pro...,,
3,hootsuite.com,hearsaysystems.com,2,"AdTech,Enterprise Applications,Enterprise Appl...",advertising|apps|brand marketing|messaging|soc...,"MarketingTech,SaaS,Customer Service Software,H...",Provider of a social media management system. ...,Hootsuite is the world's most widely used plat...,SaaS,Media and Information Services (B2B),,digital marketing|financial services|marketing...,,,Hearsay offers the complete client engagement ...,,
4,mozenda.com,dexi.io,3,Enterprise Infrastructure,analytics|business intelligence,Open Source,Provider of data software program that gathers...,Mozenda automates data collection from the web...,"Big Data, Internet of Things, SaaS",Automation/Workflow Software,"Enterprise Applications,Enterprise Applications",analytics|big data|data mining|software,"Business Intelligence,SaaS",,"Extract, Enrich & Connect ANY data. Web data e...",,
