In [1]:
# Importing libraries
import pickle
import itertools
import string
import re
import numpy as np
import pandas as pd
import multiprocessing
from collections import Counter
from scipy.sparse import csr_matrix
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.models import Word2Vec 
from gensim.models.keyedvectors import KeyedVectors
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sn
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')

### Load the preprocessed data from the data_directory 

In [2]:
#!pip install testfixtures 

In [3]:
# Path to the preprocessed data set. Despite the variable name this is a
# file path (a .csv file), not a directory.
data_directory = "Generated_Files/data_after_preprocessing.csv"

### We divide the data into 3 groups:
* Group 1: full data
* Group 2: data with four large categories which have more than 1000 companies each
* Group 3: seven categories of data, with the same (small) number of companies in each category

### Pass 1, 2 or 3 as the input parameter of the function selectGroup to select the relevant data for the experiment

In [4]:
# read the data from directory, then select the group 
# of data we want to process.
def selectGroup(directory, group_nr, random_state=None):
    """Read the preprocessed data and return the requested experiment group.

    Parameters
    ----------
    directory : str
        Path of the tab-separated data file; it must contain a 'Category' column.
    group_nr : int
        1 -> the full data set, unchanged.
        2 -> 1041 rows sampled (without replacement) from each of four categories.
        3 -> 219 rows sampled (without replacement) from each of seven categories.
    random_state : optional
        Forwarded to every ``DataFrame.sample`` call so that groups 2 and 3 can
        be reproduced. The default ``None`` keeps the sampling random.

    Returns
    -------
    pandas.DataFrame
        The full data for group 1, otherwise the shuffled concatenation of the
        per-category samples.

    Raises
    ------
    ValueError
        If ``group_nr`` is not 1, 2 or 3, or (raised by pandas) if a category
        holds fewer rows than the requested sample size.
    """
    # group number -> (categories to keep, rows sampled per category)
    balanced_groups = {
        2: (
            [
                'HEALTHCARE GROUP',
                'BUSINESS & FINANCIAL SERVICES',
                'CONSUMER SERVICES GROUP',
                'INFORMATION TECHNOLOGY GROUP',
            ],
            1041,
        ),
        3: (
            [
                'HEALTHCARE GROUP',
                'BUSINESS & FINANCIAL SERVICES',
                'CONSUMER SERVICES GROUP',
                'INFORMATION TECHNOLOGY GROUP',
                'INDUSTRIAL GOODS & MATERIALS GROUP',
                'CONSUMER GOODS GROUP',
                'ENERGY & UTILITIES GROUP',
            ],
            219,
        ),
    }
    # Fail loudly on an unknown group instead of silently returning None.
    if group_nr != 1 and group_nr not in balanced_groups:
        raise ValueError("group_nr must be 1, 2 or 3, got %r" % (group_nr,))

    data = pd.read_csv(directory, sep='\t')
    if group_nr == 1:
        return data

    categories, rows_per_category = balanced_groups[group_nr]
    samples = [
        data[data['Category'] == category].sample(
            n=rows_per_category, replace=False, random_state=random_state)
        for category in categories
    ]
    # sample(frac=1) shuffles the rows so the categories are interleaved.
    return pd.concat(samples).sample(frac=1, random_state=random_state)

In [5]:
# Select and Split the data
data = selectGroup(data_directory, 1)
train, test = train_test_split(data, test_size=0.2, random_state=42)
# Website names in train-then-test order. pd.concat replaces Series.append,
# which was removed in pandas 2.0.
Web = pd.concat([train['Web'], test['Web']])

### Process the data and generate vectors through different methods - Doc2Vec, TF-IDF, Word2Vec

In [6]:
# Generate vectors from Doc2Vec
# Load the pickled doc2vec model.
filename = 'Generated_Files/doc2vec_model.sav'
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# load model files that come from a trusted source.
# The context manager closes the file handle once the model has been loaded.
with open(filename, 'rb') as model_file:
    new_model = pickle.load(model_file)
def tokenize_text(text):
    """Return the lowercased word tokens of ``text``.

    The text is split into sentences and each sentence into words with nltk;
    tokens shorter than two characters are dropped.
    """
    return [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
        if len(word) >= 2
    ]
def _to_tagged_document(row):
    """Wrap one data row as a TaggedDocument: tokenized 'clean' text, tagged with its Category."""
    return TaggedDocument(words=tokenize_text(row['clean']), tags=[row.Category])


# Build the tagged documents for both splits with the same row converter.
train_tagged = train.apply(_to_tagged_document, axis=1)
test_tagged = test.apply(_to_tagged_document, axis=1)
cores = multiprocessing.cpu_count()

In [7]:
# Infer vectors from doc2vec model
def get_vectors(model, tagged_docs):
    """Infer one vector per tagged document.

    Parameters
    ----------
    model : object
        Doc2Vec-like model exposing ``infer_vector(words, epochs=...)``.
    tagged_docs : pandas.Series
        Series of tagged documents; each element provides ``words`` and ``tags``.

    Returns
    -------
    (tuple, tuple)
        ``targets`` holds the first tag of every document and ``regressors``
        the inferred vectors, in the same order. Both are empty tuples when
        ``tagged_docs`` is empty.
    """
    targets = []
    regressors = []
    for doc in tagged_docs.values:
        targets.append(doc.tags[0])
        # `epochs` is the current name of the deprecated `steps` argument.
        regressors.append(model.infer_vector(doc.words, epochs=20))
    # Tuples keep the `X_train + X_test` concatenation used by callers working.
    return tuple(targets), tuple(regressors)
y_train, X_train = get_vectors(new_model, train_tagged)
print("Infer vector done for train data")
y_test, X_test = get_vectors(new_model, test_tagged)

# Train rows first, then test rows, for both the labels and the vectors.
y_doc2vec = y_train + y_test
doc2vec_rows = list(X_train + X_test)
X_doc2vec = csr_matrix(pd.DataFrame(doc2vec_rows))

Infer vector done for train data


In [8]:
# Generate vectors from TF-IDF
tfidf_settings = {
    'min_df': 5,
    'max_df': 0.95,
    'max_features': 8000,
    # 'ngram_range': (1, 2),
    'stop_words': 'english',
}
feature_extraction = TfidfVectorizer(**tfidf_settings)
# Features come from the 'clean' text column, labels from 'Category'.
X_tfidf = feature_extraction.fit_transform(data['clean'].values)
y_tfidf = data['Category'].values

In [11]:
#Generate vectors from Word2Vec
def tf_idf_func(df_document, max_features):
    """Fit a new English-stop-word TF-IDF vectorizer on ``df_document``.

    Returns the ``(score_matrix, vectorizer)`` pair: the fitted-and-transformed
    scores of ``df_document.values`` and the vectorizer that produced them.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_scores = vectorizer.fit_transform(df_document.values)
    return tfidf_scores, vectorizer

def get_top_keywords_with_frequence(Top_N, score_matrix, df_data, feature_extraction):
    """Find the top-scoring keywords per category and how many categories share each.

    Parameters
    ----------
    Top_N : int
        Number of highest-mean-score keywords kept per category.
    score_matrix : sparse matrix
        Document-by-feature scores; row ``i`` belongs to the ``i``-th row of
        ``df_data`` (by position).
    df_data : pandas.DataFrame
        Must contain a 'Category' column with one entry per document.
    feature_extraction : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or the older
        ``get_feature_names()``.

    Returns
    -------
    sortedDict : list of (keyword, count)
        For every keyword that is in the top ``Top_N`` of at least one
        category, the number of categories it appears in, sorted by count
        ascending.
    df_reshape : pandas.DataFrame
        Pivot with keywords as index and ('Score', category) columns.
    """
    scores = pd.DataFrame(np.asarray(score_matrix.todense()))
    # Assign by position, not by index label: df_data may carry a shuffled or
    # filtered index that does not match the 0..n-1 index of `scores`.
    scores['Category'] = df_data['Category'].to_numpy()
    category_means = scores.groupby('Category').mean()

    # get_feature_names() is gone from recent scikit-learn releases; fall back
    # to it only when get_feature_names_out() is not available.
    get_names = getattr(feature_extraction, 'get_feature_names_out', None)
    if get_names is None:
        get_names = feature_extraction.get_feature_names
    labels = np.asarray(get_names(), dtype=object)

    per_category = []
    keyword_counts = Counter()
    for category, mean_scores in category_means.iterrows():
        values = mean_scores.to_numpy(dtype=float)
        # Positions of the Top_N largest mean scores, in ascending score order.
        top_positions = np.argsort(values)[-Top_N:]
        top_labels = labels[top_positions]
        per_category.append(pd.DataFrame({
            'Category': category,
            'Top_N': top_labels,
            'Score': values[top_positions],
        }))
        keyword_counts.update(top_labels)

    # Concatenate once instead of growing a frame inside the loop.
    if per_category:
        df_top = pd.concat(per_category, ignore_index=True)
    else:
        df_top = pd.DataFrame(columns=['Category', 'Top_N', 'Score'])

    df_reshape = df_top.pivot(index='Top_N', columns='Category')
    sortedDict = sorted(keyword_counts.items(), key=lambda item: item[1])

    return sortedDict, df_reshape

def get_word_occurence_stopwordslist(max_occurence, dict_list):
    """Unzip (word, count) pairs and pick the words that occur too often.

    Returns three lists: all words, all counts (same order as ``dict_list``),
    and the words whose count is strictly greater than ``max_occurence``.
    """
    pairs = list(dict_list)
    word = [key for key, _ in pairs]
    occurence = [count for _, count in pairs]
    frequent_stopwords = [key for key, count in pairs if count > max_occurence]
    return word, occurence, frequent_stopwords
    
def remove_frequent_stopwords(sentences, frequent_stopwords):
    """Drop every whitespace-separated token found in ``frequent_stopwords``.

    The remaining tokens are re-joined with single spaces.
    """
    return ' '.join(
        token for token in sentences.split() if token not in frequent_stopwords
    )

def remove_frequent_stopwords_and_get_updated_tfidfscore(data, feature_extraction, top_n, frequent_stopwords):
    """Strip the frequent stop words from data['clean'] and re-fit the vectorizer.

    ``feature_extraction`` is re-fitted in place on the cleaned documents and
    the resulting score matrix is returned. ``top_n`` is accepted but not used.
    """
    def strip_stopwords(document):
        return remove_frequent_stopwords(document, frequent_stopwords)

    cleaned_documents = data['clean'].apply(strip_stopwords)
    return feature_extraction.fit_transform(cleaned_documents.values)

# Pre-trained word vectors, loaded from the binary word2vec file.
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

# TF-IDF over the cleaned text, limited to 8000 features.
score_matrix, feature_extraction = tf_idf_func(data['clean'], 8000)

# Top 50 keywords per category; keywords found in more than one category are
# collected as frequent stop words.
sortedDict, df_reshape = get_top_keywords_with_frequence(
    50, score_matrix, data, feature_extraction)
word, occurence, frequent_stopwords = get_word_occurence_stopwordslist(1, sortedDict)

# Re-fit the vectorizer on the text with those stop words removed.
score_matrix_update = remove_frequent_stopwords_and_get_updated_tfidfscore(
    data, feature_extraction, 10, frequent_stopwords)
score_value = score_matrix_update.todense()

# Number of non-zero TF-IDF entries per document, flattened to one dimension.
nonzero_per_row = np.count_nonzero(score_value, axis=1)
website_word_count = np.asarray(nonzero_per_row).reshape(-1)

df_score = pd.DataFrame(score_value)
# get_feature_names() is gone from recent scikit-learn releases; fall back to
# it only when get_feature_names_out() is not available.
_get_feature_names = getattr(feature_extraction, 'get_feature_names_out', None)
if _get_feature_names is None:
    _get_feature_names = feature_extraction.get_feature_names
df_score.columns = _get_feature_names()
# Keep only documents with more than 200 non-zero TF-IDF entries.
df_score['Keep'] = website_word_count > 200
category_temp = data['Category'].reset_index(drop=True)
df_score['Category'] = category_temp
df_score['Web'] = data['Web'].reset_index(drop=True)
df_score_valid = df_score[df_score['Keep']]
df_final = df_score_valid[df_score_valid.columns.difference(['Web', 'Keep', 'Category'])]
top_n = 100
# One row per kept document holding its top_n highest-scoring feature names.
# Iterating the rows directly avoids re-transposing df_final for every document.
df_top_N = pd.DataFrame({n: row.nlargest(top_n).index.tolist()
                  for n, (_, row) in enumerate(df_final.iterrows())}).T

# Re-index to 0..n-1 so the labels line up with the rows of df_top_N.
df_category = df_score_valid['Category'].reset_index(drop=True)
df_web = df_score_valid['Web'].reset_index(drop=True)
df_top_N['Category'] = df_category
df_top_N['Web'] = df_web

def get_vector_from_df(df, keyed_vectors=None, n_words=None):
    """Average the word vectors of each row's keywords.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document. Every column except 'Category' and 'Web' is
        treated as a keyword column, in the frame's column order.
    keyed_vectors : optional
        Word-vector lookup supporting ``keyed_vectors[word]`` and raising
        ``KeyError`` for unknown words. An object exposing ``.wv`` is also
        accepted. Defaults to the notebook-level ``model``.
    n_words : int, optional
        Number of leading keyword columns to use per row. Defaults to the
        notebook-level ``top_n``; capped at the number of keyword columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), vector_size)``. Each row is the mean of the
        vectors of the keywords that were found; a row with no known keyword
        (or whose mean contains NaN) is left as all zeros.
    """
    if keyed_vectors is None:
        keyed_vectors = model
    if n_words is None:
        n_words = top_n
    # Accept either a bare vector table or a model that exposes one as `.wv`.
    keyed_vectors = getattr(keyed_vectors, 'wv', keyed_vectors)
    vector_size = getattr(keyed_vectors, 'vector_size', 300)

    # Exclude the label columns explicitly rather than relying on column order.
    word_columns = [col for col in df.columns if col not in ('Category', 'Web')]
    x_word = df[word_columns].to_numpy()
    n_words = min(n_words, x_word.shape[1])

    x = np.zeros([len(x_word), vector_size])
    for i, row_words in enumerate(x_word):
        total = np.zeros(vector_size)
        found = 0
        for word in row_words[:n_words]:
            # Missing entries (e.g. NaN padding) are not words; skip them.
            if not isinstance(word, str):
                continue
            try:
                total = total + keyed_vectors[word]
            except KeyError:
                continue
            found += 1
        if found:
            mean_vector = total / found
            if not np.isnan(np.sum(mean_vector)):
                x[i] = mean_vector
    return x

# Dense averaged word vectors with any NaN replaced by 0.
word2vec_dense = np.nan_to_num(get_vector_from_df(df_top_N))
y_word2vec = df_top_N['Category'].to_numpy()
X_word2vec = csr_matrix(pd.DataFrame(list(word2vec_dense)))

In [12]:
# Store the website names that belong to each method's vectors.
# TF-IDF was fitted on `data`, so its names come straight from data['Web'].
Website_tfidf = data['Web'].values
# Word2Vec only covers the rows kept in df_top_N.
Website_word2vec = df_top_N['Web'].values
# Doc2Vec names come from `Web`; unlike the two above this stays a pandas
# Series, re-indexed to 0..n-1.
Website_doc2vec = Web.reset_index(drop=True)

### Define the recommend function based on cosine similarity

In [13]:
def recommend(Input_Company, top_k, X_vector, y_vector, Website):
    """Print the ``top_k`` companies most similar to ``Input_Company``.

    Parameters
    ----------
    Input_Company : str
        Website name of the query company; its first occurrence in
        ``Website`` is used.
    top_k : int
        Number of recommendations to print.
    X_vector : 2-D array or sparse matrix
        One feature row per company.
    y_vector : sequence
        Category label per company, indexable by integer position.
    Website : array-like
        Website name per company, in the same order as ``X_vector``.

    Raises
    ------
    ValueError
        If ``Input_Company`` does not occur in ``Website``.

    Notes
    -----
    Companies are ranked by the unrounded cosine similarity; the value is
    rounded to two decimals only for printing. The query company itself is
    excluded by position, so it never appears in its own recommendations.
    """
    websites = np.asarray(Website)
    matches = np.flatnonzero(websites == Input_Company)
    if matches.size == 0:
        raise ValueError("Input company %r was not found in Website" % (Input_Company,))
    query_index = int(matches[0])

    # Slice (rather than index) so the query row stays 2-D for dense arrays
    # as well as for sparse matrices.
    query = X_vector[query_index:query_index + 1]
    # One call computes the similarity against every row at once.
    similarity = np.asarray(cosine_similarity(query, X_vector), dtype=float).ravel()

    # Descending similarity; the stable sort keeps the original row order for ties.
    ranked = np.argsort(-similarity, kind='stable')
    ranked = ranked[ranked != query_index][:max(top_k, 0)]
    for i in ranked:
        print("Website: ", websites[i], " Category:", y_vector[i], " Similarity:", np.round(similarity[i], 2))

### Define the Inputs such as the number of companies to recommend and the input company name

In [14]:
# Number of companies to recommend and the company to start from.
top_k = 5
Input_Company = "www.sbamerica.com"
# Show the data row(s) of the chosen company.
data.loc[data['Web'] == Input_Company]

Unnamed: 0.1,Unnamed: 0,Web,Category,IndustrySegment,content,clean,size
5170,5519,www.sbamerica.com,CONSUMER GOODS GROUP,Food and Beverage,"Specialty Brands of America, Inc. [""SBA""] was ...",specialty brands america inc sba established b...,5886


### Recommend companies based on different methods - Doc2Vec, TF-IDF, Word2Vec

In [15]:
# Doc2Vec similarity
recommend(
    Input_Company,
    top_k,
    X_vector=X_doc2vec,
    y_vector=y_doc2vec,
    Website=Website_doc2vec,
)

Website:  www.celiant.com  Category: INFORMATION TECHNOLOGY GROUP  Similarity: 0.93
Website:  www.packetmotion.com  Category: INFORMATION TECHNOLOGY GROUP  Similarity: 0.93
Website:  www.huskietools.com  Category: INDUSTRIAL GOODS & MATERIALS GROUP  Similarity: 0.93
Website:  www.multiwavenetworks.com  Category: INFORMATION TECHNOLOGY GROUP  Similarity: 0.93
Website:  www.appbackr.com  Category: BUSINESS & FINANCIAL SERVICES  Similarity: 0.93


In [16]:
# TF-IDF similarity
recommend(
    Input_Company,
    top_k,
    X_vector=X_tfidf,
    y_vector=y_tfidf,
    Website=Website_tfidf,
)

Website:  www.republicind.com  Category: CONSUMER GOODS GROUP  Similarity: 0.35
Website:  www.sirkensingtons.com  Category: CONSUMER GOODS GROUP  Similarity: 0.23
Website:  www.wholesomesweeteners.com  Category: CONSUMER GOODS GROUP  Similarity: 0.21
Website:  www.bakewisebrands.com  Category: CONSUMER GOODS GROUP  Similarity: 0.15
Website:  www.pure360.com  Category: BUSINESS & FINANCIAL SERVICES  Similarity: 0.15


In [17]:
# Word2Vec similarity
recommend(
    Input_Company,
    top_k,
    X_vector=X_word2vec,
    y_vector=y_word2vec,
    Website=Website_word2vec,
)

Website:  www.thanasi.com  Category: CONSUMER SERVICES GROUP  Similarity: 0.92
Website:  www.mainstreetgourmet.com  Category: CONSUMER GOODS GROUP  Similarity: 0.91
Website:  www.bellisiofoods.com  Category: CONSUMER GOODS GROUP  Similarity: 0.91
Website:  www.caesarspasta.com  Category: CONSUMER GOODS GROUP  Similarity: 0.9
Website:  www.jjsbakery.net  Category: CONSUMER GOODS GROUP  Similarity: 0.89


### Save the data processed by word2vec for GUI usage

In [19]:
# Persist the word2vec features, labels and website names for the GUI.
sparse.save_npz('Generated_Files/X_word2vec.npz', X_word2vec)
for target_path, array in (
    ('Generated_Files/y_word2vec.npy', y_word2vec),
    ('Generated_Files/Website_word2vec.npy', Website_word2vec),
):
    np.save(target_path, array)