# Libraries

In [54]:
import os
import operator
import functools

import numpy as np
import pandas as pd

from sklearn.manifold import TSNE, smacof
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline, make_union
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation, PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.metrics import pairwise_distances

import holoviews as hv
import hvplot.pandas

from gensim import sklearn_api
from gensim.models import Word2Vec

# Data

In [178]:
# Load the Bloomberg metadata and keep only rows whose CODE contains the
# literal text ' SJ Equity'. `na=False` drops rows with a missing CODE instead
# of failing the boolean mask, and `.copy()` detaches the filtered frame so the
# column assignments below never write into a slice of the raw frame.
description_path = os.path.join('..','data','Bloomberg_Meta.csv')
description = pd.read_csv(description_path)
description = description.loc[description.CODE.str.contains(' SJ Equity', regex=False, na=False),:].copy()

# Normalise the free text: lower-case it, blank out every character that is
# not a-z, then remove English stop words.
# The stop words are matched on word boundaries. A pattern of the form
# ' word | word ' consumes the spaces around each hit, so the second of two
# adjacent stop words (and any stop word at the start or end of the text)
# would be left in place.
# `regex=True` is spelled out because the default of `str.replace` differs
# between pandas versions.
stop_word_pattern = r'\b(?:' + '|'.join(sorted(ENGLISH_STOP_WORDS)) + r')\b'
description['DESCRIPTION'] = (
    description.DESCRIPTION
    .fillna('')  # a missing description becomes an empty document
    .str.lower()
    .str.replace('[^a-z]', ' ', regex=True)
    .str.replace(stop_word_pattern, ' ', regex=True)
)

# Short display label: the first three remaining tokens of each description.
description['NAME'] = description.DESCRIPTION.apply(lambda s: ' '.join(s.split()[0:3]))

# Renumber rows 0..n-1 so that index-aligned assignments from this frame line
# up with freshly built frames; the previous index is kept as an `index` column.
description = description.reset_index()

# Models

## LSI (Latent Semantic Indexing)

In [137]:
# LSI: TF-IDF term weights reduced to 100 components with truncated SVD.
# The SVD seed is pinned so that a Restart & Run All reproduces the same
# embedding (the default solver is randomized).
LSI = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=100, random_state=0))
lsi = LSI.fit_transform(description.DESCRIPTION)

# Pairwise Euclidean distances between the embedded descriptions.
lsi_d = pairwise_distances(lsi, metric='euclidean', n_jobs=-1)

In [199]:
# Two 2-D layouts of the LSI distance matrix, each labelled with the company
# NAME taken from `description` (aligned on the row index).
# - t-SNE: `init='random'` is stated explicitly because a precomputed distance
#   matrix cannot be PCA-initialised, and newer scikit-learn releases default
#   to `init='pca'`.
# - Both embeddings are seeded so the figures are reproducible.
lsi_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(lsi_d),
    columns=['x','y'],
)
lsi_tsne['companies'] = description['NAME']

# smacof returns a tuple; element 0 is the embedded coordinates.
lsi_smacof = pd.DataFrame(smacof(lsi_d, random_state=0)[0], columns=['x','y'])
lsi_smacof['companies'] = description['NAME']

In [255]:
%%opts Scatter [width=600 height=400 tools=['hover']] (size=5)
hv.Scatter(lsi_tsne, vdims=['x', 'companies'],kdims=['y'], label='LSI TSNE')*functools.reduce(operator.mul,lsi_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1)) + \
hv.Scatter(lsi_smacof, vdims=['x', 'companies'],kdims=['y'], label='LSI SMACOF')*functools.reduce(operator.mul,lsi_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

## LDA (Latent Dirichlet Allocation)

In [140]:
# LDA: raw term counts mapped to a 100-topic mixture per description.
# The topic model is seeded so that a Restart & Run All reproduces the same
# topic vectors.
LDA = make_pipeline(CountVectorizer(), LatentDirichletAllocation(n_components=100, random_state=0))
lda = LDA.fit_transform(description.DESCRIPTION)

# Pairwise Euclidean distances between the per-description topic vectors.
lda_d = pairwise_distances(lda, metric='euclidean', n_jobs=-1)



In [201]:
# Two 2-D layouts of the LDA distance matrix, each labelled with the company
# NAME taken from `description` (aligned on the row index).
# - t-SNE: `init='random'` is stated explicitly because a precomputed distance
#   matrix cannot be PCA-initialised, and newer scikit-learn releases default
#   to `init='pca'`.
# - Both embeddings are seeded so the figures are reproducible.
lda_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(lda_d),
    columns=['x','y'],
)
lda_tsne['companies'] = description['NAME']

# smacof returns a tuple; element 0 is the embedded coordinates.
lda_smacof = pd.DataFrame(smacof(lda_d, random_state=0)[0], columns=['x','y'])
lda_smacof['companies'] = description['NAME']

In [225]:
# NOTE(review): this cell held only the truncated fragment "=1)", which is a
# SyntaxError and stops Restart & Run All. The original statement cannot be
# recovered from the fragment, so it is replaced by this comment; delete the
# cell if nothing is missing.

In [251]:
%%opts Scatter [width=600 height=400 tools=['hover']] (size=5)
hv.Scatter(lda_tsne, vdims=['x', 'companies'],kdims=['y'], label='LDA TSNE')*functools.reduce(operator.mul,lda_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1)) + \
hv.Scatter(lda_smacof, vdims=['x', 'companies'],kdims=['y'], label='LDA SMACOF')*functools.reduce(operator.mul,lda_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

In [None]:
# Word2Vec: represent each company as the TF-IDF-weighted sum of the word
# vectors of the terms in its description.
word2vec_model =  Word2Vec(description['DESCRIPTION'].str.split().tolist(), size=100, window=5, min_count=1, workers=6, iter=100)
Word2Vec_TFIDF = TfidfVectorizer()
word2vec_tfidf = Word2Vec_TFIDF.fit_transform(description['DESCRIPTION'])

# One row per TF-IDF feature, in TF-IDF column order.
vocab = pd.DataFrame(Word2Vec_TFIDF.get_feature_names(), columns=['vocab'])

# Row i of `wv.vectors` belongs to `wv.index2word[i]`. The key order of the
# `wv.vocab` dict is not guaranteed to match that row order, so labelling the
# rows with `wv.vocab.keys()` can attach vectors to the wrong words.
vectors = pd.DataFrame(data=word2vec_model.wv.vectors, index=word2vec_model.wv.index2word, dtype='f8')

# Look up the vector of every TF-IDF term. The left join keeps the TF-IDF
# ordering, so row k of `words` is the vector for TF-IDF column k.
words = vocab.merge(vectors, how='left', left_on='vocab', right_index=True).drop('vocab', axis=1)

# (documents x terms) . (terms x vector size) -> one weighted vector per company.
companies = pd.DataFrame(word2vec_tfidf.todense()).dot(words)

# Pairwise Euclidean distances between the company vectors.
word2vec_d = pairwise_distances(companies, metric='euclidean', n_jobs=-1)

In [203]:
# Two 2-D layouts of the Word2Vec distance matrix, each labelled with the
# company NAME taken from `description` (aligned on the row index).
# - t-SNE: `init='random'` is stated explicitly because a precomputed distance
#   matrix cannot be PCA-initialised, and newer scikit-learn releases default
#   to `init='pca'`.
# - Both embeddings are seeded so the figures are reproducible.
word2vec_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(word2vec_d),
    columns=['x','y'],
)
word2vec_tsne['companies'] = description['NAME']

# smacof returns a tuple; element 0 is the embedded coordinates.
word2vec_smacof = pd.DataFrame(smacof(word2vec_d, random_state=0)[0], columns=['x','y'])
word2vec_smacof['companies'] = description['NAME']

In [248]:
%%opts Scatter [width=600 height=400 tools=['hover']] (size=5)
hv.Scatter(word2vec_tsne, vdims=['x', 'companies'],kdims=['y'], label='Word2Vec TSNE')*functools.reduce(operator.mul,word2vec_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1)) + \
hv.Scatter(word2vec_smacof, vdims=['x', 'companies'],kdims=['y'], label='Word2Vec SMACOF')*functools.reduce(operator.mul,word2vec_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

## Doc2Vec (paragraph vectors)

In [None]:
# Doc2Vec: learn one 100-dimensional vector per description through gensim's
# scikit-learn wrapper, training for 100 iterations on the whitespace-split
# token lists.
Doc2Vec = sklearn_api.d2vmodel.D2VTransformer(
    size=100,
    iter=100,
)
doc2vec = Doc2Vec.fit_transform(
    X=description['DESCRIPTION'].str.split().tolist(),
)

# Pairwise Euclidean distances between the document vectors.
doc2vec_d = pairwise_distances(
    doc2vec,
    metric='euclidean',
    n_jobs=-1,
)

In [196]:
# Two 2-D layouts of the Doc2Vec distance matrix, each labelled with the
# company NAME taken from `description` (aligned on the row index).
# - t-SNE: `init='random'` is stated explicitly because a precomputed distance
#   matrix cannot be PCA-initialised, and newer scikit-learn releases default
#   to `init='pca'`.
# - Both embeddings are seeded so the figures are reproducible.
doc2vec_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(doc2vec_d),
    columns=['x','y'],
)
doc2vec_tsne['companies'] = description['NAME']

# smacof returns a tuple; element 0 is the embedded coordinates.
doc2vec_smacof = pd.DataFrame(smacof(doc2vec_d, random_state=0)[0], columns=['x','y'])
doc2vec_smacof['companies'] = description['NAME']

In [245]:
%%opts Scatter [width=600 height=400 tools=['hover']] (size=5)
hv.Scatter(doc2vec_tsne, vdims=['x', 'companies'],kdims=['y'], label='Doc2Vec TSNE')*functools.reduce(operator.mul,doc2vec_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1)) + \
hv.Scatter(doc2vec_smacof, vdims=['x', 'companies'],kdims=['y'], label='Doc2Vec SMACOF')*functools.reduce(operator.mul,doc2vec_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))