# Libraries

In [7]:
import os
import operator
import functools

import numpy as np
import pandas as pd

from sklearn.manifold import TSNE, smacof
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline, make_union
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation, PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.metrics import pairwise_distances

import holoviews as hv
import hvplot.pandas

from gensim import sklearn_api
from gensim.models import Word2Vec

In [8]:
# Activate the Bokeh plotting backend for all HoloViews output in this notebook.
hv.extension('bokeh')
# Disabled: automatic archiving of displayed HoloViews output.
#hv.archive.auto()

# Data

In [9]:
# Load the Bloomberg metadata and keep only rows whose CODE contains
# ' SJ Equity'. `.copy()` yields an independent frame so the column
# assignments below do not write into a slice of the raw data.
description_path = os.path.join('..','data','Bloomberg_Meta.csv')
description = pd.read_csv(description_path)
description = description.loc[description.CODE.str.contains(' SJ Equity'),:].copy()

# Whole-word stop-word pattern. Joining the words with ' | ' left the first
# and last alternatives unanchored (so they could match inside other words)
# and consumed the separating space, so the second of two adjacent stop
# words was never removed. `\b` anchors every alternative without consuming
# the surrounding whitespace.
stop_word_pattern = r'\b(?:' + '|'.join(sorted(ENGLISH_STOP_WORDS)) + r')\b'

# Normalise the text: lower-case, turn every non-letter into a space, then
# blank out stop words. `regex=True` is explicit because the default of
# `Series.str.replace` differs between pandas versions.
description['DESCRIPTION'] = (
    description['DESCRIPTION']
    .str.lower()
    .str.replace('[^a-z]', ' ', regex=True)
    .str.replace(stop_word_pattern, ' ', regex=True)
)

# Short display label: the first three remaining words of each description.
description['NAME'] = description.DESCRIPTION.apply(lambda s: ' '.join(s.split()[0:3]))

# Move the original row labels into an 'index' column so the frame has a
# 0..n-1 index that lines up with the embedding frames built later.
description = description.reset_index()

# Models

## LSI

In [4]:
# Latent semantic indexing: TF-IDF features reduced to 100 components with
# truncated SVD, followed by the Euclidean distance between every pair of
# company descriptions in that reduced space.
LSI = make_pipeline(
    TfidfVectorizer(),
    TruncatedSVD(n_components=100),
)
lsi = LSI.fit_transform(description.DESCRIPTION)
lsi_d = pairwise_distances(lsi, metric='euclidean', n_jobs=-1)

In [5]:
# 2-D t-SNE layout of the LSI distance matrix. `init='random'` is stated
# explicitly because PCA initialisation cannot be combined with
# metric='precomputed'; `random_state` pins the otherwise random layout so a
# re-run reproduces the same figure.
lsi_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(lsi_d),
    columns=['x','y'],
)
lsi_tsne['companies'] = description['NAME']

# 2-D multidimensional-scaling (SMACOF) layout of the same distances;
# element [0] of the result is the coordinate array.
lsi_smacof = pd.DataFrame(smacof(lsi_d, random_state=0)[0], columns=['x','y'])
lsi_smacof['companies'] = description['NAME']

In [6]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/LSI TSNE" fig="png"
hv.Scatter(lsi_tsne, vdims=['x', 'companies'],kdims=['y'], label='LSI TSNE')*functools.reduce(operator.mul,lsi_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

In [7]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/LSI SMACOF" fig="png"
hv.Scatter(lsi_smacof, vdims=['x', 'companies'],kdims=['y'], label='LSI SMACOF')*functools.reduce(operator.mul,lsi_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

## LDA

In [9]:
# Latent Dirichlet allocation: raw term counts mapped onto 100 topics,
# followed by the Euclidean distance between every pair of company
# descriptions in topic space.
LDA = make_pipeline(
    CountVectorizer(),
    LatentDirichletAllocation(n_components=100),
)
lda = LDA.fit_transform(description.DESCRIPTION)
lda_d = pairwise_distances(lda, metric='euclidean', n_jobs=-1)



In [10]:
# 2-D t-SNE layout of the LDA distance matrix. `init='random'` is stated
# explicitly because PCA initialisation cannot be combined with
# metric='precomputed'; `random_state` pins the otherwise random layout so a
# re-run reproduces the same figure.
lda_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(lda_d),
    columns=['x','y'],
)
lda_tsne['companies'] = description['NAME']

# 2-D multidimensional-scaling (SMACOF) layout of the same distances;
# element [0] of the result is the coordinate array.
lda_smacof = pd.DataFrame(smacof(lda_d, random_state=0)[0], columns=['x','y'])
lda_smacof['companies'] = description['NAME']

In [374]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/LDA TSNE" fig="png"
hv.Scatter(lda_tsne, vdims=['x', 'companies'],kdims=['y'], label='LDA TSNE')*functools.reduce(operator.mul,lda_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

In [375]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/LDA SMACOF" fig="png"
hv.Scatter(lda_smacof, vdims=['x', 'companies'],kdims=['y'], label='LDA SMACOF')*functools.reduce(operator.mul,lda_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

## Word2Vec

In [25]:
# def Association(doc, vocab, idf):
#     model = Word2Vec(sentences=[doc.split()], min_count=1, workers=3, iter=100)
#     vectors = pd.DataFrame(data=model.wv.vectors, index=model.wv.vocab.keys(), dtype='f8')
#     words = vocab.merge(vectors, how='left', left_on='vocab', right_index=True).drop('vocab', axis=1)
    
#     companies = pd.DataFrame(idf).apply(lambda x: pd.Series(np.multiply(x.values.reshape(-1,1),words).sum(0)), axis=1)
#     distances = pairwise_distances(companies, metric='cosine', n_jobs=-1)
    
#     return distances
def Association(doc, vocab, idf):
    """Cosine distances between companies embedded with TF-IDF-weighted Word2Vec vectors.

    A Word2Vec model is trained on ``doc``; each company is then represented
    by the sum of the word vectors weighted by that company's row of ``idf``.

    Parameters
    ----------
    doc : iterable of token lists
        Passed unchanged to ``Word2Vec`` as ``sentences``.
    vocab : pandas.DataFrame
        Must have a ``'vocab'`` column with one word per row. Row ``k`` has
        to correspond to column ``k`` of ``idf``.
    idf : 2-D array-like, shape (n_companies, n_words)
        Per-company word weights.

    Returns
    -------
    numpy.ndarray, shape (n_companies, n_companies)
        Pairwise cosine distances between the company vectors.
    """
    # `iter=` and `wv.vocab` are the gensim parameter/attribute names this
    # notebook was written against.
    model = Word2Vec(sentences=doc, min_count=1, workers=3, iter=100)
    vectors = pd.DataFrame(data=model.wv.vectors, index=model.wv.vocab.keys(), dtype='f8')

    # Left-merge keeps one row per `vocab` word, in `vocab` order; words the
    # Word2Vec model never saw come back as all-NaN rows.
    words = vocab.merge(vectors, how='left', left_on='vocab', right_index=True).drop('vocab', axis=1)

    # Weighted sum of word vectors for every company as a single matrix
    # product. Filling NaN with 0 reproduces the NaN-skipping sum of the
    # previous row-by-row `DataFrame.apply`, without a Python-level loop.
    companies = np.asarray(idf) @ words.fillna(0).values

    return pairwise_distances(companies, metric='cosine', n_jobs=1)

In [26]:
# Re-read the raw metadata for Word2Vec training. The filter must be applied
# to the freshly loaded frame: the mask is built from `w_description`, and
# the cleaned `description` frame has a different index and no longer
# contains the '.' characters the sentence split in the next cell relies on.
w_description_path = os.path.join('..','data','Bloomberg_Meta.csv')
w_description = pd.read_csv(w_description_path)
w_description = w_description.loc[w_description.CODE.str.contains(' SJ Equity'),:]

In [27]:
# Turn the raw descriptions into Word2Vec training sentences: lower-case,
# replace everything except letters and '.' with a space, then split each
# description on '.' into sentences.
# `regex=True` is explicit because the default of `Series.str.replace`
# differs between pandas versions; with a literal match the character class
# would never match anything.
w_sentences = (
    w_description.loc[:, 'DESCRIPTION']
    .str.lower()
    .str.replace('[^a-z.]', ' ', regex=True)
    .str.split('.')
)

# Flatten to one token list per sentence across all companies. A single
# comprehension replaces `Series.sum()` over lists, which re-copies the
# growing list on every addition.
# NOTE: this rebinds `w_description` from a DataFrame to a list, so the cell
# can only be run once after the load cell above it.
w_description = [sentence.split() for company in w_sentences for sentence in company]

In [28]:
# One-element Series holding the whole corpus as a single string. The
# descriptions are joined with a space: summing the column concatenates the
# strings back to back, which can fuse the last word of one description
# with the first word of the next.
docs = pd.Series(' '.join(description['DESCRIPTION']))

# TF-IDF weight of every vocabulary word for every company (dense matrix,
# one row per company).
Word2Vec_TFIDF = TfidfVectorizer()
word2vec_tfidf = Word2Vec_TFIDF.fit_transform(description['DESCRIPTION']).todense()

# Vocabulary in TF-IDF column order, used to line word vectors up with the
# TF-IDF columns.
vocab = pd.DataFrame(Word2Vec_TFIDF.get_feature_names(), columns=['vocab'])

In [None]:
# Run Association once on the full sentence list: the list is wrapped in a
# one-element Series so `apply` makes a single call, and the result is
# unpacked from element 0 in the following cell.
# NOTE(review): `Association` is redefined with a different signature in the
# Portfolios section; this call needs the three-argument version.
word2vec_d = pd.Series([w_description]).apply(lambda doc: Association(doc, vocab, word2vec_tfidf))

In [30]:
# Unwrap the single Association result and hold it as a DataFrame.
# NOTE: rebinds `word2vec_d`, so this cell is not safe to run twice in a row.
word2vec_d = pd.DataFrame(word2vec_d[0])

In [31]:
# 2-D t-SNE layout of the Word2Vec distance matrix. `init='random'` is stated
# explicitly because PCA initialisation cannot be combined with
# metric='precomputed'; `random_state` pins the otherwise random layout so a
# re-run reproduces the same figure.
word2vec_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(word2vec_d),
    columns=['x','y'],
)
word2vec_tsne['companies'] = description['NAME']

# 2-D multidimensional-scaling (SMACOF) layout of the same distances;
# element [0] of the result is the coordinate array.
word2vec_smacof = pd.DataFrame(smacof(word2vec_d, random_state=0)[0], columns=['x','y'])
word2vec_smacof['companies'] = description['NAME']

In [76]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/Word2Vec TSNE" fig="png"
hv.Scatter(word2vec_tsne, vdims=['x', 'companies'],kdims=['y'], label='Word2Vec TSNE')*functools.reduce(operator.mul,word2vec_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

In [75]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/Word2Vec SMACOF" fig="png"
hv.Scatter(word2vec_smacof, vdims=['x', 'companies'],kdims=['y'], label='Word2Vec SMACOF')*functools.reduce(operator.mul,word2vec_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

## Doc2Vec

In [None]:
# Doc2Vec: learn one 100-dimensional vector per company description (each
# description is passed as a list of whitespace-separated tokens), then take
# the cosine distance between every pair of document vectors.
Doc2Vec = sklearn_api.d2vmodel.D2VTransformer(size=100, iter=100)
tokenised_descriptions = description['DESCRIPTION'].str.split().tolist()
doc2vec = Doc2Vec.fit_transform(X=tokenised_descriptions)
doc2vec_d = pairwise_distances(doc2vec, metric='cosine', n_jobs=-1)

In [127]:
# 2-D t-SNE layout of the Doc2Vec distance matrix. `init='random'` is stated
# explicitly because PCA initialisation cannot be combined with
# metric='precomputed'; `random_state` pins the otherwise random layout so a
# re-run reproduces the same figure.
doc2vec_tsne = pd.DataFrame(
    TSNE(metric='precomputed', init='random', random_state=0).fit_transform(doc2vec_d),
    columns=['x','y'],
)
doc2vec_tsne['companies'] = description['NAME']

# 2-D multidimensional-scaling (SMACOF) layout of the same distances;
# element [0] of the result is the coordinate array.
doc2vec_smacof = pd.DataFrame(smacof(doc2vec_d, random_state=0)[0], columns=['x','y'])
doc2vec_smacof['companies'] = description['NAME']

In [378]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/Doc2Vec TSNE" fig="png"

hv.Scatter(doc2vec_tsne, vdims=['x', 'companies'],kdims=['y'], label='Doc2Vec TSNE')*functools.reduce(operator.mul,doc2vec_tsne.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

In [379]:
%%opts Scatter [width=800 height=600 tools=['hover']] (size=5)
%%output filename="./media/Doc2Vec SMACOF" fig="png"
hv.Scatter(doc2vec_smacof, vdims=['x', 'companies'],kdims=['y'], label='Doc2Vec SMACOF')*functools.reduce(operator.mul,doc2vec_smacof.sample(frac=0.25).apply(lambda x: hv.Text(x[1], x[0], x[2],fontsize=5),axis=1))

# Portfolios

In [10]:
# def Association(doc, vocab, idf):
#     model = Word2Vec(sentences=[doc.split()], min_count=1, workers=3, iter=100)
#     vectors = pd.DataFrame(data=model.wv.vectors, index=model.wv.vocab.keys(), dtype='f8')
#     words = vocab.merge(vectors, how='left', left_on='vocab', right_index=True).drop('vocab', axis=1)
    
#     companies = pd.DataFrame(idf).apply(lambda x: pd.Series(np.multiply(x.values.reshape(-1,1),words).sum(0)), axis=1)
#     distances = pairwise_distances(companies, metric='cosine', n_jobs=-1)
    
#     return distances
def Association(doc, vocab, P, idf):
    """Association score of each portfolio under a Word2Vec company embedding.

    A Word2Vec model is trained on ``doc`` (treated as one long sentence);
    each company is represented by the sum of the word vectors weighted by
    its row of ``idf``; the score of a portfolio ``x`` (one column of ``P``)
    is ``sum_ij x_i * x_j * d_ij`` with ``d`` the pairwise cosine distance
    between companies.

    NOTE: this definition replaces the three-argument ``Association`` from
    the Word2Vec section once this cell has been run.

    Parameters
    ----------
    doc : str
        Whitespace-separated corpus text.
    vocab : pandas.DataFrame
        Must have a ``'vocab'`` column with one word per row. Row ``k`` has
        to correspond to column ``k`` of ``idf``.
    P : 2-D array-like, shape (n_companies, n_portfolios)
        Portfolio weights, one portfolio per column.
    idf : 2-D array-like, shape (n_companies, n_words)
        Per-company word weights.

    Returns
    -------
    pandas.Series
        One score per portfolio, indexed by the column labels of ``P``.
    """
    # `iter=` and `wv.vocab` are the gensim parameter/attribute names this
    # notebook was written against.
    model = Word2Vec(sentences=[doc.split()], min_count=1, workers=3, iter=100)
    vectors = pd.DataFrame(data=model.wv.vectors, index=model.wv.vocab.keys(), dtype='f8')

    # Left-merge keeps one row per `vocab` word, in `vocab` order; words the
    # Word2Vec model never saw come back as all-NaN rows.
    words = vocab.merge(vectors, how='left', left_on='vocab', right_index=True).drop('vocab', axis=1)

    # Weighted sum of word vectors for every company as one matrix product;
    # fillna(0) reproduces the NaN-skipping sum of the former row-wise apply.
    companies = np.asarray(idf) @ words.fillna(0).values
    distances = pairwise_distances(companies, metric='cosine', n_jobs=1)

    # x^T D x for every portfolio column at once: (D @ W) * W summed over
    # companies equals sum_ij W_is * D_ij * W_js for each portfolio s.
    weights = pd.DataFrame(P)
    W = weights.values
    scores = ((distances @ W) * W).sum(axis=0)

    return pd.Series(scores, index=weights.columns)

In [11]:
# One-element Series holding the whole corpus as a single string. The
# descriptions are joined with a space: summing the column concatenates the
# strings back to back, which can fuse the last word of one description
# with the first word of the next.
docs = pd.Series(' '.join(description['DESCRIPTION']))

# TF-IDF weight of every vocabulary word for every company (dense matrix,
# one row per company).
Word2Vec_TFIDF = TfidfVectorizer()
word2vec_tfidf = Word2Vec_TFIDF.fit_transform(description['DESCRIPTION']).todense()

# Vocabulary in TF-IDF column order, used to line word vectors up with the
# TF-IDF columns.
vocab = pd.DataFrame(Word2Vec_TFIDF.get_feature_names(), columns=['vocab'])

In [12]:
#docs = pd.concat([docs, docs])

In [18]:
# Score every random portfolio once per element of `docs`; each call returns
# a Series, so `ass` has one row per element of `docs` and one column per
# portfolio.
# NOTE(review): `P` is only created further down (the RandomPortolio cells),
# so on a fresh kernel this cell fails under Run All. Those cells need to be
# run (or moved) before this one.
ass = docs.apply(lambda doc: Association(doc, vocab, P, word2vec_tfidf))

# Disabled: leftover from the Word2Vec section.
#word2vec_d = ass[0]

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 91 : training on 6168 raw words (4593 effective words) took 0.0s, 455380 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 92 : training on 6168 raw words (4640 effective words) took 0.0s, 470371 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models

In [19]:
def RandomPortolio(stocks=174, size=10, samples=5000, p=None):
    """Draw random equal-weight portfolios as a 0/1 membership matrix.

    Parameters
    ----------
    stocks : int
        Number of stocks in the universe (rows of the result).
    size : int
        Number of distinct stocks held in each portfolio.
    samples : int
        Number of portfolios to draw (columns of the result).
    p : 1-D array-like of length ``stocks``, optional
        Selection probability of each stock; ``None`` draws uniformly.

    Returns
    -------
    numpy.ndarray, shape (stocks, samples), dtype float64
        Entry ``[i, j]`` is 1 when stock ``i`` is in portfolio ``j``, else 0.
    """
    E = np.zeros((stocks, samples), dtype='f8')

    for j in range(samples):
        # Forward the caller's `p`; it was previously overridden with None,
        # so every draw was uniform regardless of the argument.
        members = np.random.choice(stocks, size, replace=False, p=p)
        E[members, j] = 1

    return E

In [20]:
# 5000 random 15-stock portfolios. The number of rows must equal the number
# of companies, because Association multiplies P against the
# company-by-company distance matrix built from `description`; deriving it
# from the data replaces the hard-coded 239.
P = RandomPortolio(stocks=len(description), size=15, samples=5000, p=None)

In [21]:
# Label every row of `ass` 'Association'. The label list is sized from `ass`
# itself: a fixed two-element list raises a length-mismatch error whenever
# `docs` does not hold exactly two elements.
ass.index = ['Association'] * len(ass)

In [22]:
%%opts Histogram [width=800 height=600 tools=['hover']]
%%output filename="./media/Benchmark Association Distibution" fig="png"
# Histogram of the association scores in the first row of `ass`
# (one score per random portfolio).
ass.iloc[0].hvplot.hist()

# Graph Example

In [32]:
# Pick 5 distinct companies (row positions of `word2vec_tsne`) to act as an
# example portfolio.
C = np.random.choice(list(range(word2vec_tsne.shape[0])), size=5, replace=False)

# Every ordered pair of the chosen companies, one edge per row
# (source, target). The tile/repeat factor is tied to the number of chosen
# companies; the previous fixed factor of 15 produced each pair three times.
edges_df = np.array([np.tile(C, C.size).tolist(), np.repeat(C, C.size, axis=0).tolist()]).T

# Node table in (x, y, index) column order: layout coordinates first, then
# the row index that the edge list refers to.
nodes = word2vec_tsne.loc[:,['x','y']].reset_index()

nodes = nodes.iloc[:,[1,2,0]]

In [65]:
# Wrap the node table as HoloViews Nodes and build the graph that links the
# sampled companies, then apply the display options.
nodes = hv.Nodes(nodes)
graph = hv.Graph((edges_df, nodes))
graph = graph.options(node_size=5, width=1000, height=800)

In [35]:
%%output filename="./media/Association Computation Diagram" fig="png"
hv.Scatter(word2vec_tsne, vdims=['y', 'companies'],kdims=['x'], label='Doc2Vec TSNE Portfolio') * \
graph