In [4]:
# The libraries for this experiment are imported into the Python environment
import os

import numpy as np
import scipy as sp
import pandas as pd

from dask.distributed import Client, LocalCluster
import dask.delayed
import dask.dataframe as dd

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

from gensim.models.word2vec import Word2Vec

# Cluster

In [None]:
# Start a local Dask cluster: a single worker process running eight threads.
cluster = LocalCluster(
    threads_per_worker=8,
    n_workers=1,
)

In [None]:
# Connect a distributed client to the local cluster created in the previous cell.
client = Client(cluster)

# Bare trailing expression so the notebook shows the client's repr as the cell output.
client

# Import

In [None]:
# Lazily load every scraped news CSV into one Dask dataframe and drop rows
# that contain missing values.
text_path = os.path.join('..','scraper','News','*.csv')
text = dd.read_csv(text_path).dropna()

text.date = text.date.astype('M8[D]')

# Whole-word stop-word pattern.
# - `\b` anchors replace the literal spaces that the old ' | '.join(...) pattern
#   put around each word: that form left the first alternative without a
#   leading space and the last without a trailing one (so they matched inside
#   other words), and it missed stop words that were adjacent to each other or
#   at the very start/end of a string.
# - The set is sorted so the compiled pattern is identical on every run.
stop_word_pattern = r'\b(?:' + '|'.join(sorted(ENGLISH_STOP_WORDS)) + r')\b'

# Normalise the article body: lowercase, turn every non-letter into a space,
# blank out stop words, then pad with one space on each side.
# `regex=True` is passed explicitly because the default of `str.replace`
# differs between pandas versions.
text.text = (
    text.text.str.lower()
    .str.replace('[^a-z]', ' ', regex=True)
    .str.replace(stop_word_pattern, ' ', regex=True)
    .apply(lambda x: f' {x} ', meta='U')
    .astype('U')
)

# Portfolios

In [None]:
def RandomPortolio(stocks=174, size=10, samples=5000, p=None):
    """Build a 0/1 membership matrix of randomly drawn portfolios.

    Parameters
    ----------
    stocks : int
        Number of stocks to draw from; the number of rows of the result.
    size : int
        Number of distinct stocks held by each portfolio.
    samples : int
        Number of portfolios to draw; the number of columns of the result.
    p : array-like of float, optional
        Selection probability for each stock, forwarded to
        ``np.random.choice``. ``None`` draws uniformly.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(stocks, samples)``. Column ``j`` has a 1 in
        each of the ``size`` rows picked for portfolio ``j`` and 0 elsewhere.
    """
    E = np.zeros((stocks, samples), dtype='f8')

    for j in range(samples):
        # Forward the caller's `p`; it used to be hard-coded to None here,
        # so custom selection probabilities were silently ignored.
        members = np.random.choice(stocks, size, replace=False, p=p)
        E[members, j] = 1

    return E

In [None]:
# Draw the portfolio membership matrix using RandomPortolio's default arguments.
# NOTE(review): no random seed is set in the visible cells, so P differs between runs — confirm whether that is intended.
P = RandomPortolio()

# Volatility

# Beta

# Association

In [None]:
def Association(doc, vocab, P, idf):
    """Score each portfolio by the summed cosine distance between its members.

    Steps:
      1. Train a Word2Vec model on ``doc``, split on whitespace and fed in as
         a single sentence.
      2. Left-join the learned word vectors onto ``vocab`` (via its 'vocab'
         column) so the vectors follow the row order of ``vocab``.
      3. For every row of ``idf``, weight each word vector by that row's
         entry and sum over words, giving one vector per row.
      4. Compute pairwise cosine distances between those vectors.
      5. For every column of ``P``, sum the distances weighted by the outer
         product of the column with itself.

    Parameters
    ----------
    doc : str
        Text used to train the word embedding.
    vocab : pandas.DataFrame
        Frame with a 'vocab' column listing the words to look up.
    P : array-like
        Portfolio matrix; one result is produced per column.
        Presumably the 0/1 membership matrix from RandomPortolio — confirm.
    idf : array-like
        Weight matrix; presumably TF-IDF weights with one row per company and
        one column per entry of ``vocab`` — confirm against the caller.

    Returns
    -------
    pandas.Series
        One aggregated distance per column of ``P``.
    """
    # NOTE(review): `iter=` and `wv.vocab` are gensim 3.x names (gensim 4 uses
    # `epochs=` and `wv.key_to_index`) — confirm the pinned gensim version.
    w2v = Word2Vec(sentences=[doc.split()], min_count=1, workers=3, iter=100)
    embedding = pd.DataFrame(data=w2v.wv.vectors, index=w2v.wv.vocab.keys(), dtype='f8')

    aligned = vocab.merge(embedding, how='left', left_on='vocab', right_index=True)
    words = aligned.drop('vocab', axis=1)

    def weighted_word_sum(row):
        # Scale every word vector by this row's weight, then add them up.
        weights = row.values.reshape(-1,1)
        return pd.Series(np.multiply(weights, words).sum(0))

    companies = pd.DataFrame(idf).apply(weighted_word_sum, axis=1)
    distances = pairwise_distances(companies, metric='cosine', n_jobs=1)

    def total_pairwise_distance(column):
        # The outer product of the column with itself weights each pair's distance.
        holdings = column.values.reshape(-1,1)
        return (holdings * holdings.T * distances).sum()

    portfolios = pd.DataFrame(P).apply(total_pairwise_distance, axis=0)

    return pd.Series(portfolios)

In [None]:
# NOTE(review): `description` is not defined in any visible cell — make sure it
# is loaded earlier so the notebook survives Restart & Run All.

# Concatenate all company descriptions into one document, wrapped in a
# single-element Series. The pieces are joined with a space: `Series.sum()`
# glued the strings together with no separator, fusing the last word of each
# description with the first word of the next before whitespace tokenisation.
docs = pd.Series(' '.join(description['DESCRIPTION']))

# Fit TF-IDF on the individual descriptions and keep the dense weight matrix.
Word2Vec_TFIDF = TfidfVectorizer()
word2vec_tfidf = Word2Vec_TFIDF.fit_transform(description['DESCRIPTION']).todense()

# Vocabulary learned by the vectorizer, as a one-column frame.
vocab = pd.DataFrame(Word2Vec_TFIDF.get_feature_names(), columns=['vocab'])

In [None]:
# Run Association on every document in `docs`; vocab, P and the TF-IDF matrix
# are passed through as the remaining positional arguments.
ass = docs.apply(Association, args=(vocab, P, word2vec_tfidf))