In [1]:
# The libriaries for this experiment were imported into the python enviroment
import os
from functools import reduce

import numpy as np
import scipy as sp
import pandas as pd

from dask.distributed import Client, LocalCluster
import dask.delayed
import dask.dataframe as dd

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

from gensim.models.word2vec import Word2Vec

# Cluster

In [2]:
cluster = LocalCluster(n_workers=1, threads_per_worker=7)

In [3]:
client = Client(cluster)

client

0,1
Client  Scheduler: tcp://127.0.0.1:35367  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 7  Memory: 14.67 GB


# Parameters

In [4]:
days_news = 30

# Import

Text

In [5]:
text_path = os.path.join('..','scraper','News','*.csv')
text = dd.read_csv(text_path).dropna()

text.date = text.date.astype('M8[D]')

text.text = text.text.str.lower().str.replace('[^a-z\\.]',' ').str.replace(f"({' | '.join(ENGLISH_STOP_WORDS)})",' ').str.split('.').apply(lambda x: [i.split() for i in x], meta='O')

text = text.groupby('date').sum()

In [6]:
T = text.compute()

In [7]:
U = pd.DataFrame(index=pd.date_range(start='2003-05-16', end='2018-05-17')).join(T)

U = U.reset_index()

U['value'] = range(U.shape[0])

U.value = U.value.apply(lambda x: reduce(sum, [ [] if type(i) != list else i for i in [U.text.loc[x-days_news:x].tolist()]]))

In [8]:
U = U.set_index('index')

Price

In [9]:
price_path = os.path.join('..','data','JSE_RI_2003_2008.csv')
price = pd.read_csv(price_path,  sep=';', decimal=b',', 
                    skiprows=1, dtype={'R:ISAJ(RI)': 'float64',
                                       'R:PPEJ(RI)': 'float64',
                                       'R:ZCIJ(RI)': 'float64'})

In [10]:
price = price.drop(labels=price.columns[price.columns.str.startswith('Unnamed:')], axis=1)
price.Code = price.Code.astype('M8[D]')

In [11]:
price = price.set_index('Code')

Descriptions

Joins

In [12]:
U = price.loc[:,['Code']].join(U, how='left')

In [13]:
U.head()

Unnamed: 0_level_0,R:NPNJ(RI),R:FSRJ(RI),R:SBKJ(RI),R:SOLJ(RI),R:MTNJ(RI),R:SLMJ(RI),R:NEDJ(RI),R:BGAJ(RI),R:SHPJ(RI),R:AMSJ(RI),...,R:OMLJ(RI),R:AONJ(RI),R:CULPJ(RI),R:EMNJ(RI),R:RTNJ(RI),R:CFRJ(RI),source,company,text,value
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-05-16,61.44,4971.77,1341.2,7399.11,413.55,120.46,2771.9,489.08,158.81,31072.29,...,98.22,70.45,99.4,188.57,273.17,741.69,,,,[nan]
2003-05-19,61.55,4909.54,1352.47,7653.34,407.51,118.5,2613.72,492.93,153.29,31060.84,...,95.67,70.45,99.4,188.57,273.17,717.27,,,,"[nan, nan, nan, nan]"
2003-05-20,60.48,4895.71,1343.45,7397.4,410.53,119.48,2606.19,492.93,153.29,30373.15,...,95.23,70.45,99.4,188.57,273.17,705.06,,,,"[nan, nan, nan, nan, nan]"
2003-05-21,61.72,5034.0,1375.01,7312.38,412.34,121.44,2621.26,508.34,153.29,29777.13,...,92.94,70.45,99.4,188.57,273.17,731.92,,,,"[nan, nan, nan, nan, nan, nan]"
2003-05-22,64.32,5020.17,1384.03,7405.91,416.56,121.44,2621.26,507.56,150.25,29513.53,...,92.59,70.45,99.4,188.57,273.17,744.74,,,,"[nan, nan, nan, nan, nan, nan, nan]"


# Portfoliios

In [None]:
def RandomPortolio(stocks=174, size=10, samples=5000, p=None):
    E = np.zeros((stocks,samples), dtype='f8')
    
    for j in range(samples):
        E[ np.random.choice(stocks, size, replace=False, p=None) , j] = 1
    
    return E

In [None]:
P = RandomPortolio()

# Volatility

# Beta

# Association

In [None]:
def Association(doc, vocab, P, idf):
    model = Word2Vec(sentences=[doc.split()], min_count=1, workers=1, iter=100)
    vectors = pd.DataFrame(data=model.wv.vectors, index=model.wv.vocab.keys(), dtype='f8')
    words = vocab.merge(vectors, how='left', left_on='vocab', right_index=True).drop('vocab', axis=1)
    
    companies = pd.DataFrame(idf).apply(lambda x: pd.Series(np.multiply(x.values.reshape(-1,1),words).sum(0)), axis=1)
    distances = pairwise_distances(companies, metric='cosine', n_jobs=1)
    portfolios = pd.DataFrame(P).apply(lambda x: (x.values.reshape(-1,1) * x.values.reshape(-1,1).T * distances).sum(), axis=0)
    
    return pd.Series(portfolios)

In [None]:
docs = pd.Series(description['DESCRIPTION'].sum())

Word2Vec_TFIDF = TfidfVectorizer()
word2vec_tfidf = Word2Vec_TFIDF.fit_transform(description['DESCRIPTION']).todense()

vocab = pd.DataFrame(Word2Vec_TFIDF.get_feature_names(), columns=['vocab'])

In [None]:
ass = docs.apply(lambda doc: Association(doc, vocab, P, word2vec_tfidf))