In [4]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import json

from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import silhouette_score
import gensim.models

# Data

In [7]:
# Load the preprocessed ticker table; the first CSV column is used as the row
# index. Per the preview below, rows are tickers and columns are trading dates
# followed by a trailing 'sector' column.
# NOTE(review): values look like daily percentage returns — confirm against the
# preprocessing step that produced this file.
df = pd.read_csv('../data/ticker_data_preprocessed.csv', index_col=0)
print(df.shape)
df.head()

(482, 1196)


Unnamed: 0,2018-01-03,2018-01-04,2018-01-05,2018-01-08,2018-01-09,2018-01-10,2018-01-11,2018-01-12,2018-01-16,2018-01-17,...,2022-09-20,2022-09-21,2022-09-22,2022-09-23,2022-09-26,2022-09-27,2022-09-28,2022-09-29,2022-09-30,sector
A,0.025444,-0.007501,0.015988,0.002146,0.024554,-0.013655,0.000141,0.013136,-0.006971,0.011652,...,-0.019737,-0.012955,-0.016524,-0.007316,-0.009475,-0.005723,0.017351,-0.007921,-0.009695,Healthcare
AAL,-0.012266,0.006305,-0.00038,-0.009877,-0.000959,0.032642,0.049089,0.036335,-0.00838,0.003105,...,-0.016889,-0.052971,-0.039305,-0.039339,-0.028665,0.03457,0.03912,-0.039216,-0.017143,Industrials
AAP,0.009049,0.036899,0.010631,-0.007042,-0.00808,0.000905,0.02134,0.026472,-0.017595,0.01273,...,-0.013735,-0.002231,-0.008399,-0.021997,-0.01757,0.010337,0.025171,-0.02241,-0.020794,Consumer Cyclical
AAPL,-0.000174,0.004645,0.011385,-0.003714,-0.000115,-0.000229,0.00568,0.010326,-0.005082,0.016516,...,0.015665,-0.020268,-0.006375,-0.015124,0.00226,0.006566,-0.012652,-0.049119,-0.030039,Technology
ABBV,0.015649,-0.005703,0.017408,-0.016022,0.007538,-0.005487,-0.004213,0.010779,0.021427,0.018246,...,-0.006239,-0.010298,0.019243,0.00035,-0.012932,0.003612,0.020322,-0.013001,-0.059627,Healthcare


In [8]:
# Reorient the table so that each row is a trading date and each column is a
# ticker; the categorical 'sector' column is removed before transposing.
df_pct = df.drop(columns=['sector']).transpose()

# After the transpose the row labels are the date strings taken from the CSV
# header; parse them into proper timestamps so the frame can be sliced by date.
df_pct.index = pd.to_datetime(df_pct.index)

df_pct.head()

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADI,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
2018-01-03,0.025444,-0.012266,0.009049,-0.000174,0.015649,0.003722,0.0173,0.002211,0.004615,0.012406,...,-0.010834,-0.006693,0.01964,-0.003426,0.012193,-0.000858,0.006932,0.019863,-0.001183,0.004598
2018-01-04,-0.007501,0.006305,0.036899,0.004645,-0.005703,-0.002225,0.017516,-0.001697,0.011841,-0.001094,...,0.005415,-0.007791,0.001384,-0.000149,0.006676,0.01018,-0.001441,0.01976,0.004147,0.005964
2018-01-05,0.015988,-0.00038,0.010631,0.011385,0.017408,0.012104,0.015408,0.00289,0.008249,0.004053,...,0.006671,-0.007003,-0.000806,0.014051,-0.001874,0.005828,0.009941,0.015576,0.000393,0.011444
2018-01-08,0.002146,-0.009877,-0.007042,-0.003714,-0.016022,0.016576,0.027086,-0.002882,0.007991,0.001745,...,-0.013314,0.00748,0.004496,0.006781,0.003611,0.00169,0.001905,0.009951,-0.004914,0.011996
2018-01-09,0.024554,-0.000959,-0.00808,-0.000115,0.007538,0.006398,0.009432,0.0017,0.003335,-0.002069,...,0.006778,-0.011667,-0.004246,-0.041728,0.000288,-0.002651,-0.016083,0.030643,0.023509,0.011719


In [61]:
# Keep only the rows dated strictly before 2021-01-01.
# NOTE(review): df_pct_train is not referenced by any later cell shown in this
# notebook (clustering and Word2Vec run on the full df_pct) — confirm whether
# the split was meant to be used for fitting.
is_before_2021 = df_pct.index < '2021-01-01'
df_pct_train = df_pct.loc[is_before_2021]

In [62]:
# Flatten the (dates x tickers) matrix row by row into a single feature column,
# the 2-D (n_samples, 1) layout that KMeans is fitted on below.
X = df_pct.to_numpy().reshape(-1, 1)

In [63]:
# One row per (date, ticker) observation and a single feature column.
X.shape

(575990, 1)

# Tokenization

In [64]:
# Quantize every individual return value into one of 200 clusters; the cluster
# id of a value is used as its discrete "token" in the cells below.
# random_state pins the centroid initialisation: without it the token ids (and
# everything derived from them) change on every Restart & Run All. The value 8
# mirrors the seed passed to Word2Vec later in this notebook.
model = KMeans(n_clusters=200, random_state=8)
# fit_predict fits the model and returns the label of every row of X in one
# call, instead of fitting and then running a separate prediction pass.
labels = model.fit_predict(X)

In [65]:
# Sanity check: there should be exactly one cluster label per row of X.
X.shape, labels.shape

((575990, 1), (575990,))

In [66]:
# Lookup table from each raw value in X to the cluster label assigned to it.
# Values that occur more than once collapse to a single key (last one wins).
mapping = dict(zip(X.ravel(), labels))

In [67]:
# Tokenized copy of df_pct: same dates x tickers layout, with every return
# replaced by its KMeans cluster id.
# `labels` was predicted on X, which is df_pct's values flattened row by row,
# so reshaping it back to df_pct's shape puts each label in the cell of the
# value it was computed from. This avoids building and querying a float-keyed
# dict with one lookup per cell.
df_pct_tok = pd.DataFrame(
    labels.reshape(df_pct.shape),
    index=df_pct.index,
    columns=df_pct.columns,
)

In [68]:
# Preview the tokenized frame: same layout as df_pct, cells now hold cluster ids.
df_pct_tok.head()

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADI,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
2018-01-03,105,12,0,143,182,121,177,87,53,36,...,122,85,188,27,36,9,125,188,9,53
2018-01-04,163,30,152,53,38,98,177,160,149,9,...,147,3,139,143,30,134,160,188,53,86
2018-01-05,182,143,74,149,177,36,13,18,62,53,...,30,163,9,80,98,86,134,182,81,149
2018-01-08,87,40,163,137,118,82,6,186,62,139,...,132,125,53,30,121,139,139,134,173,36
2018-01-09,195,9,3,143,125,30,0,139,121,98,...,30,66,63,68,81,186,118,106,43,149


# Word2Vec

In [69]:
# Build the Word2Vec corpus: every row of the tokenized frame becomes one
# "sentence", a list of the row's token ids rendered as strings.
# NOTE(review): a row is one trading date across all tickers, so the Word2Vec
# context window spans neighbouring tickers rather than consecutive days —
# confirm this is the intended sentence orientation.
sentences = [row.tolist() for row in df_pct_tok.to_numpy().astype(str)]

In [70]:
# Train skip-gram (sg=1) Word2Vec vectors of dimension 100 over the token
# sentences; min_count=1 keeps every token, however rare, in the vocabulary.
# workers=1: gensim documents that a fixed `seed` only yields a reproducible
# model when training runs on a single worker thread (multi-threaded training
# introduces ordering jitter); for full determinism PYTHONHASHSEED must also be
# fixed in the environment.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4 renamed it to
# `vector_size` — adjust if the environment is upgraded.
# Note: this rebinds `model`, which previously held the fitted KMeans instance.
model = gensim.models.Word2Vec(
    sentences=sentences,
    min_count=1,
    window=3,
    size=100,
    workers=1,
    sg=1,
    seed=8,
)

## Embeddings

In [71]:
# Represent every ticker by the mean Word2Vec vector of all tokens in its
# column, giving one row per ticker and one column per embedding dimension.
# The rows are collected in a list and the DataFrame is built once, instead of
# growing it with pd.concat inside the loop (which re-copies the accumulated
# frame on every iteration). The mean is accumulated in float64, matching the
# precision of the original running sum, and the vector length is taken from
# the looked-up vectors rather than a hard-coded 100.
embeddings = pd.DataFrame(
    [
        np.mean(
            [model.wv.get_vector(str(tok)) for tok in df_pct_tok[col]],
            axis=0,
            dtype=np.float64,
        )
        for col in df_pct_tok
    ],
    index=df_pct_tok.columns,
)

In [72]:
# Label the embedding rows with the ticker symbols (df_pct's column labels).
embeddings.index = df_pct.columns

In [73]:
# Preview: one row per ticker, one column per embedding dimension.
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
A,0.066425,-0.119089,-0.10051,-0.075699,-0.090417,-0.175559,0.015433,-0.119962,0.32121,-0.113819,...,-0.105995,-0.012783,0.070658,-0.032939,0.158279,0.066928,0.003969,-0.01304,-0.125138,0.064119
AAL,0.057628,-0.117532,-0.099341,-0.072114,-0.054971,-0.147655,0.016055,-0.146537,0.285763,-0.112398,...,-0.077181,-0.00855,0.026518,-0.035812,0.151659,0.071709,-0.016661,-0.022985,-0.085417,0.055275
AAP,0.064455,-0.11799,-0.10486,-0.07429,-0.088588,-0.175452,0.015684,-0.125347,0.318566,-0.11245,...,-0.099855,-0.01449,0.062954,-0.03329,0.158782,0.06889,-0.002175,-0.016107,-0.118366,0.063268
AAPL,0.067201,-0.113636,-0.10342,-0.072286,-0.089059,-0.171586,0.01421,-0.123297,0.318681,-0.114815,...,-0.101856,-0.013111,0.064164,-0.031468,0.15551,0.067969,-0.000302,-0.013463,-0.118198,0.061905
ABBV,0.064673,-0.124164,-0.10024,-0.080954,-0.09214,-0.180342,0.016927,-0.120703,0.322226,-0.110743,...,-0.105789,-0.013534,0.073446,-0.034576,0.163239,0.066189,0.007752,-0.011236,-0.12838,0.065403


In [74]:
# Persist the embedding matrix without the row index.
# NOTE(review): the ticker labels set on the index are therefore not written
# to the file; readers must rely on row order to recover them — confirm this
# is what downstream consumers expect.
embeddings.to_csv(
    "../results/signal2vec_embds.csv",
    index=False,
)