# Vanilla Signal

In this notebook, we will be constructing an initial signal without controlling for sector or any other clustering that may occur with these companies.

In [3]:
import numpy as np
import pandas as pd
import pickle
import os

# Used to find business days
import datetime
from pandas.tseries.offsets import BDay

In [4]:
os.listdir('../1-data')

['stopwords',
 '.DS_Store',
 'returns.h5',
 '1-Returns_Data_Exploration.ipynb',
 'Text_Tokenizer.ipynb',
 '3-Text_Embedding.ipynb',
 '.ipynb_checkpoints',
 'sample_tokenized',
 'alltokens.pickle',
 'company_filings.pickle',
 '2-Text_Tokenizer.ipynb']

In [5]:
with open('../1-data/company_filings.pickle', 'rb') as handle:
    company_filings = pickle.load(handle)

In [6]:
company_filings[list(company_filings.keys())[0]]

Unnamed: 0,year,file,Cosine_Similarity
500,2006,000782.SZ.txt,
862,2007,000782.SZ.txt,
1239,2008,000782.SZ.txt,
1363,2009,000782.SZ.txt,
1637,2010,000782.SZ.txt,0.996726
1817,2011,000782.SZ.txt,0.989403
1916,2012,000782.SZ.txt,0.96444
1378,2013,000782.SZ.txt,0.991539
2048,2014,000782.SZ.txt,0.976864
2200,2015,000782.SZ.txt,0.993328


There seems to be reports that contain cid instead of a chinese character. This is something I will need to fix. 

## Finding Next Business Day After Each Filing

For starters, we will be taking a naive approach of assuming each document is filed on the business day after April 30. Businesses are asked to report any time between Feb 1 and April 30 each year. 

In [22]:
cosine_similarity = pd.DataFrame()

for key in list(company_filings.keys()):
    
    # Select company dictionary value
    sample = company_filings[key]
    
    # Create a DataFrame containing only the cosine scores, index is year
    sample_df = sample.loc[:,['year', 'Cosine_Similarity']].set_index('year')
    
    # Rename column to company ticker
    sample_df.columns = [key[:-4]]
    
    cosine_similarity = pd.concat([cosine_similarity,sample_df], axis=1)

In [23]:
cosine_similarity

Unnamed: 0_level_0,000782.SZ,002723.SZ,002025.SZ,002227.SZ,603778.SH,002813.SZ,600385.SH,002685.SZ,600560.SH,002352.SZ,...,600408.SH,300210.SZ,300530.SZ,002799.SZ,300339.SZ,300280.SZ,000637.SZ,000876.SZ,002213.SZ,002521.SZ
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006,,,,,,,,,,,...,,,,,,,,,,
2007,,,,,,,,,,,...,,,,,,,,,,
2008,,,,,,,,,,,...,,,,,,,0.980038,,0.968123,
2009,,,,0.991257,,,0.992253,,0.993208,,...,0.97446,,,,,,0.986314,0.991674,0.980689,
2010,0.996726,,0.994487,0.992241,,,0.985127,,0.984205,0.980128,...,0.984934,,,,,,0.995721,0.995408,0.993079,
2011,0.989403,,0.973755,0.989706,,,0.976313,,0.990302,0.988313,...,0.986567,,,,,,0.986827,0.95692,0.995573,0.980849
2012,0.96444,,0.970347,0.969242,,,0.99027,,0.986087,0.960063,...,0.985301,0.959825,,,,0.957184,0.968652,0.963851,0.9423,0.943026
2013,0.991539,,0.986257,0.98911,,,,0.976804,,0.984484,...,,0.994983,,,0.990634,0.997375,0.995117,0.991162,0.993062,0.977609
2014,0.976864,0.979147,0.975534,0.990196,,,0.97071,0.985088,0.948658,0.975158,...,0.956528,0.984976,,,0.978396,0.987504,0.988837,0.959072,0.985283,0.985175
2015,0.993328,0.985523,0.996832,0.996267,,,0.990306,0.993712,0.995052,0.993763,...,0.984848,0.993953,,,0.993648,0.994549,0.994431,0.992396,0.994848,0.990761


In [25]:
# Rearrange to numerical order
cosine_similarity = cosine_similarity.loc[:, sorted(cosine_similarity.columns)]
cosine_similarity

Unnamed: 0_level_0,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,...,603986.SH,603987.SH,603988.SH,603989.SH,603990.SH,603993.SH,603996.SH,603997.SH,603998.SH,603999.SH
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006,,,,,,,,,,,...,,,,,,,,,,
2007,,0.979097,0.963648,,0.965768,,,0.96509,,,...,,,,,,,,,,
2008,0.993695,0.989635,,,0.834884,,,0.981457,,,...,,,,,,,,,,
2009,0.995412,0.978715,,0.98321,0.985631,,0.982124,0.980832,0.978361,,...,,,,,,,,,,
2010,0.990485,0.989937,0.975736,0.990197,0.991685,0.99355,0.974104,0.980742,0.991779,,...,,,,,,,,,,
2011,0.983076,0.990228,0.964521,0.989575,0.996554,0.992779,0.977176,0.99514,0.989862,0.984326,...,,,,,,,,,,
2012,0.991812,0.996491,0.979537,0.982205,0.977356,0.969526,0.948592,0.972817,0.956411,0.963898,...,,,,,,,,,,
2013,0.971637,0.983837,0.988565,0.995151,0.993817,0.979255,0.991283,0.990976,0.980396,0.984529,...,,,,,,0.992681,,,,
2014,0.993993,0.9832,0.986775,,0.989643,0.987519,0.983337,0.968777,0.98285,0.981958,...,,,,,,0.993243,,,,
2015,0.995194,1.0,0.99261,0.987176,0.994848,0.983091,0.92679,0.983294,0.982755,0.997901,...,,,0.964072,,,0.9959,,0.969318,0.950069,


In [31]:
# Check that we can accurately print the next business day after April 30
datetime.datetime(2006,4,1) + BDay(1)

Timestamp('2006-04-03 00:00:00')

In [32]:
# Do this for each year
dates = [datetime.datetime(year,4,1) + BDay(1) for year in cosine_similarity.index]

In [34]:
cosine_similarity.index = dates

In [35]:
cosine_similarity

Unnamed: 0,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,...,603986.SH,603987.SH,603988.SH,603989.SH,603990.SH,603993.SH,603996.SH,603997.SH,603998.SH,603999.SH
2006-04-03,,,,,,,,,,,...,,,,,,,,,,
2007-04-02,,0.979097,0.963648,,0.965768,,,0.96509,,,...,,,,,,,,,,
2008-04-02,0.993695,0.989635,,,0.834884,,,0.981457,,,...,,,,,,,,,,
2009-04-02,0.995412,0.978715,,0.98321,0.985631,,0.982124,0.980832,0.978361,,...,,,,,,,,,,
2010-04-02,0.990485,0.989937,0.975736,0.990197,0.991685,0.99355,0.974104,0.980742,0.991779,,...,,,,,,,,,,
2011-04-04,0.983076,0.990228,0.964521,0.989575,0.996554,0.992779,0.977176,0.99514,0.989862,0.984326,...,,,,,,,,,,
2012-04-02,0.991812,0.996491,0.979537,0.982205,0.977356,0.969526,0.948592,0.972817,0.956411,0.963898,...,,,,,,,,,,
2013-04-02,0.971637,0.983837,0.988565,0.995151,0.993817,0.979255,0.991283,0.990976,0.980396,0.984529,...,,,,,,0.992681,,,,
2014-04-02,0.993993,0.9832,0.986775,,0.989643,0.987519,0.983337,0.968777,0.98285,0.981958,...,,,,,,0.993243,,,,
2015-04-02,0.995194,1.0,0.99261,0.987176,0.994848,0.983091,0.92679,0.983294,0.982755,0.997901,...,,,0.964072,,,0.9959,,0.969318,0.950069,


# Compare to Ranked Returns

In [46]:
os.listdir('../1-data')

['stopwords',
 '.DS_Store',
 'returns.h5',
 '1-Returns_Data_Exploration.ipynb',
 'Text_Tokenizer.ipynb',
 '3-Text_Embedding.ipynb',
 '.ipynb_checkpoints',
 'sample_tokenized',
 'alltokens.pickle',
 'company_filings.pickle',
 '2-Text_Tokenizer.ipynb']

In [76]:
rets = pd.read_hdf('../1-data/returns.h5')
rets = rets.loc[sorted(rets.index), sorted(rets.columns)]
rets.tail()

Unnamed: 0_level_0,000001,000002,000005,000006,000008,000009,000012,000016,000021,000024,...,603868,603877,603882,603883,603885,603888,603899,603939,603986,603993
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-21,0.035412,-0.021068,,-0.004301,-0.021807,-0.026275,-0.026549,,-0.036804,,...,-0.020027,0.021362,-0.040831,-0.013797,-0.015658,-0.005175,0.002927,-0.028704,-0.037739,-0.01983
2020-04-22,-0.016357,0.004228,,0.099352,0.012739,-0.003175,0.015909,,-0.001006,,...,0.013808,0.026144,0.035399,0.010218,0.025451,0.006402,0.030551,0.015335,-0.004008,0.00578
2020-04-23,0.0,-0.004975,,0.001965,-0.00629,0.001592,-0.011186,,-0.032712,,...,0.03514,-0.000637,0.081939,0.00554,-0.011375,0.035786,0.002266,-0.009802,-0.016759,-0.002873
2020-04-24,0.000756,-0.012308,,-0.001961,0.018987,-0.019078,-0.015837,,0.007284,,...,-0.08,-0.033142,-0.026667,0.000599,-0.024059,0.010748,0.001884,-0.014142,-0.022288,-0.011527
2020-04-27,0.019637,0.007789,,-0.007859,0.018634,0.040519,0.013793,,-0.028926,,...,-0.023456,-0.032301,0.014384,0.007779,-0.012862,-0.040258,0.01542,0.006762,-0.000366,-0.002916


Let's compare the dimensions of the two DataFrames

In [48]:
np.shape(rets)

(3218, 1649)

In [63]:
np.shape(cosine_similarity)

(11, 3154)

There seems to be some data missing... let's see if the missing columns belong to one exchange

In [69]:
SZ = [x[:-3] for x in cosine_similarity.columns if x[-2:] == 'SZ']
sum([x in SZ for x in rets.columns])

758

In [70]:
SH = [x[:-3] for x in cosine_similarity.columns if x[-2:] == 'SH']
sum([x in SH for x in rets.columns])

830

In [72]:
758 + 830

1588

Even more surprising, there appears to be stocks that belong to neither exchange!

In [77]:
[x for x in SZ if x in SH]

[]