In [1115]:
% mkdir filing_texts
% mkdir prices 

mkdir: filing_texts: File exists
mkdir: prices: File exists


In [1116]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
import glob
import os 
import pandas as pd
import datetime
from scipy.stats import iqr
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import math

spm = None
english_words = None
excepted_stock_names = []
limit = -1

In [1117]:
import keras
from keras.layers import Dense, Dropout, RepeatVector, BatchNormalization, Convolution1D, Flatten, Lambda, Permute, MaxPooling1D, AlphaDropout
from keras.models import Sequential
from keras.utils import to_categorical
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras.models import load_model
from tqdm import *

In [1118]:
class DateRange():
    def __init__(self, start_int, end_int):
        self.start_int = start_int
        self.end_int = end_int
        self.range = (range(start_int, end_int))

    def transform(self, date):
        output = {}
        dates = []
        for delta in self.range:
            date_delta = datetime.timedelta(days=delta)
            date_string = datetime.datetime.strptime(date, '%Y-%m-%d').date()
            dates.append(str(date_string + date_delta))
        return dates

In [1119]:
class SNDM():
    def __init__(self, limit=-1):
        self.limit = limit
        self.sndm = self.__get_sndm()

    def __get_sndm(self):
        spm = self.__stock_name_date_mapping()
        output = {}
        for stock_name in spm.keys():
            numerical_prices = 0
            for date in DateRange(-3,2).transform(spm[stock_name]['date']):
                try:
                    isnan = math.isnan(PriceData(stock_name).on_date(date, 'open'))
                except:
                    isnan = True
                if not (isnan):
                    numerical_prices += 1 
            # Delete stock_name from spm if numerical prices under a threshold. (nans)
            if numerical_prices >= 3:
                output[stock_name] = spm[stock_name]
        return output
    
    def __stock_name_date_mapping(self):
        spm = {}
        for i, file in enumerate(glob.glob('filing_texts/*')[0:self.limit]):
            try:
                st_name = self.__stock_name_from_filename(file)
                file_data = open(file, 'r').read()[:200]
                time_data = file_data.split('<ACCEPTANCE-DATETIME>')[1].split('\\n')[0][:8]
                year = time_data[0:4]
                month = time_data[4:6]
                day = time_data[6:8]
                date_stamp = ("%s-%s-%s" %(year, month, day))
                spm[st_name] = {'date': date_stamp}
            except:
                print("No data for ", file)
                continue
        return spm
    def __stock_name_from_filename(self, filename):
        return filename.split('/')[-1].split('_')[0]

In [1120]:
spm = SNDM().sndm

In [1121]:
class PriceData:
    def __init__(self, stock_name):
        self.stock_name = stock_name
        try:
            self.price_data = pd.read_csv('prices/' + stock_name + '.csv')
        except:
            self.price_data = pd.DataFrame.from_dict({})

    def on_date(self, date, market_time = 'open'): 
        try:
            return float(self.price_data.loc[self.price_data['date'] == date][market_time])
        except: 
            return None

In [1122]:
class FilenamesToStockNamesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        output = []
        for filename in X:
            stock_name = self.__stock_name_from_filename(filename)
            output.append(stock_name)
        return output
    def __stock_name_from_filename(self, filename):
        return filename.split('/')[-1].split('_')[0]

In [1123]:
## Used in y pipeline
class SpmToFileNamesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
    def transform(self, X):
        output = []
        for key in X.keys():
            date = X[key]['date']
            output.append(f'filing_texts/{key}_{date}')
        return output
    
class SpmToStockNamesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        return X.keys()
    
class StockNamesToFileNamesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        return [f'filing_texts/{stock_name}' for stock_name in X]
    
# class MapStockNamesToDatesTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, start_int, end_int):
#         self.start_int = start_int
#         self.end_int = end_int
#         self.range = (range(start_int, end_int))
    
#     def fit(self, X, y=None):
#         return self
    
#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X)
        
#     def transform(self, X):
#         output = {}
#         for i, stock_name in enumerate(X):
#             try:
#                 date = spm[stock_name]['date']
#             except:
#                 continue # Ignore stocks which don't have a date
#             dates = []
#             for delta in self.range:
#                 date_delta = datetime.timedelta(days=delta)
#                 date_string = datetime.datetime.strptime(date, '%Y-%m-%d').date()
#                 dates.append(str(date_string + date_delta))
#             output[stock_name] = dates
#         return output
# class StockNameDatesMapToPricesListTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         return None
    
#     def fit(self, X, y=None):
#         return self
    
#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X)
        
#     def transform(self, X):
#         output = []
#         for stock_name in X.keys():
#             prices = []
#             for date in X[stock_name]:
#                 prices.append(PriceData(stock_name).on_date(date, 'open'))
#             output.append(prices)
#         return np.array(output)
    
# class LabelsTransform(BaseEstimator, TransformerMixin):
#     # Returns the interquartile-range and median.
#     def __init__(self):
#         return None
        
#     def fit(self, X, y=None):
#         return self
    
#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X)
        
#     def transform(self, X):
#         # ldcom = last_day_change_over_median
#         print(X.shape)
#         ldcom = []
#         for prices in X:
#             this_median = np.median(prices[0:-3])
#             ldcom.append(((prices[-1]-this_median)/this_median))
#         return np.array(ldcom)
            
class StatisticalMeasuresTransformer(BaseEstimator, TransformerMixin):
    # Returns the interquartile-range and median.
    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        # ldcom = last_day_change_over_median
        
        output = []
        self.iqr_var = []
        self.median = []
        for prices in X:
            this_iqr = iqr(prices[0:-3])
            this_median = np.median(prices[0:-3])
            self.iqr_var.append(this_iqr)
            self.median.append(this_median)
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        X_output = []
        for i, prices in enumerate(X):
            stats   = []
            iqr_var = self.iqr_var[i] or iqr(prices)
            median  = self.median[i]  or np.median(prices)
            
            stats.append(iqr_var)
            stats.append(median)
            
            X_output.append(stats)
        return np.array(X_output)
class SparseToArray(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        return np.array(X.toarray()) #[ar.toarray() for ar in X]
    
class ReadFiles(BaseEstimator, TransformerMixin): 
    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        return (open(filename, 'r').read() for filename in tqdm(X))
            

In [1124]:
def get_filenames(limit = -1):
    filenames = []
    directory_files = glob.glob('filing_texts/*')
    excepted_files = [('filing_texts/' + sn) for sn in excepted_stock_names]
    filenames_with_data = [x for x in directory_files[:limit] if x not in excepted_files]
    for filename in filenames_with_data:
        filenames.append(filename)
    return filenames

In [1125]:
class StockNamesToLabelsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, start_int, end_int):
        self.filing_int = start_int * -1
        self.start_int = start_int
        self.end_int = end_int
        self.range = (range(start_int, end_int))
    # Not implemented since only used to generate labels
    def fit(self, X, y=None):
        return self
    # Not implemented since only used to generate labels
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        h = {}
        for i, stock_name in enumerate(X):
            try:
                date = spm[stock_name]['date']
            except:
                continue # Ignore stocks which don't have a date
            dates = []
            for delta in self.range:
                date_delta = datetime.timedelta(days=delta)
                date_string = datetime.datetime.strptime(date, '%Y-%m-%d').date()
                dates.append(str(date_string + date_delta))
            h[stock_name] = dates
        
        ldcom = []
        for stock_name in h.keys():
            earliest_price_after_filing = None
            hist_p = []
            for i, date in enumerate(h[stock_name]):
                price = PriceData(stock_name).on_date(date, 'open')
                if price and not math.isnan(price):
                    if i > self.filing_int:
                        earliest_price_after_filing = earliest_price_after_filing or price
                    else:
                        hist_p.append(price)
            # Closing price on day of filing
            price_close_filing = PriceData(stock_name).on_date(h[stock_name][self.filing_int], 'close')
            # Use either the next open day of trading or the close price on day of filing
            comparison_price = earliest_price_after_filing or price_close_filing
            # Remove nans from historical prices before taking the mean
            this_mean = np.mean(hist_p)
            ldcom.append(((comparison_price-this_mean)/this_mean))
        return np.array(ldcom)

In [1126]:
StockNamesToLabelsTransformer(-5,5).transform(['XOXO'])

array([ 0.01682823])

In [1127]:
# Used to build other pipelines
from sklearn.preprocessing import Imputer

# prices_pipeline = Pipeline([
#     ('spm_to_filenames', SpmToFileNamesTransformer()),
#     ('filenames_to_stock_names', FilenamesToStockNamesTransformer()),
#     ('stock_names_to_dates', MapStockNamesToDatesTransformer(-5, 2)),
#     ('dates_to_prices_transformer', StockNameDatesMapToPricesListTransformer()),
#     ('imputer', Imputer(axis=1))
# ])

labels_pipeline = Pipeline([
    ('spm_to_filenames', SpmToFileNamesTransformer()),
    ('filenames_to_stock_names', FilenamesToStockNamesTransformer()),
    ('dates_to_prices_transformer', StockNamesToLabelsTransformer(-5,2))
])

# Used as the final y values 
# labels_pipeline = Pipeline([
#     ('prices_pipeline', prices_pipeline),
#     ('labels_transform', LabelsTransform())
# ])

# Used for training and test set features
stock_stats = Pipeline([
    ('prices_pipeline', prices_pipeline),
    ('stats_transform', StatisticalMeasuresTransformer())
])

# Used for training and test set features 
# ('stock_names_to_file_names', StockNamesToFileNamesTransformer()),
text_word_counts = Pipeline([
    ('spm_to_file_names', SpmToFileNamesTransformer()),
    ('read_files', ReadFiles()),
    ('vect', TfidfVectorizer(
                token_pattern=r"[a-zA-Z]+", 
                min_df = 0.10,
                max_df = 0.80,
                stop_words = 'english',
                max_features=9000,
                ngram_range=(1, 1))),
    ('sparse_to_array', SparseToArray()),
    ('std_scaler', StandardScaler()),
])


In [1128]:
X = text_word_counts.fit_transform(spm)
y = labels_pipeline.fit_transform(spm)
print(X.shape)
print(y.shape)






  0%|          | 0/188 [00:00<?, ?it/s][A[A[A[A[A




  1%|          | 2/188 [00:00<00:20,  9.14it/s][A[A[A[A[A




  5%|▌         | 10/188 [00:00<00:12, 14.09it/s][A[A[A[A[A




  7%|▋         | 14/188 [00:00<00:10, 16.44it/s][A[A[A[A[A




  9%|▊         | 16/188 [00:02<00:22,  7.68it/s][A[A[A[A[A




 13%|█▎        | 24/188 [00:02<00:15, 10.83it/s][A[A[A[A[A




 14%|█▍        | 27/188 [00:02<00:14, 11.12it/s][A[A[A[A[A




 18%|█▊        | 34/188 [00:02<00:13, 11.59it/s][A[A[A[A[A




 21%|██        | 39/188 [00:03<00:11, 12.72it/s][A[A[A[A[A




 22%|██▏       | 42/188 [00:03<00:11, 12.80it/s][A[A[A[A[A




 24%|██▍       | 46/188 [00:03<00:10, 13.60it/s][A[A[A[A[A




 30%|██▉       | 56/188 [00:03<00:08, 16.01it/s][A[A[A[A[A




 32%|███▏      | 61/188 [00:03<00:07, 16.82it/s][A[A[A[A[A




 35%|███▌      | 66/188 [00:04<00:07, 16.44it/s][A[A[A[A[A




 37%|███▋      | 70/188 [00:06<00:11, 10.67it/s][A[

(188, 6004)
(188,)


In [1129]:
X_train, X_test, y_train_continuous, y_test_continuous = train_test_split(X, y, test_size=0.2, random_state=42)

In [1130]:
def bool_arr(arr, limit=0.5):
    y_bool = []
    for num in arr:
        if num >= limit:
            y_bool.append(1)
        else:
            y_bool.append(0)
    return np.array(y_bool)
y_train = bool_arr(y_train_continuous, 0.05)
y_test = bool_arr(y_test_continuous, 0.00)

In [1131]:
from sklearn.feature_selection import SelectFpr
from sklearn.model_selection import GridSearchCV, cross_val_score

svc_train_pipeline = Pipeline([
    ('reduce_false_pos', SelectFpr(alpha=0.1)),
    ('svc', GridSearchCV(estimator=SVC(gamma='auto'), param_grid=dict(C=np.logspace(-1,3)), n_jobs=1))
])
svc_train_pipeline.fit(X_train, y_train)
svc_train_pipeline.score(X_test, y_test)

0.44736842105263158

In [1132]:
from sklearn.svm import SVC

clf = SVC(C=100)
# clf = RandomForestRegressor(n_estimators=200, max_depth=3, verbose=1, n_jobs=2)
clf.fit(X_train, y_train)



SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [1133]:
from keras import regularizers
from sklearn.metrics import confusion_matrix

In [1151]:
class DeepEstimator():
    def __init__(self):
        return None
    def fit(self, X, y, X_val, y_val):
        class_weight = {0: 0.1,
                1: 0.9}
        self.model = self.__define_model(X)
        sched = [[0.0001, 2], [0.001, 20], [0.01, 2], [0.1, 2], [0.5, 1], [0.1, 5], [0.01, 20], [0.001, 40], [0.0001, 80], [0.00005, 120]]
        for lr, epochs in sched:
            self.model.optimizer.lr = lr
            self.model.fit(np.array(X), np.array(y), epochs=epochs,  class_weight=class_weight, batch_size=64, validation_data=(X_val, y_val))
        return self
    def predict(self, X, y=None):
        return self.model.predict(X)
    def __define_model(self, X):
        shape = X.shape[1]
        model = Sequential([
            BatchNormalization(input_shape=(shape,)),
            Dense(25, activation='relu', kernel_regularizer=regularizers.l1(0.01)),
            Dropout(0.8),
            BatchNormalization(),
            Dense(1, activation='sigmoid')   
        ])

        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model
    

In [1157]:
X_train, X_test, y_train_continuous, y_test_continuous = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = bool_arr(y_train_continuous, 0.05)
y_test = bool_arr(y_test_continuous, 0.00)

select_fpr = SelectFpr(alpha=0.1)

X_train = select_fpr.fit_transform(X_train, y_train)
X_test = select_fpr.transform(X_test)

DeepEstimator().fit(X_train, y_train, X_test, y_test)



Train on 150 samples, validate on 38 samples
Epoch 1/2
Epoch 2/2
Train on 150 samples, validate on 38 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 150 samples, validate on 38 samples
Epoch 1/2
Epoch 2/2
Train on 150 samples, validate on 38 samples
Epoch 1/2
Epoch 2/2
Train on 150 samples, validate on 38 samples
Epoch 1/1
Train on 150 samples, validate on 38 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 150 samples, validate on 38 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 150 samples, validate on 38 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epo

Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train on 150 samples, validate on 38 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80


Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Train on 150 samples, validate on 38 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120


Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120


Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120


<__main__.DeepEstimator at 0x1a12e3ec50>

In [1161]:
X_test.shape

(38, 924)

In [1159]:
deep_train_pipeline = Pipeline([
    ('reduce_false_pos', select_fpr),
    ('deep', DeepEstimator())
])

In [1162]:
score(deep_train_pipeline, X_test, y_test_continuous,0.5)

ValueError: X has a different shape than during fitting.

In [1160]:
import matplotlib.pyplot as plt
%matplotlib inline

def score(est, X, y, cutoff=0.75):
    y_pred = est.predict(X)
    # Invest if over 50% confident 
    y_pred_bool = bool_arr(y_pred, cutoff)
    # Score correct if you make postive returns
    y_true_bool = bool_arr(y, 0)
    total = np.dot(y_pred_bool.reshape(X.shape[0],), y)
    buy_all = np.dot(np.ones(X.shape[0]), y)
    av = total/len(X)
    buy_all_av = buy_all/len(X)
#     print(confusion_matrix(y_true_bool, y_pred_bool))
    return [av, buy_all_av]


ys_attained = []
ys_potential = []
xs = []
for i in np.linspace(0, 1):
    sc = score(deep_train_pipeline, X_test, y_test_continuous,i)
    ys_attained.append(sc[0])
    ys_potential.append(sc[1])
    xs.append(i)
plt.plot(xs, ys_attained)
plt.plot(xs, ys_potential)


ValueError: X has a different shape than during fitting.

In [None]:
bool_arr(deep_train_pipeline.predict(X_test))

In [None]:
# # Factors to look into including: 
# "BookValue" = (Total Assets - Total Liabilities) / Number of shares outstanding
# "MarketCap" = Market price per share * number of shares 
# "DividendYield" = Dividend / Market price per share 
# "EarningsPerShare" 
# "PERatio2" = Market price per share / earning per share 
# "priceBook" = Market price per share / ((Total Assets - Total Liabilities) / Number of shares outstanding)
# "PriceSales" = MarketCap / Revenue 
# "Ask"


In [None]:
# vocab = list(text_word_counts.steps[2][1].vocabulary_.keys())
# import operator
# iv_dict = [[vocab[i],-float(f)] for i,f in enumerate(clf.coef_)]
# most_important_terms = sorted(iv_dict, key=operator.itemgetter(1))[0:100]
# print(most_important_terms)
# most_important_vocab = dict(most_important_terms).keys()

In [None]:
# from joblib import Memory
# %mkdir cachedir
# location = './cachedir'
# memory = Memory(location, verbose=0)
# stock_name_date_mapping = memory.cache(stock_name_date_mapping)

In [None]:
# # print(text_word_counts.steps[1][1].vocabulary_)
# def get_excepted_stock_names(limit=-1):
#     esn = {}
#     for i, file in tqdm(enumerate(glob.glob('filing_texts/*')[0:limit])):
#         stock_name = stock_name_from_filename(file)
#         file_data = open(file, 'r').read()
#         try:
#             with open(file) as open_file:
#                 file_data = [next(open_file) for x in range(3)]
#             file_data = ''.join(file_data)
#             time_data = file_data.split("\n")[2][5:14]
#         except:
#             esn[i] = stock_name
#             continue
#     return esn

# def get_x_mask(X, y, filenames):
#     no_date_indices = []
#     if X.shape[0] != len(y):
#         stock_names = FilenamesToStockNamesTransformer().transform(filenames)
#         for i, stock_name in enumerate(stock_names): 
#             try:
#                 date = spm[stock_name]['date']
#             except:
#                 no_date_indices.append(i)
#         mask = np.ones(X.shape[0], dtype=bool)
#         mask[no_date_indices] = False
#         X_mask = X[mask]
#     else:
#         X_mask = X
#     return X_mask