In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
import glob
import os 
import pandas as pd
import datetime
from scipy.stats import iqr
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

spm = None
english_words = None
excepted_stock_names = []
limit = -1

In [2]:
import keras
from keras.layers import Dense, Dropout, RepeatVector, BatchNormalization, Convolution1D, Flatten, Lambda, Permute, MaxPooling1D, AlphaDropout
from keras.models import Sequential
from keras.utils import to_categorical
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras.models import load_model
from tqdm import *

Using TensorFlow backend.


In [3]:
def stock_price_mapping(limit=-1):
    spm = {}
    for i, file in enumerate(glob.glob('8k-gz/*')[0:limit]):
        stock_name = file.split('/')[-1]
        file_data = open(file, 'r').read()
        time_data = ""
        try:
            with open(file) as open_file:
                file_data = [next(open_file) for x in range(3)]
            file_data = ''.join(file_data)
            time_data = file_data.split("\n")[2][5:14]
        except:
            print("No data for ", stock_name)
            excepted_stock_names.append(stock_name)
            continue
        year = time_data[0:4]
        month = time_data[4:6]
        day = time_data[6:8]
        date_stamp = ("%s-%s-%s" %(year, month, day))
        spm[stock_name] = {'date': date_stamp}
    return spm

In [4]:
from joblib import Memory
%mkdir cachedir
location = './cachedir'
memory = Memory(location, verbose=0)
stock_price_mapping = memory.cache(stock_price_mapping)

mkdir: cachedir: File exists


In [5]:
class PriceData:
    def __init__(self, stock_name):
        self.stock_name = stock_name
        self.price_data = pd.read_csv('price_history/' + stock_name + '.csv')

    def on_date(self, date, market_time = 'Open'): 
        try:
            return float(self.price_data.loc[self.price_data['Date'] == date][market_time])
        except: 
            return None


In [6]:
class FilenamesToStockNamesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        output = []
        for filename in X:
            stock_name = filename.split('/')[-1]
            output.append(stock_name)
        return output
class MapStockNamesToDatesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, start_int, end_int):
        self.start_int = start_int
        self.end_int = end_int
        self.range = (range(start_int, end_int))
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        output = {}
        for stock_name in X:
            try:
                date = spm[stock_name]['date']
            except: 
                continue # Ignore stocks which don't have a date
            dates = []
            for delta in self.range:
                date_delta = datetime.timedelta(days=delta)
                date_string = datetime.datetime.strptime(date, '%Y-%m-%d').date()
                dates.append(str(date_string + date_delta))
            output[stock_name] = dates
        return output
class StockNameDatesMapToPricesListTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        output = []
        for stock_name in X.keys():
            prices = []
            for date in X[stock_name]:
                prices.append(PriceData(stock_name).on_date(date, 'Close'))
            output.append(prices)
        return np.array(output)
class LabelsTransform(BaseEstimator, TransformerMixin):
    # Returns the interquartile-range and median.
    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        # ldcom = last_day_change_over_median
        self.ldcom = []
        for prices in X:
            this_median = np.median(prices[0:-3])
            self.ldcom.append(((prices[-1]-this_median)/this_median))
        return self
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        
        return np.array(self.ldcom)
        
    def transform(self, X):
        return self
            
class StatisticalMeasuresTransformer(BaseEstimator, TransformerMixin):
    # Returns the interquartile-range and median.
    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        # ldcom = last_day_change_over_median
        
        output = []
        self.iqr_var = []
        self.median = []
        for prices in X:
            this_iqr = iqr(prices[0:-3])
            this_median = np.median(prices[0:-3])
            self.iqr_var.append(this_iqr)
            self.median.append(this_median)
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        X_output = []
        for i, prices in enumerate(X):
            stats   = []
            iqr_var = self.iqr_var[i] or iqr(prices)
            median  = self.median[i]  or np.median(prices)
            
            stats.append(iqr_var)
            stats.append(median)
            
            X_output.append(stats)
        return np.array(X_output)
class SparseToArray(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        return X.toarray() #[ar.toarray() for ar in X]
    
class ReadFiles(BaseEstimator, TransformerMixin): 
    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        return (open(filename, 'r').read() for filename in tqdm(X))
class CustomCountVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary = None):
        self.dictionary = vocabulary or open('/usr/share/dict/words', 'r').read().split("\n")
        
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        output = []
        for file_contents in X: 
            feature = np.zeros(len(self.dictionary))
            ar = file_contents.split(' ')
            ar = [re.sub("<|>", " ", b.lower()) for b in ar]
            for word in ar:
                try:
                    index = self.dictionary.index(word)
                    feature[index] = feature[index] + 1
                except:
                    continue
            output.append(feature)
        return output
class DocLengthNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None 
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        output = []
        for feature in X: 
            new_feature = np.zeros(len(feature))
            feature_sum = float(sum(feature))
            for i,fi in enumerate(feature):
                new_feature[i] = fi/feature_sum
            output.append(new_feature)
        return output
class CustomTfIdf(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None 
    
    def fit(self, X, y=None):
        self.column_averages = np.sum(X, axis=0)/len(X)
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        output = []
        for feature in X: 
            new_feature = np.zeros(len(feature))
            for i,fi in enumerate(feature):
                new_feature[i] = (fi/self.column_averages[i])
            output.append(new_feature)
        return output
class ToNpArray(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None 
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
        
    def transform(self, X):
        return np.array(X)
            

In [7]:
def get_filenames(limit = -1):
    filenames = []
    directory_files = glob.glob('8k-gz/*')
    excepted_files = [('8k-gz/' + sn) for sn in excepted_stock_names]
    filenames_with_data = [x for x in directory_files[:limit] if x not in excepted_files]
    for filename in filenames_with_data:
        filenames.append(filename)
    return filenames

In [8]:
spm = stock_price_mapping(limit)

filenames = get_filenames(limit)

In [9]:
spm

{'A': {'date': '2002-05-17'},
 'AA': {'date': '2002-07-01'},
 'AAN': {'date': '2003-04-08'},
 'AAON': {'date': '2002-06-26'},
 'AAP': {'date': '2002-07-03'},
 'AAPL': {'date': '2002-08-02'},
 'ABAX': {'date': '2002-09-04'},
 'ABC': {'date': '2002-08-14'},
 'ABFS': {'date': '2002-05-17'},
 'ABM': {'date': '2002-07-12'},
 'ABT': {'date': '2002-08-13'},
 'ACAT': {'date': '2003-05-20'},
 'ACC': {'date': '2004-08-19'},
 'ACE': {'date': '2002-08-14'},
 'ACI': {'date': '2002-07-19'},
 'ACIW': {'date': '2002-06-05'},
 'ACM': {'date': '2007-05-10'},
 'ACN': {'date': '2009-09-01'},
 'ACO': {'date': '2003-04-21'},
 'ACXM': {'date': '2002-05-16'},
 'ADBE': {'date': '2002-10-15'},
 'ADI': {'date': '2002-08-06'},
 'ADM': {'date': '2002-09-20'},
 'ADP': {'date': '2002-09-16'},
 'ADS': {'date': '2002-07-17'},
 'ADSK': {'date': '2003-05-22'},
 'ADTN': {'date': '2003-04-14'},
 'ADVS': {'date': '2002-04-29'},
 'AEE': {'date': '2002-05-30'},
 'AEGN': {'date': '2002-06-10'},
 'AEIS': {'date': '2002-05-21'}

In [12]:
# Used to build other pipelines

prices_pipeline = Pipeline([
    ('filenames_to_stock_names', FilenamesToStockNamesTransformer()),
    ('stock_names_to_dates', MapStockNamesToDatesTransformer(-5, 2)),
    ('dates_to_prices_transformer', StockNameDatesMapToPricesListTransformer()),
    ('imputer', SimpleImputer())
])

# Used as the final y values 
labels_pipeline = Pipeline([
    ('prices_pipeline', prices_pipeline),
    ('labels_transform', LabelsTransform())
])

# Used for training and test set features
stock_stats = Pipeline([
    ('prices_pipeline', prices_pipeline),
    ('stats_transform', StatisticalMeasuresTransformer())
])

custom_vocabulary = ['lawsuit', 'lawyer', 'firm', 'split', 'report', 'intended', 'seed', 'weak', 'increase', 'growth', 'new', 'strong', 'forward', 'well', 'grow', 'product', 'future', 'we', 'charge', 'loss', 'lower', 'decline', 'down', 'reduce', 'layoff', 'adjust', 'regulation', 'offset', 'reduction', 'while']
custom_vocabulary2 = ['subject conditions', 'generality foregoing', 'preserve', 'effects', 'lines', 'appointment principal', 'shall fail', 'issue date', 'information provided', 'deal', 'governing', 'memorandum', 'behalf company', 'containing', 'arise', 'term agreement', 'marks', 's subsidiaries', 'consent approval', 'lawful', 'misconduct', 'products services', 'event shall', 'prepared', 'duly', 'weeks', 'offshore', 'person s', 'report', 'eastern time', 'subject liabilities', 'latest', 'legal', 'partner', 'growth', 'marketing', 'language', 'general economic', 'board', 'fund', 'documentation', 'earnings', 'services', 'uniform', 'temporary', 'reconciliation', 'supporting', 'beneficially', 'unit', 'creditors rights', 'sales growth', 'substitute', 'sales million', 'section article', 'non u', 'affirmative vote', 'consideration', 'capitalized terms', 'june', 'covered', 'indemnified', 'items', 'local', 'directors', 'disposed', 'expenditures', 'notifies', 'sufficient', 'receives', 'comprehensive income', 'statements include', 'condensed', 'purposes determining', 'properties assets', 'insolvency', 'good reason', 'employment agreement', 'substances', 'securities', 'procedure', 'obligations shall', 'assessments', 'property rights', 'payable hereunder', 'continues', 'cumulative', 'company person', 'prospectus', 'earlier', 'integrated', 'receivables', 'pro rata', 'statements involve', 'list', 'manager', 'plan year', 'item', 'remaining']
# stop_words = 'english',
# CountVectorizer can take vocabulary=custom_vocabulary if needed.
# Used for training and test set features 
text_word_counts = Pipeline([
    ('read_files', ReadFiles()),
    ('vect', TfidfVectorizer(
                token_pattern=r"[a-zA-Z]+", 
                min_df = 0.10,
                max_df = 0.80,
                stop_words = 'english',
                max_features=9000,
                ngram_range=(1, 1))),
    ('sparse_to_array', SparseToArray()),
    ('np_array', ToNpArray()),
    ('std_scaler', StandardScaler()),
])

feature_union = FeatureUnion(transformer_list=[
    ('text_tf_idf', text_word_counts),
    ('stats_features', stock_stats),
])

full_pipeline = Pipeline([
    ('full_pipeline', feature_union)
])

In [16]:
# stock_stats # text_word_counts
X = text_word_counts.fit_transform(get_filenames(limit))



  0%|          | 0/1499 [00:00<?, ?it/s][A[A

  0%|          | 1/1499 [00:00<17:03,  1.46it/s][A[A

  0%|          | 2/1499 [00:01<19:01,  1.31it/s][A[A

  0%|          | 3/1499 [00:01<15:12,  1.64it/s][A[A

  0%|          | 5/1499 [00:02<11:12,  2.22it/s][A[A

  0%|          | 6/1499 [00:02<10:01,  2.48it/s][A[A

  0%|          | 7/1499 [00:02<09:03,  2.74it/s][A[A

  1%|          | 8/1499 [00:03<09:40,  2.57it/s][A[A

  1%|          | 9/1499 [00:03<09:36,  2.58it/s][A[A

  1%|          | 10/1499 [00:03<09:21,  2.65it/s][A[A

  1%|          | 11/1499 [00:04<10:48,  2.29it/s][A[A

  1%|          | 12/1499 [00:04<10:19,  2.40it/s][A[A

  1%|          | 13/1499 [00:05<11:10,  2.22it/s][A[A

  1%|          | 14/1499 [00:06<11:10,  2.21it/s][A[A

  1%|          | 15/1499 [00:07<12:11,  2.03it/s][A[A

  1%|          | 16/1499 [00:08<12:22,  2.00it/s][A[A

  1%|          | 17/1499 [00:08<12:20,  2.00it/s][A[A

  1%|▏         | 19/1499 [00:08<11:40,  2.11it

 10%|█         | 151/1499 [01:25<12:43,  1.76it/s][A[A

 10%|█         | 152/1499 [01:25<12:41,  1.77it/s][A[A

 10%|█         | 153/1499 [01:26<12:37,  1.78it/s][A[A

 10%|█         | 154/1499 [01:26<12:39,  1.77it/s][A[A

 10%|█         | 155/1499 [01:27<12:36,  1.78it/s][A[A

 10%|█         | 156/1499 [01:28<12:38,  1.77it/s][A[A

 10%|█         | 157/1499 [01:28<12:37,  1.77it/s][A[A

 11%|█         | 158/1499 [01:28<12:33,  1.78it/s][A[A

 11%|█         | 159/1499 [01:29<12:31,  1.78it/s][A[A

 11%|█         | 161/1499 [01:29<12:24,  1.80it/s][A[A

 11%|█         | 162/1499 [01:30<12:24,  1.80it/s][A[A

 11%|█         | 163/1499 [01:30<12:20,  1.80it/s][A[A

 11%|█         | 164/1499 [01:30<12:19,  1.81it/s][A[A

 11%|█         | 165/1499 [01:31<12:19,  1.80it/s][A[A

 11%|█         | 166/1499 [01:31<12:16,  1.81it/s][A[A

 11%|█         | 167/1499 [01:31<12:13,  1.82it/s][A[A

 11%|█         | 168/1499 [01:32<12:12,  1.82it/s][A[A

 11%|█▏       

 20%|██        | 302/1499 [02:40<10:34,  1.89it/s][A[A

 20%|██        | 303/1499 [02:42<10:39,  1.87it/s][A[A

 20%|██        | 304/1499 [02:42<10:38,  1.87it/s][A[A

 20%|██        | 305/1499 [02:43<10:39,  1.87it/s][A[A

 20%|██        | 306/1499 [02:44<10:42,  1.86it/s][A[A

 20%|██        | 307/1499 [02:45<10:41,  1.86it/s][A[A

 21%|██        | 308/1499 [02:46<10:42,  1.85it/s][A[A

 21%|██        | 309/1499 [02:46<10:41,  1.86it/s][A[A

 21%|██        | 310/1499 [02:46<10:39,  1.86it/s][A[A

 21%|██        | 311/1499 [02:46<10:37,  1.86it/s][A[A

 21%|██        | 312/1499 [02:47<10:35,  1.87it/s][A[A

 21%|██        | 313/1499 [02:47<10:34,  1.87it/s][A[A

 21%|██        | 314/1499 [02:47<10:33,  1.87it/s][A[A

 21%|██        | 315/1499 [02:48<10:33,  1.87it/s][A[A

 21%|██        | 316/1499 [02:48<10:31,  1.87it/s][A[A

 21%|██        | 317/1499 [02:49<10:32,  1.87it/s][A[A

 21%|██        | 318/1499 [02:49<10:30,  1.87it/s][A[A

 21%|██▏      

 30%|██▉       | 448/1499 [03:50<09:01,  1.94it/s][A[A

 30%|██▉       | 449/1499 [03:50<08:59,  1.94it/s][A[A

 30%|███       | 450/1499 [03:51<08:59,  1.95it/s][A[A

 30%|███       | 451/1499 [03:51<08:58,  1.95it/s][A[A

 30%|███       | 452/1499 [03:52<08:57,  1.95it/s][A[A

 30%|███       | 453/1499 [03:53<08:58,  1.94it/s][A[A

 30%|███       | 454/1499 [03:53<08:58,  1.94it/s][A[A

 30%|███       | 455/1499 [03:55<08:59,  1.94it/s][A[A

 30%|███       | 456/1499 [03:55<08:59,  1.93it/s][A[A

 30%|███       | 457/1499 [03:56<08:59,  1.93it/s][A[A

 31%|███       | 458/1499 [03:56<08:58,  1.93it/s][A[A

 31%|███       | 459/1499 [03:57<08:57,  1.94it/s][A[A

 31%|███       | 460/1499 [03:57<08:55,  1.94it/s][A[A

 31%|███       | 461/1499 [03:57<08:55,  1.94it/s][A[A

 31%|███       | 463/1499 [03:58<08:54,  1.94it/s][A[A

 31%|███       | 464/1499 [03:59<08:54,  1.94it/s][A[A

 31%|███       | 465/1499 [04:00<08:54,  1.94it/s][A[A

 31%|███      

 40%|████      | 600/1499 [05:03<07:34,  1.98it/s][A[A

 40%|████      | 601/1499 [05:03<07:33,  1.98it/s][A[A

 40%|████      | 602/1499 [05:03<07:32,  1.98it/s][A[A

 40%|████      | 603/1499 [05:04<07:31,  1.98it/s][A[A

 40%|████      | 604/1499 [05:04<07:31,  1.98it/s][A[A

 40%|████      | 605/1499 [05:05<07:31,  1.98it/s][A[A

 40%|████      | 606/1499 [05:05<07:30,  1.98it/s][A[A

 40%|████      | 607/1499 [05:05<07:29,  1.98it/s][A[A

 41%|████      | 608/1499 [05:05<07:28,  1.99it/s][A[A

 41%|████      | 609/1499 [05:06<07:27,  1.99it/s][A[A

 41%|████      | 610/1499 [05:06<07:26,  1.99it/s][A[A

 41%|████      | 611/1499 [05:07<07:26,  1.99it/s][A[A

 41%|████      | 612/1499 [05:08<07:26,  1.99it/s][A[A

 41%|████      | 613/1499 [05:08<07:26,  1.99it/s][A[A

 41%|████      | 614/1499 [05:08<07:25,  1.99it/s][A[A

 41%|████      | 615/1499 [05:09<07:24,  1.99it/s][A[A

 41%|████      | 616/1499 [05:09<07:23,  1.99it/s][A[A

 41%|████     

 50%|████▉     | 749/1499 [06:13<06:13,  2.01it/s][A[A

 50%|█████     | 750/1499 [06:14<06:13,  2.01it/s][A[A

 50%|█████     | 752/1499 [06:14<06:12,  2.01it/s][A[A

 50%|█████     | 753/1499 [06:14<06:11,  2.01it/s][A[A

 50%|█████     | 754/1499 [06:15<06:10,  2.01it/s][A[A

 50%|█████     | 755/1499 [06:15<06:10,  2.01it/s][A[A

 50%|█████     | 756/1499 [06:16<06:09,  2.01it/s][A[A

 51%|█████     | 757/1499 [06:17<06:09,  2.01it/s][A[A

 51%|█████     | 758/1499 [06:17<06:09,  2.01it/s][A[A

 51%|█████     | 759/1499 [06:18<06:08,  2.01it/s][A[A

 51%|█████     | 760/1499 [06:18<06:08,  2.01it/s][A[A

 51%|█████     | 761/1499 [06:19<06:08,  2.00it/s][A[A

 51%|█████     | 762/1499 [06:20<06:07,  2.01it/s][A[A

 51%|█████     | 763/1499 [06:21<06:07,  2.00it/s][A[A

 51%|█████     | 764/1499 [06:21<06:06,  2.00it/s][A[A

 51%|█████     | 765/1499 [06:21<06:05,  2.01it/s][A[A

 51%|█████     | 766/1499 [06:21<06:05,  2.01it/s][A[A

 51%|█████    

 60%|██████    | 900/1499 [07:19<04:52,  2.05it/s][A[A

 60%|██████    | 901/1499 [07:20<04:52,  2.04it/s][A[A

 60%|██████    | 902/1499 [07:21<04:52,  2.04it/s][A[A

 60%|██████    | 903/1499 [07:22<04:52,  2.04it/s][A[A

 60%|██████    | 904/1499 [07:22<04:51,  2.04it/s][A[A

 60%|██████    | 905/1499 [07:23<04:50,  2.04it/s][A[A

 60%|██████    | 906/1499 [07:23<04:50,  2.04it/s][A[A

 61%|██████    | 907/1499 [07:23<04:49,  2.04it/s][A[A

 61%|██████    | 908/1499 [07:24<04:49,  2.04it/s][A[A

 61%|██████    | 909/1499 [07:24<04:48,  2.04it/s][A[A

 61%|██████    | 910/1499 [07:25<04:48,  2.04it/s][A[A

 61%|██████    | 911/1499 [07:25<04:47,  2.04it/s][A[A

 61%|██████    | 912/1499 [07:26<04:47,  2.04it/s][A[A

 61%|██████    | 913/1499 [07:27<04:46,  2.04it/s][A[A

 61%|██████    | 914/1499 [07:27<04:46,  2.04it/s][A[A

 61%|██████    | 915/1499 [07:27<04:45,  2.04it/s][A[A

 61%|██████    | 916/1499 [07:28<04:45,  2.04it/s][A[A

 61%|██████   

 70%|██████▉   | 1047/1499 [08:31<03:40,  2.05it/s][A[A

 70%|██████▉   | 1048/1499 [08:31<03:40,  2.05it/s][A[A

 70%|██████▉   | 1049/1499 [08:32<03:39,  2.05it/s][A[A

 70%|███████   | 1050/1499 [08:33<03:39,  2.05it/s][A[A

 70%|███████   | 1051/1499 [08:33<03:38,  2.05it/s][A[A

 70%|███████   | 1052/1499 [08:34<03:38,  2.05it/s][A[A

 70%|███████   | 1053/1499 [08:35<03:38,  2.04it/s][A[A

 70%|███████   | 1054/1499 [08:35<03:37,  2.04it/s][A[A

 70%|███████   | 1055/1499 [08:36<03:37,  2.04it/s][A[A

 70%|███████   | 1056/1499 [08:36<03:36,  2.04it/s][A[A

 71%|███████   | 1057/1499 [08:36<03:36,  2.05it/s][A[A

 71%|███████   | 1058/1499 [08:37<03:35,  2.04it/s][A[A

 71%|███████   | 1059/1499 [08:37<03:35,  2.05it/s][A[A

 71%|███████   | 1060/1499 [08:37<03:34,  2.05it/s][A[A

 71%|███████   | 1061/1499 [08:38<03:33,  2.05it/s][A[A

 71%|███████   | 1062/1499 [08:39<03:33,  2.04it/s][A[A

 71%|███████   | 1063/1499 [08:40<03:33,  2.04it/s][A[

 80%|███████▉  | 1192/1499 [09:41<02:29,  2.05it/s][A[A

 80%|███████▉  | 1193/1499 [09:41<02:29,  2.05it/s][A[A

 80%|███████▉  | 1194/1499 [09:42<02:28,  2.05it/s][A[A

 80%|███████▉  | 1195/1499 [09:42<02:28,  2.05it/s][A[A

 80%|███████▉  | 1196/1499 [09:43<02:27,  2.05it/s][A[A

 80%|███████▉  | 1197/1499 [09:43<02:27,  2.05it/s][A[A

 80%|███████▉  | 1198/1499 [09:43<02:26,  2.05it/s][A[A

 80%|███████▉  | 1199/1499 [09:44<02:26,  2.05it/s][A[A

 80%|████████  | 1200/1499 [09:44<02:25,  2.05it/s][A[A

 80%|████████  | 1201/1499 [09:44<02:25,  2.05it/s][A[A

 80%|████████  | 1202/1499 [09:45<02:24,  2.05it/s][A[A

 80%|████████  | 1203/1499 [09:45<02:24,  2.05it/s][A[A

 80%|████████  | 1204/1499 [09:46<02:23,  2.05it/s][A[A

 80%|████████  | 1205/1499 [09:46<02:23,  2.05it/s][A[A

 80%|████████  | 1206/1499 [09:47<02:22,  2.05it/s][A[A

 81%|████████  | 1207/1499 [09:47<02:22,  2.06it/s][A[A

 81%|████████  | 1208/1499 [09:47<02:21,  2.06it/s][A[

 89%|████████▉ | 1335/1499 [10:46<01:19,  2.07it/s][A[A

 89%|████████▉ | 1336/1499 [10:46<01:18,  2.07it/s][A[A

 89%|████████▉ | 1337/1499 [10:46<01:18,  2.07it/s][A[A

 89%|████████▉ | 1338/1499 [10:47<01:17,  2.07it/s][A[A

 89%|████████▉ | 1339/1499 [10:47<01:17,  2.07it/s][A[A

 89%|████████▉ | 1340/1499 [10:47<01:16,  2.07it/s][A[A

 89%|████████▉ | 1341/1499 [10:47<01:16,  2.07it/s][A[A

 90%|████████▉ | 1342/1499 [10:48<01:15,  2.07it/s][A[A

 90%|████████▉ | 1343/1499 [10:48<01:15,  2.07it/s][A[A

 90%|████████▉ | 1344/1499 [10:49<01:14,  2.07it/s][A[A

 90%|████████▉ | 1345/1499 [10:49<01:14,  2.07it/s][A[A

 90%|████████▉ | 1346/1499 [10:49<01:13,  2.07it/s][A[A

 90%|████████▉ | 1347/1499 [10:50<01:13,  2.07it/s][A[A

 90%|████████▉ | 1348/1499 [10:51<01:12,  2.07it/s][A[A

 90%|████████▉ | 1349/1499 [10:51<01:12,  2.07it/s][A[A

 90%|█████████ | 1350/1499 [10:52<01:12,  2.07it/s][A[A

 90%|█████████ | 1351/1499 [10:53<01:11,  2.07it/s][A[

 99%|█████████▉| 1482/1499 [11:53<00:08,  2.08it/s][A[A

 99%|█████████▉| 1483/1499 [11:54<00:07,  2.08it/s][A[A

 99%|█████████▉| 1485/1499 [11:54<00:06,  2.08it/s][A[A

 99%|█████████▉| 1486/1499 [11:54<00:06,  2.08it/s][A[A

 99%|█████████▉| 1487/1499 [11:55<00:05,  2.08it/s][A[A

 99%|█████████▉| 1488/1499 [11:56<00:05,  2.08it/s][A[A

 99%|█████████▉| 1490/1499 [11:56<00:04,  2.08it/s][A[A

 99%|█████████▉| 1491/1499 [11:57<00:03,  2.08it/s][A[A

100%|█████████▉| 1492/1499 [11:57<00:03,  2.08it/s][A[A

100%|█████████▉| 1493/1499 [11:57<00:02,  2.08it/s][A[A

100%|█████████▉| 1494/1499 [11:58<00:02,  2.08it/s][A[A

100%|█████████▉| 1495/1499 [11:58<00:01,  2.08it/s][A[A

100%|█████████▉| 1497/1499 [11:58<00:00,  2.08it/s][A[A

100%|█████████▉| 1498/1499 [11:59<00:00,  2.08it/s][A[A

100%|██████████| 1499/1499 [11:59<00:00,  2.08it/s][A[A

[A[A

In [42]:
# print(text_word_counts.steps[1][1].vocabulary_)
def get_excepted_stock_names(limit=-1):
    esn = {}
    for i, file in tqdm(enumerate(glob.glob('8k-gz/*')[0:limit])):
        stock_name = file.split('/')[-1]
        file_data = open(file, 'r').read()
        try:
            with open(file) as open_file:
                file_data = [next(open_file) for x in range(3)]
            file_data = ''.join(file_data)
            time_data = file_data.split("\n")[2][5:14]
        except:
            esn[i] = stock_name
            continue
    return esn
if X.shape[0] != len(y):
    esn = get_excepted_stock_names(limit)
    esn_keys = list(esn.keys())
    mask = np.ones(X.shape[0], dtype=bool)
    mask[esn_keys] = False
    X_mask = X[mask]
else:
    X_mask = X




0it [00:00, ?it/s][A[A[A


1it [00:00,  9.51it/s][A[A[A


2it [00:00,  9.10it/s][A[A[A


8it [00:00, 20.90it/s][A[A[A


11it [00:00, 20.83it/s][A[A[A


13it [00:00, 20.55it/s][A[A[A


15it [00:00, 19.49it/s][A[A[A


17it [00:00, 19.13it/s][A[A[A


21it [00:01, 20.53it/s][A[A[A


25it [00:01, 20.53it/s][A[A[A


29it [00:01, 20.12it/s][A[A[A


32it [00:01, 20.56it/s][A[A[A


35it [00:01, 20.35it/s][A[A[A


39it [00:01, 21.13it/s][A[A[A


42it [00:02, 20.85it/s][A[A[A


45it [00:02, 20.51it/s][A[A[A


48it [00:02, 19.51it/s][A[A[A


51it [00:02, 19.79it/s][A[A[A


54it [00:02, 20.00it/s][A[A[A


57it [00:02, 20.24it/s][A[A[A


60it [00:02, 20.21it/s][A[A[A


65it [00:03, 21.10it/s][A[A[A


68it [00:03, 21.28it/s][A[A[A


71it [00:03, 21.47it/s][A[A[A


74it [00:03, 21.59it/s][A[A[A


77it [00:03, 21.18it/s][A[A[A


80it [00:03, 21.39it/s][A[A[A


83it [00:03, 21.50it/s][A[A[A


86it [00:03, 21.55it/s][

827it [00:34, 24.21it/s][A[A[A


832it [00:34, 24.24it/s][A[A[A


836it [00:34, 24.25it/s][A[A[A


842it [00:34, 24.36it/s][A[A[A


846it [00:34, 24.36it/s][A[A[A


850it [00:34, 24.38it/s][A[A[A


854it [00:34, 24.42it/s][A[A[A


858it [00:35, 24.29it/s][A[A[A


861it [00:35, 24.30it/s][A[A[A


865it [00:35, 24.33it/s][A[A[A


869it [00:35, 24.37it/s][A[A[A


873it [00:35, 24.35it/s][A[A[A


877it [00:35, 24.36it/s][A[A[A


881it [00:36, 24.38it/s][A[A[A


884it [00:36, 24.38it/s][A[A[A


887it [00:36, 24.39it/s][A[A[A


890it [00:36, 24.35it/s][A[A[A


893it [00:36, 24.31it/s][A[A[A


897it [00:36, 24.34it/s][A[A[A


901it [00:37, 24.34it/s][A[A[A


904it [00:37, 24.35it/s][A[A[A


908it [00:37, 24.37it/s][A[A[A


911it [00:37, 24.37it/s][A[A[A


914it [00:37, 24.37it/s][A[A[A


918it [00:37, 24.41it/s][A[A[A


921it [00:37, 24.41it/s][A[A[A


924it [00:37, 24.37it/s][A[A[A


927it [00:38, 24.34it/s][A

In [43]:
y = labels_pipeline.fit_transform(get_filenames(limit))

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_mask, y, test_size=0.2, random_state=42)

In [134]:
from sklearn.linear_model import ElasticNet
clf = ElasticNet(alpha=0.05, l1_ratio=1)
# clf = RandomForestRegressor(n_estimators=200, max_depth=3, verbose=1, n_jobs=2)
clf.fit(X_train, y_train)

ElasticNet(alpha=0.05, copy_X=True, fit_intercept=True, l1_ratio=1,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [135]:
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.953156469834
-22.7764603242


In [136]:
vocab = list(text_word_counts.steps[1][1].vocabulary_.keys())
import operator
iv_dict = [[vocab[i],-float(f)] for i,f in enumerate(clf.coef_)]
most_important_terms = sorted(iv_dict, key=operator.itemgetter(1))[0:100]
print(most_important_terms)
most_important_vocab = dict(most_important_terms).keys()

[['unreimbursed', -3.7543057408883627], ['spans', -1.7097488752967323], ['includable', -1.298795944045096], ['fsa', -0.9673614395996905], ['pr', -0.9666345977775118], ['minnesota', -0.748692944438494], ['caps', -0.6414101529121896], ['enlargement', -0.4263038439588916], ['pleasure', -0.21303439849108552], ['reproduce', -0.14486466904298997], ['undivided', -0.13012196418889913], ['lp', -0.12303089743066463], ['renumbered', -0.1204366015307063], ['advertise', -0.11258195219016943], ['relinquish', -0.11198196122716988], ['writs', -0.11113614778859922], ['alabama', -0.10559097982901974], ['annuity', -0.1047144506910212], ['equaling', -0.10142481681239104], ['incurs', -0.09506172329299663], ['containment', -0.0884043659272103], ['notarial', -0.08618441372935201], ['platforms', -0.07989323675545018], ['highs', -0.07940850037045474], ['facilitating', -0.07841036994482438], ['pledgor', -0.07773465245212034], ['row', -0.07701953188060569], ['heart', -0.07566464666227621], ['sa', -0.074469949140

In [133]:
# vocab = list(text_word_counts.steps[1][1].vocabulary_.keys())
# import operator
# iv_dict = [[vocab[i],-float(f)] for i,f in enumerate(clf.feature_importances_)]
# most_important_terms = sorted(iv_dict, key=operator.itemgetter(1))[0:100]
# print(most_important_terms)
# most_important_vocab = dict(most_important_terms).keys()

AttributeError: 'ElasticNet' object has no attribute 'feature_importances_'

In [137]:
print(X_train.shape)
shape = X_train.shape[1]

m1 = Sequential([
    BatchNormalization(input_shape=(shape,)),
    Dense(100, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(1)   
])

m1.compile(optimizer='rmsprop',
              loss='mean_absolute_error',
              metrics=['mae'])
m1

(1176, 7622)


<keras.models.Sequential at 0x1adbe09ac8>

In [138]:
# sched = [[0.0001, 2], [0.001, 20], [0.01, 2], [0.1, 2], [0.5, 1], [0.1, 5], [0.01, 20], [0.001, 40], [0.0001, 80], [0.00005, 120]]
sched = [[0.0001, 1000]]
for i in range(1):
    for info in sched:
        lr, epochs = info
        m1.optimizer.lr = lr
        m1.fit(np.array(X_train), np.array(y_train), epochs=epochs,  batch_size=64, validation_data=(np.array(X_test), np.array(y_test)))

Train on 1176 samples, validate on 295 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000


Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
 192/1176 [===>..........................] - ETA: 0s - loss: 1.3781 - mean_absolute_error: 1.3781

KeyboardInterrupt: 

In [194]:
def economic_score(feat, labels, clf):
    total = np.dot(m1.predict(feat).reshape(feat.shape[0],), labels)
    av = total/len(feat)
    return av

In [195]:
print(economic_score(X_train, y_train, m1))
print(economic_score(X_test, y_test, m1))
print(economic_score(X_test, y_test, clf))

0.219149071614
0.0830137872535
0.0830137872535


In [189]:
# # Factors to look into including: 
# "BookValue" = (Total Assets - Total Liabilities) / Number of shares outstanding
# "MarketCap" = Market price per share * number of shares 
# "DividendYield" = Dividend / Market price per share 
# "EarningsPerShare" 
# "PERatio2" = Market price per share / earning per share 
# "priceBook" = Market price per share / ((Total Assets - Total Liabilities) / Number of shares outstanding)
# "PriceSales" = MarketCap / Revenue 
# "Ask"


In [171]:
def bool_arr(arr):
    y_bool = []
    for num in arr:
        if num >= 0.1:
            y_bool.append(1)
        else:
            y_bool.append(0)
    return np.array(y_bool)
    

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X_mask, y, test_size=0.2, random_state=42)

In [208]:
from keras import regularizers
y_train_bool = bool_arr(y_train)
y_test_bool = bool_arr(y_test)

shape = X_train.shape[1]

m1 = Sequential([
    BatchNormalization(input_shape=(shape,)),
    Dense(35, activation='relu', kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.5),
    BatchNormalization(),
    Dense(1, activation='sigmoid')   
])

m1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
m1

<keras.models.Sequential at 0x1b107b86d8>

In [209]:
# sched = [[0.0001, 1000]]
sched = [[0.0001, 2], [0.001, 20], [0.01, 2], [0.1, 2], [0.5, 1], [0.1, 5], [0.01, 20], [0.001, 40], [0.0001, 80], [0.00005, 120]]

for i in range(1):
    for info in sched:
        lr, epochs = info
        m1.optimizer.lr = lr
        m1.fit(np.array(X_train), np.array(y_train_bool), epochs=epochs,  batch_size=64, validation_data=(np.array(X_test), np.array(y_test_bool)))

Train on 1176 samples, validate on 295 samples
Epoch 1/2
Epoch 2/2
Train on 1176 samples, validate on 295 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 1176 samples, validate on 295 samples
Epoch 1/2
Epoch 2/2
Train on 1176 samples, validate on 295 samples
Epoch 1/2
Epoch 2/2
Train on 1176 samples, validate on 295 samples
Epoch 1/1
Train on 1176 samples, validate on 295 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 1176 samples, validate on 295 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 1176 samples, validate on 295 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/4

Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train on 1176 samples, validate on 295 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80


Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Train on 1176 samples, validate on 295 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120


Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120


Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120

KeyboardInterrupt: 

In [210]:
economic_score(X_test, y_test, m1)

0.035010527186504929