In [12]:
import glob
import os 
import pandas as pd
import datetime
from scipy.stats import iqr
import numpy as np
import math

from shutil import copyfile
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [14]:
PATH='data/language_model/'

TRN_PATH = 'train/all/'
VAL_PATH = 'test/all/'
TRN = f'{PATH}{TRN_PATH}' 
VAL = f'{PATH}{VAL_PATH}' 

In [15]:
class PriceData:
    def __init__(self, stock_name):
        self.stock_name = stock_name
        try:
            self.price_data = pd.read_csv('prices/' + stock_name + '.csv')
        except:
            self.price_data = pd.DataFrame.from_dict({})

    def on_date(self, date, market_time = 'open'): 
        try:
            return float(self.price_data.loc[self.price_data['date'] == date][market_time])
        except: 
            return None

In [16]:
class X_y_mapping():
    def __init__(self, start_int, end_int):
        self.filing_int = start_int * -1
        self.start_int = start_int
        self.end_int = end_int
        self.range = (range(start_int, end_int))
        
    def __stock_name_from_filename(self, filename):
        return filename.split('/')[-1]
        
    def transform(self):
        stock_names = [self.__stock_name_from_filename(filename) for filename in glob.glob('filing_texts/*')]
        price_ranges = self.__get_price_ranges(stock_names)
        X_y = {}
        for stock_name in price_ranges.keys():
            cp_ratio = self.__comparison_price_ratio(price_ranges, stock_name)
            if cp_ratio and not math.isnan(cp_ratio):
                X_y[stock_name] = cp_ratio
        return X_y
    
    
    def __get_price_ranges(self, stock_names):
        h = {}
        for i, stock_name in enumerate(stock_names):
            try:
                date = stock_name.split('_')[1]
            except:
                continue # Ignore stocks which don't have a date
            dates = []
            for delta in self.range:
                date_delta = datetime.timedelta(days=delta)
                date_string = datetime.datetime.strptime(date, '%Y-%m-%d').date()
                dates.append(str(date_string + date_delta))
            h[stock_name] = dates
        return h
    
    def __comparison_price_ratio(self,h, stock_name_with_date):
        earliest_price_after_filing = None
        hist_p = []
        stock_name = stock_name_with_date.split('_')[0]
        for i, date in enumerate(h[stock_name_with_date]):
            price = PriceData(stock_name).on_date(date, 'open')
            if price and not math.isnan(price):
                if i > self.filing_int:
                    earliest_price_after_filing = earliest_price_after_filing or price
                else:
                    hist_p.append(price)
        # Closing price on day of filing
        price_close_filing = PriceData(stock_name).on_date(h[stock_name_with_date][self.filing_int], 'close')
        # Use either the next open day of trading or the close price on day of filing
        comparison_price = earliest_price_after_filing or price_close_filing
        # Remove nans from historical prices before taking the mean
        this_mean = np.mean(hist_p)
        if comparison_price and this_mean:
            return ((comparison_price-this_mean)/this_mean)
        else:
            return None

In [17]:
spm = X_y_mapping(-5,2).transform()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [18]:
len(spm.keys())

499

In [19]:
X = np.array(list(spm.keys()))
y = np.array(list(spm.values()))

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33)

In [72]:
import glob as glob
from bs4 import BeautifulSoup as Soup

def stringify_file(filename):
    res = open(filename)
    soup = Soup(res,'html.parser')
    text = []
    for p in soup.find_all('p'):
        text.append(p.text.replace("\\n", ' -_n_- '))
    string = ' '.join(text)
    return string.encode('utf-8', "backslashreplace").decode('utf-8')

In [73]:
os.makedirs('data/language_model', exist_ok=True)
% rm -rf 'data/language_model/all'
% rm -rf 'data/language_model/train'
% rm -rf 'data/language_model/val'

os.makedirs(f'proc_filing_texts', exist_ok=True)
os.makedirs(f'{PATH}train/yes', exist_ok=True)
os.makedirs(f'{PATH}val/yes', exist_ok=True)
os.makedirs(f'{PATH}train/no', exist_ok=True)
os.makedirs(f'{PATH}val/no', exist_ok=True)
os.makedirs(f'{PATH}all/train', exist_ok=True)
os.makedirs(f'{PATH}all/val', exist_ok=True)
os.makedirs(f'{PATH}models', exist_ok=True)

atp = 'data/language_model/all/train'
ytp = 'data/language_model/train/yes'
ntp = 'data/language_model/train/no'

avp = 'data/language_model/all/val'
yvp = 'data/language_model/val/yes'
nvp = 'data/language_model/val/no'

threshold = 0.1

for i, filename in enumerate(X):
    sf = stringify_file(f'filing_texts/{filename}')
    f = open(f'proc_filing_texts/{filename}', 'w+')
    f.write(sf)
    f.close()

for i, filename in enumerate(X_train):
    copyfile(f'proc_filing_texts/{filename}', f'{atp}/{filename}.txt')
    if y_train[i] >= threshold:
        copyfile(f'proc_filing_texts/{filename}', f'{ytp}/{filename}.txt')
    else:
        copyfile(f'proc_filing_texts/{filename}', f'{ntp}/{filename}.txt')
for i, filename in enumerate(X_test):
    copyfile(f'proc_filing_texts/{filename}', f'{avp}/{filename}.txt')
    if y_train[i] >= threshold:
        copyfile(f'proc_filing_texts/{filename}', f'{yvp}/{filename}.txt')
    else:
        copyfile(f'proc_filing_texts/{filename}', f'{nvp}/{filename}.txt')