# About

Read in stock analysis posts for a chosen range of dates and compile them to
a list of features grouped by the date.

Features are the words listed in *features.json*, and each feature value is the
number of times that word occurs in the post titles for a given date.

In [12]:
import json
from os.path import join

import pandas as pd
import numpy as np

## Load data

In [27]:
# Path components (relative to the notebook) of the dataset folder and of the
# analyst-ratings CSV inside it; os.path.join assembles them below.
SOURCE_DATA_FOLDER = ['..', 'datasets']
SOURCE_DATA = ['kaggle','miguelaenlle','analyst_ratings_processed.csv']

# Read the CSV and discard every row that has a missing value in any column.
source_csv_path = join(*SOURCE_DATA_FOLDER, *SOURCE_DATA)
df_src = pd.read_csv(source_csv_path).dropna()

# Row count is reported after the dropna above.
print('Unfiltered Rows:', df_src.shape[0])
print(df_src.head())

Unfiltered Rows: 1397891
   Unnamed: 0                                              title  \
0         0.0            Stocks That Hit 52-Week Highs On Friday   
1         1.0         Stocks That Hit 52-Week Highs On Wednesday   
2         2.0                      71 Biggest Movers From Friday   
3         3.0       46 Stocks Moving In Friday's Mid-Day Session   
4         4.0  B of A Securities Maintains Neutral on Agilent...   

                        date stock  
0  2020-06-05 10:30:00-04:00     A  
1  2020-06-03 10:45:00-04:00     A  
2  2020-05-26 04:30:00-04:00     A  
3  2020-05-22 12:45:00-04:00     A  
4  2020-05-22 11:38:00-04:00     A  


In [14]:
# Load the feature vocabulary from features.json in the dataset folder.
# The file is opened in binary mode; json.load accepts a binary file object.
features_path = join(*SOURCE_DATA_FOLDER, 'features.json')
with open(features_path, 'rb') as features_file:
    features = json.load(features_file)

print('Feature count:', len(features))
print(features)

Feature count: 491


## Process data

### Filter to the chosen date range.

In [15]:
# Year-month prefixes of the dates to keep.
MONTHS = ('2020-01','2020-02','2020-03')

# The 'date' column is matched as text: a row is kept when its date string
# starts with any one of the prefixes in MONTHS.
bool_filter = df_src['date'].str.startswith(MONTHS)
df_filtered = df_src.loc[bool_filter]

print(f'{MONTHS} has {df_filtered.shape[0]} rows.')

('2020-01', '2020-02', '2020-03') has 59852 rows.


### Group by date

In [16]:
# Keep only the first 10 characters of each date string (the YYYY-MM-DD part,
# dropping the time and UTC offset) so posts from the same day share a key.
day_only = df_filtered['date'].str[:10]

# Two-column frame (date, title) grouped by the truncated date.
df_date_title = pd.concat([day_only, df_filtered['title']], axis=1)
df_by_date = df_date_title.groupby(by='date')

In [17]:
# Maps each date to the flat list of lower-cased, whitespace-separated tokens
# taken from every title posted on that date (duplicates are kept).
WORDS_BY_DATE = {}

for date, group in df_by_date:
    WORDS_BY_DATE[date] = [
        token
        for title in group['title']
        for token in title.lower().split()
    ]

### Create feature samples

Create samples from list of word lists by counting words and setting the
corresponding feature value.

In [26]:
# Build one sample per date: [date, count(feature_0), count(feature_1), ...].
#
# Map each feature word to its column position once, so the per-word lookup
# below is a dict access instead of a linear features.index() scan guarded by
# a bare except. setdefault keeps the FIRST position when a word is listed
# more than once, which is the position features.index(word) would return.
feature_index = {}
for idx, feature in enumerate(features):
    feature_index.setdefault(feature, idx)

DATE_FEATURES = []
current = 0
# Progress is counted in dates (the loop unit), not in source rows.
total = len(WORDS_BY_DATE)
for current, (date, words) in enumerate(WORDS_BY_DATE.items(), start=1):
    # One counter slot per feature; words that are not features are ignored.
    row = [0]*len(features)
    for w in words:
        idx = feature_index.get(w)
        if idx is not None:
            row[idx] += 1
    DATE_FEATURES.append([date]+row)
    if current % 5000 == 0:
        print(f'Date {current} of {total}.')

# Report the covered range; the keys are in groupby iteration order.
dates = list(WORDS_BY_DATE.keys())
if dates:
    start_date = dates[0]
    end_date = dates[-1]
    print('Date range:',start_date,'to',end_date,f'({len(dates)} days)')
else:
    # Nothing matched the date filter: avoid an IndexError on dates[0].
    start_date = end_date = None
    print('No posts found for the selected date range.')

Date range: 2020-01-01 to 2020-03-31 (91 days)


In [28]:
# Assemble the per-date samples into a DataFrame: a DATE column followed by
# one column per feature word.
print('Creating DataFrame.')
feature_columns = ['DATE'] + features
df_features = pd.DataFrame(DATE_FEATURES, columns=feature_columns)

# Persist next to the other datasets. to_csv is called with its defaults, so
# the DataFrame index is written as an extra leading column.
print('Writing DataFrame to file.')
output_csv_path = join(*SOURCE_DATA_FOLDER, 'news_features.csv')
df_features.to_csv(output_csv_path)
print('Bye.')

Creating DataFrame.
Writing DataFrame to file.
Bye.
