# Alpha by NLP: Augmenting stock data with Alpha values

Using a csv file with the schema:  
`   0      1      2      3      4      5      6      7      8      9      10`  
`Date   Symbol Return`  
We will create a csv file with the schema:  
`   0      1      2      3      4      5      6      7      8      9      10`  
`Date   Symbol Return                                    Alpha`  

Running this notebook will take `/data/Stock_Returns/Original/stocks.csv` and augment each row with alpha values, i.e. the result of r<sub>stock</sub> - $\beta$r<sub>market</sub>, and create file `/data/Stock_Returns/Stocks_Plus_Alphas.csv`.

In [1]:
import csv
import os.path
import numpy as np
from datetime import datetime

dirname = 'data'
dirname2 = 'Stock_Returns'
dateformat = '%Y-%M-%d'

def write_csv_with_alphas(rows):
    with open(os.path.join(dirname, dirname2, 'SPY.csv'), 'rt') as SPY_File:
        SPY_Rows = []
        reader = csv.reader(SPY_File, delimiter=',')
        for row in reader:
            SPY_Rows.append(row)
        
        assert(len(rows) > 0)
        SPY_earliest_date_str = SPY_Rows[0][0]
        SPY_latest_date_str = SPY_Rows[-1][0]
        other_earliest_date_str = rows[0][0]
        other_latest_date_str = rows[-1][0]
        
        # strptime converts a string to a date
        SPY_earliest_date = datetime.strptime(SPY_earliest_date_str, dateformat) 
        SPY_latest_date = datetime.strptime(SPY_latest_date_str, dateformat)
        other_earliest_date = datetime.strptime(other_earliest_date_str, dateformat)
        other_latest_date = datetime.strptime(other_latest_date_str, dateformat)
        
        if (other_earliest_date > SPY_earliest_date):
            start_date = other_earliest_date
        else: # SPY_earliest_date >= other_earliest_date
            start_date = SPY_earliest_date
        if (other_latest_date < SPY_latest_date):
            end_date = other_latest_date
        else: # SPY_latest_date <= other_latest_date
            end_date = SPY_latest_date
            
        # strftime converts a date to a string
        start_date_str = datetime.strftime(start_date, dateformat)
        end_date_str = datetime.strftime(end_date, dateformat)
        
        reached_start_date = False
        reached_end_date = False
            
        SPY_Returns = []
        SPY_Rows1 = []
        for row in SPY_Rows:
            if (row[0] == start_date_str):
                reached_start_date = True
            if (row[0] == end_date_str):
                reached_end_date = True
            if (reached_start_date and not reached_end_date):
                SPY_Returns.append(row[2])
                SPY_Rows1.append(row)
            
        reached_start_date = False
        reached_end_date = False
            
        Other_Returns = []
        Other_Rows = []
        for row in rows:
            if (row[0] == start_date_str):
                reached_start_date = True
            if (row[0] == end_date_str):
                reached_end_date = True
            if (reached_start_date and not reached_end_date):
                Other_Returns.append(row[2])
                Other_Rows.append(row)
                
#         assert(len(SPY_Returns) == len(Other_Returns))  

#         print(len(SPY_Returns))
#         print(len(Other_Returns))
#         if (len(Other_Returns) != len(SPY_Returns)):
#             print(SPY_Rows)
#             print(rows)

        if (len(Other_Returns) != len(SPY_Returns)):
            if(len(Other_Rows)!=0):
                print("{} failed.".format(Other_Rows[0][1]))
            else:
                print("UNK failed")
            return

        SPY_Returns_arr = np.array(SPY_Returns, dtype=np.float32)
        Other_Returns_arr = np.array(Other_Returns, dtype=np.float32)
        # np.cov returns [cov(a,a) cov(a,b)
        #                 cov(a,b) cov(b,b)]
        cov = np.cov(SPY_Returns_arr, Other_Returns_arr)
        beta = cov[0][1]/cov[0][0]

        
    with open(os.path.join(dirname, dirname2, 'Stocks_Plus_Alphas.csv'), 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        
        assert(len(SPY_Returns) == len(Other_Returns))   
        for i in range(len(SPY_Returns)):
            alpha = Other_Returns_arr[i] - beta * SPY_Returns_arr[i]
            writer.writerow(Other_Rows[i] + ['', '', '', '', ''] + [alpha])

In [3]:
dirname3 = 'Original'

if (os.path.isfile(os.path.join(dirname, dirname2, 'Stocks_Plus_Alphas.csv'))):
    print("Please delete Stock_Plus_Alphas.csv before proceeding so we can regenerate the file.")
else:
    with open(os.path.join(dirname, dirname2, dirname3, 'stocks.csv'), 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        rows = []
        last_symbol = ''
        for row in reader:
            if(row[0] == 'date'):
                pass
            else:
                if (last_symbol != row[1]): # We have reached a new stock symbol
                    if (len(rows) >= 2):
                        write_csv_with_alphas(rows)
                    rows = []
                rows.append(row)
                last_symbol = row[1]

IBM failed.
ISRG failed.
LKQ failed.
MJN failed.
MTD failed.
MHK failed.
ORLY failed.
REGN failed.
SRCL failed.
URBN failed.
VRTX failed.
AABA failed.
AAPC failed.
ABAC failed.
ABCD failed.
ABEO failed.
ABIL failed.
ABIO failed.
ABTL failed.
ACNB failed.
ACST failed.
ADMA failed.
ADMP failed.
ADOM failed.
AEHR failed.
AEMD failed.
AETI failed.
AEZS failed.
AHPA failed.
AHPI failed.
AIRG failed.
AIRT failed.
AKER failed.
AKTS failed.
AKTX failed.
ALBO failed.
ALDX failed.
ALQA failed.
ALT failed.
AMDA failed.
AMMA failed.
ANDA failed.
APDN failed.
APOP failed.
APPN failed.
APPS failed.
APRI failed.
APTO failed.
AQB failed.
AQMS failed.
ARCI failed.
ARCW failed.
ARDM failed.
ARGX failed.
ARIS failed.
ARTW failed.
ASMB failed.
ASTC failed.
ASUR failed.
ASV failed.
ATHX failed.
ATLO failed.
ATNX failed.
ATOM failed.
ATOS failed.
ATRS failed.
ATTU failed.
AVXL failed.
AXAR failed.
AXAS failed.
AXDX failed.
AXGN failed.
AYA failed.
AZRX failed.
BASI failed.
BCLI failed.
BCTF failed.
BDSI fai

  c *= 1. / np.float64(fact)
  c *= 1. / np.float64(fact)


EFOI failed.
EGAN failed.
EGBN failed.
EGT failed.
EKSO failed.
ELEC failed.
ELSE failed.
ELTK failed.
EMCF failed.
ENG failed.
ENT failed.
EPIX failed.
EQFN failed.
ERI failed.
ESBK failed.
ESEA failed.
ESES failed.
ESXB failed.
ETRM failed.
EVAR failed.
EVBS failed.
EVGBC failed.
EVLMC failed.
EVOK failed.
EVOL failed.
EVSTC failed.
EXAS failed.
EYEG failed.
EYES failed.
FALC failed.
FBIO failed.
FBSS failed.
FCAL failed.
FCAP failed.
FCCO failed.
FCSC failed.
FFHL failed.
FH failed.
FHCO failed.
FLIC failed.
FLL failed.
FMAO failed.
FMCI failed.
FMNB failed.
FNJN failed.
FNTE failed.
FOANC failed.
FONR failed.
FORD failed.
FORK failed.
FPAY failed.
FRP failed.
FRSX failed.
FSBC failed.
FSBW failed.
FSFG failed.
FSNN failed.
FTFT failed.
FUNC failed.
FUSB failed.
FVE failed.
GALE failed.
GALT failed.
GBT failed.
GCBC failed.
GENE failed.
GGAL failed.
GIGA failed.
GIGM failed.
GLBS failed.
GLBZ failed.
GLMD failed.
GNUS failed.
GNVC failed.
GPAC failed.
GPIA failed.
GRBK failed.
GROW 