In [3]:
#library('igraph')
import os
import pandas as pd
import numpy as np
from collections import defaultdict

In [4]:
df = pd.read_csv('../finance_data/Name_sector.csv')
df.head()

Unnamed: 0,Symbol,Sector
0,A,Health Care
1,AAL,Industrials
2,AAP,Consumer Discretionary
3,AAPL,Information Technology
4,ABBV,Health Care


In [5]:
df1 = pd.read_csv('../finance_data/data/BRK.B.csv')
df1.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,5/1/14,128.809998,129.25,128.149994,129.059998,2655500,129.059998
1,5/2/14,129.449997,129.729996,127.82,128.089996,4276700,128.089996
2,5/5/14,127.190002,127.190002,126.080002,126.610001,3486100,126.610001
3,5/6/14,126.129997,126.290001,124.059998,124.669998,5381100,124.669998
4,5/7/14,125.400002,127.529999,125.32,127.449997,3301400,127.449997


In [48]:
def get_closing(stocks):
    p = defaultdict(list)
    stock_path = '../finance_data/data/'
    ref = pd.read_csv('../finance_data/data/A.csv').shape[0]
    for s in stocks:
        stock_name = s.split(".csv")[0]
        df = pd.read_csv(stock_path+s)
        if df.shape[0] != ref:
            continue
        closing_prices = list(df['Close'].values)
        p[stock_name].extend(closing_prices)
    return p

In [148]:
stocks = os.listdir('../finance_data/data/')
p = get_closing(stocks)
stocks = [x+".csv" for x in list(p.keys())]

In [152]:
len(p.keys())

494

In [153]:
def get_stock_return(stock_name, p):
    q = []
    for i in range(1, len(p[stock_name])):
        q.append(float((p[stock_name][i] - p[stock_name][i-1])/p[stock_name][i-1]))
    return q

In [154]:
len((get_stock_return('ZTS', p)))

764

In [155]:
def unnormalized_return(stocks, p):
    ur = defaultdict(list)
    for s in stocks:
        stock_name = s.split(".csv")[0]
        q = get_stock_return(stock_name, p)
        ur[stock_name].extend(q)
    return ur

In [156]:
def log_normalized_return(stocks, p):
    r = defaultdict(list)
    for s in stocks:
        stock_name = s.split(".csv")[0]
        q = get_stock_return(stock_name, p)
        norm_return = [np.log(1 + x) for x in q]
        r[stock_name].extend(norm_return)
    return r

In [157]:
r = log_normalized_return(stocks, p)

In [158]:
len(r['ZTS'])

764

In [159]:
def correlation(stock1, stock2, r):
    r_i = np.array(r[stock1])
    r_j = np.array(r[stock2])
    """
    count = 0
    if len(r_i) != len(r_j):
        count+=1
    n = min(len(r_i), len(r_j))
    r_i = r_i[:n]
    r_j = r_j[:n]
    """
    #print(stock1, stock2)
    numerator = np.mean(np.multiply(r_i, r_j)) - np.mean(r_i)*np.mean(r_j)
    den1 = np.mean(r_i**2) - np.mean(r_i)**2
    den2 = np.mean(r_j**2) - np.mean(r_j)**2
    denominator = np.sqrt(den1 * den2)
    corr_ij = numerator/denominator
    return corr_ij

In [160]:
correlation('ZTS', 'A', r)

0.34266477849887145

In [161]:
stock_names = sorted([s.split(".csv")[0] for s in stocks])
n = len(stock_names)
correlation_matrix = pd.DataFrame(data=np.zeros((n,n)), columns=stock_names)
correlation_matrix.insert(0, 'ID', stock_names)
correlation_matrix = correlation_matrix.set_index('ID')

In [162]:
for s1 in stock_names:
    for s2 in stock_names:
        correlation_matrix[s1][s2] = correlation(s1, s2, r)
correlation_matrix;

In [163]:
correlation_weights = np.sqrt(2 * (1 - correlation_matrix))

In [164]:
def edgelist(df):
    a = df.values
    c = df.columns
    n = len(c)
    #print(n)
    c_ar = np.array(c)
    out = np.empty((n, n, 2), dtype=c_ar.dtype)

    out[...,0] = c_ar[:,None]
    out[...,1] = c_ar

    #mask = ~np.eye(n,dtype=bool)
    mask = ~np.triu(np.ones((n,n), dtype=bool))
    df_out = pd.DataFrame(out[mask], columns=[['V1','V2']])
    df_out['Weight'] = a[mask]
    return df_out


In [165]:
correlation_edgelist = edgelist(correlation_weights)

In [166]:
len(correlation_edgelist)

121771

In [167]:
correlation_edgelist.to_csv('../finance_data/correlation_edgelist.txt', sep=' ', header=False, index=False)

In [168]:
ur = unnormalized_return(stocks, p)

In [169]:
unnormalized_correlation_matrix = pd.DataFrame.copy(correlation_matrix)
for s1 in stock_names:
    for s2 in stock_names:
        unnormalized_correlation_matrix[s1][s2] = correlation(s1, s2, ur)
unnormalized_correlation_matrix;

In [170]:
unnormalized_correlation_weights = np.sqrt(2 * (1 - unnormalized_correlation_matrix))

In [171]:
unnormalized_correlation_edgelist = edgelist(unnormalized_correlation_weights)

In [172]:
unnormalized_correlation_edgelist.to_csv('../finance_data/unnormalized_correlation_edgelist.txt', sep=' ', header=False, index=False)

In [173]:
unnormalized_correlation_edgelist.head()

Unnamed: 0,V1,V2,Weight
0,AAL,A,1.176795
1,AAP,A,1.260924
2,AAP,AAL,1.217204
3,AAPL,A,1.312798
4,AAPL,AAL,1.319783


In [174]:
correlation_edgelist.head()

Unnamed: 0,V1,V2,Weight
0,AAL,A,1.189554
1,AAP,A,1.26971
2,AAP,AAL,1.21758
3,AAPL,A,1.373192
4,AAPL,AAL,1.365377


<h3>Remove stocks not being used from Name_sector.csv</h3>

In [175]:
name_sector = pd.read_csv('../finance_data/Name_sector.csv')

In [176]:
len(stocks)

494

In [177]:
name_sector.shape

(505, 2)

In [178]:
name_sector.head()

Unnamed: 0,Symbol,Sector
0,A,Health Care
1,AAL,Industrials
2,AAP,Consumer Discretionary
3,AAPL,Information Technology
4,ABBV,Health Care


In [179]:
new_name_sector = name_sector.loc[name_sector['Symbol'].isin(stocks)]

In [180]:
new_name_sector.shape

(0, 2)

In [181]:
new_name_sector.to_csv('../finance_data/Name_sector2.csv', index=False)

<h3>Weekly data</h3>

In [182]:
df = pd.read_csv('../finance_data/data/BRK.B.csv')
df['Weekday'] = pd.to_datetime(df['Date']).apply(lambda x: x.weekday())
df[df['Weekday'] == 0].head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,Weekday
2,5/5/14,127.190002,127.190002,126.080002,126.610001,3486100,126.610001,0
7,5/12/14,127.870003,128.240005,126.889999,127.290001,3406900,127.290001,0
12,5/19/14,126.769997,127.209999,126.110001,127.129997,2242400,127.129997,0
21,6/2/14,128.279999,128.75,127.370003,127.879997,2007200,127.879997,0
26,6/9/14,128.330002,128.619995,127.57,127.989998,2385200,127.989998,0


In [183]:
def get_weekly_closing(stocks):
    p = defaultdict(list)
    stock_path = '../finance_data/data/'
    #ref = pd.read_csv('../finance_data/data/A.csv').shape[0]
    for s in stocks:
        stock_name = s.split(".csv")[0]
        df = pd.read_csv(stock_path+s)
        df['Weekday'] = pd.to_datetime(df['Date']).apply(lambda x: x.weekday())
        df = df[df['Weekday'] == 0]
        #if df.shape[0] != ref:
         #   continue
        closing_prices = list(df['Close'].values)
        p[stock_name].extend(closing_prices)
    return p

In [184]:
p_weekly = get_weekly_closing(stocks)

In [185]:
r_weekly = log_normalized_return(stocks, p_weekly)

In [186]:
weekly_correlation_matrix = pd.DataFrame.copy(correlation_matrix)
for s1 in stock_names:
    for s2 in stock_names:
        weekly_correlation_matrix[s1][s2] = correlation(s1, s2, r_weekly)
weekly_correlation_matrix;

In [187]:
weekly_correlation_weights = np.sqrt(2 * (1 - weekly_correlation_matrix))

In [188]:
weekly_correlation_edgelist = edgelist(weekly_correlation_weights)

In [189]:
weekly_correlation_edgelist.to_csv('../finance_data/weekly_correlation_edgelist.txt', sep=' ', header=False, index=False)