In [4]:
import calendar
import glob
import os

import pandas as pd

In [60]:
# Access Components
# Directory locations, relative to the notebook's working directory, used by the cells below.
# NOTE(review): sentiment_components is set to the same directory as credit_components —
# confirm this is intended and not a copy-paste slip.
credit_components = './selected_kpca_components'
sentiment_components = './selected_kpca_components'
# Raw per-ticker sentiment CSVs (read by the interpolation cell)
nlp_components = './nlp_components'
# Destination for the day-by-day expanded sentiment CSVs
nlp_components_processed = './nlp_components_processed'
# Folder holding the adj_matrix_<year>.csv files
adjacency_processed = './adjacency/adj_matrices'

def get_tickers_in_path(path):
    """Return the ticker symbols implied by the entry names directly under ``path``.

    Each entry's base name is taken and only its final extension is removed,
    so a file such as ``BRK.B.csv`` yields ``BRK.B`` rather than ``BRK``.

    Parameters
    ----------
    path : str
        Directory to scan (non-recursive).

    Returns
    -------
    list of str
        Ticker names in sorted order, so repeated runs iterate identically
        regardless of the order in which the filesystem lists entries.
        Empty when the directory has no entries or does not exist.
    """
    # os.path helpers keep this independent of the platform's path separator.
    entries = glob.glob(os.path.join(path, '*'))
    file_names = [os.path.basename(entry) for entry in entries]
    return sorted(os.path.splitext(file_name)[0] for file_name in file_names)

# Define all tickers
# The ticker universe is whatever entries are present in the credit_components directory.
available_tickers = get_tickers_in_path(credit_components)

In [57]:
# Interpolate sentiment data
# Each sentiment observation is repeated once per calendar day until the next
# observation's date, so the sparse news series becomes a daily series.
# NOTE: currently restricted to AAPL; iterate over available_tickers to process every ticker.

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(nlp_components_processed, exist_ok=True)

for ticker in ['AAPL']:
    # The first CSV column is discarded; after dropping 'headline', column 0 is
    # expected to be 'firstCreated' (the positional indexing below relies on that).
    ticker_sentiment = pd.read_csv(nlp_components + '/%s.csv' % ticker).iloc[:,1:]
    ticker_sentiment = ticker_sentiment.drop(columns='headline')
    ticker_sentiment['firstCreated'] = pd.to_datetime(ticker_sentiment['firstCreated'])

    expanded_ticker = []
    n_rows = ticker_sentiment.shape[0]

    for i in range(n_rows):
        observation = pd.DataFrame(ticker_sentiment.iloc[i, :]).T
        date = ticker_sentiment.iloc[i,0]

        if i + 1 < n_rows:
            # Whole days until the next observation
            multiple = (ticker_sentiment.iloc[i+1,0] - date).days
        else:
            # The final observation has no successor; keep it once instead of
            # silently dropping it.
            multiple = 0

        if multiple > 0:
            # Only build the date range when there is a positive span to fill.
            repeated_values = pd.concat([observation]*multiple)
            repeated_values.iloc[:,0] = pd.date_range(start=date, periods=multiple, freq="D")
            expanded_ticker.append(repeated_values)
        else:
            # Same-day (or out-of-order) successor: keep the observation as a single row.
            expanded_ticker.append(observation)

    interpolated_data = pd.concat(expanded_ticker)
    interpolated_data.to_csv(nlp_components_processed + '/%s.csv' % ticker)

In [117]:
# Operate on adjacency matrices
def get_matrix_years_in_path(path):
    """Extract the trailing token of every entry name directly under ``path``.

    For each entry the last '/'-separated component is taken, then the text
    after its last underscore, then the part before the first dot of that
    text (e.g. ``adj_matrix_2016.csv`` -> ``'2016'``).

    Returns a list of strings in the order glob reports the entries.
    """
    years = []
    for entry in glob.glob(path + '/*'):
        file_name = entry.split('/')[-1]
        after_underscore = file_name.split('_')[-1]
        years.append(after_underscore.split('.')[0])

    return years

# Year tokens of every adjacency-matrix file found under adjacency_processed
matrix_years = get_matrix_years_in_path(adjacency_processed)

def load_matrix(year):
    """Read ``adj_matrix_<year>.csv`` from the adjacency folder.

    The returned DataFrame is indexed by its 'supplier_ticker' column.
    """
    csv_path = '%s/adj_matrix_%s.csv' % (adjacency_processed, year)
    return pd.read_csv(csv_path).set_index('supplier_ticker')

# Load Matrices
# Build the year -> adjacency matrix lookup that the weight helpers read from.
matrices = dict()

for year in matrix_years:
    matrices.update({year: load_matrix(year)})

In [137]:
def get_non_zero_weights(w):
    """Keep only the columns of ``w`` whose value in the first row is positive."""
    by_column = w.T
    positive_in_first_row = by_column.iloc[:, 0] > 0
    return by_column[positive_in_first_row].T

def rescale_weights(x):
    """Scale the weights in ``x`` so that they sum to one.

    Every value is divided by the grand total of all values in ``x``. The
    input is not modified; a new frame with the same index and columns is
    returned.

    Parameters
    ----------
    x : pandas.DataFrame
        Weights to normalise (typically a single row).

    Returns
    -------
    pandas.DataFrame
        ``x`` divided by its total. A zero total is not special-cased: the
        division then yields NaN/inf values rather than raising.
    """
    # Dividing the whole frame (instead of overwriting row 0 in a copy) works
    # for any number of rows and lets pandas pick a float result dtype even
    # when the input columns hold integers.
    total = x.values.sum()
    return x / total

def get_supplier_weights(ticker,year):
    """Return the rescaled positive weights from ``ticker``'s row of the ``year`` matrix.

    The row whose index equals ``ticker`` is selected from ``matrices[year]``,
    columns that are not positive in that row are dropped, and the remaining
    values are passed through ``rescale_weights``.
    """
    year_matrix = matrices[year]
    is_ticker_row = year_matrix.index == ticker
    return rescale_weights(get_non_zero_weights(year_matrix[is_ticker_row]))

In [139]:
# Spot check: rescaled positive weights from the AAPL row of the 2016 adjacency matrix
get_supplier_weights('AAPL', '2016')

Unnamed: 0_level_0,600288,9984,BBY,DCM,S,VZ
supplier_ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,0.002869,0.137762,0.209921,0.167282,0.078721,0.403445


In [122]:
# NOTE(review): in the cells shown here x_norm is only assigned inside get_supplier_weights,
# so this display presumably relies on leftover kernel state — confirm it survives Restart & Run All.
x_norm

Unnamed: 0_level_0,9984,BBY,CHL,DCM,IM,S,T,TECD,TMUS,VZ
supplier_ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,0.073145,0.098633,0.052246,0.088819,0.114795,0.041797,0.177638,0.123044,0.036572,0.193312


In [116]:
# NOTE(review): x_norm is not assigned at notebook scope in the visible cells;
# this cell presumably depends on leftover kernel state — confirm or remove.
x_norm

Unnamed: 0_level_0,9984,BBY,CHL,DCM,IM,S,T,TECD,TMUS,VZ
supplier_ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,0.073145,0.098633,0.052246,0.088819,0.114795,0.041797,0.177638,0.123044,0.036572,0.193312


In [83]:
# Row total of the positive weights before rescaling.
# NOTE(review): x_non_zero is only assigned inside get_supplier_weights in the visible cells,
# so this presumably relies on leftover kernel state — confirm it survives Restart & Run All.
x_non_zero.sum(axis=1)

supplier_ticker
AAPL    0.191401
dtype: float64

In [18]:
def get_days_in_month(month, year=None):
    """Return the number of calendar days in ``month``.

    Parameters
    ----------
    month : int
        Month number, 1 (January) through 12 (December).
    year : int, optional
        Year used to decide February's length. When omitted, a leap year is
        assumed, so February yields 29 (the value this helper has always
        returned for month 2).

    Returns
    -------
    int
        28-31 depending on the month (and the year, for February).

    Raises
    ------
    ValueError
        If ``month`` is outside 1-12 (raised by ``calendar.monthrange``).
    """
    # A simple even/odd rule cannot describe the calendar (July and August both
    # have 31 days), so the standard library's calendar is consulted instead.
    reference_year = 2000 if year is None else year
    return calendar.monthrange(reference_year, month)[1]

# Expand every row of each ticker's PCA output into one row per calendar day
# of that row's month, then attach the adjusted close price for each day.

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs('./interpolated_pca_data', exist_ok=True)

for ticker in available_tickers:
    expanded_ticker = []
    pca_data = pd.read_csv('./pca_out/%s.csv' % ticker)
    # .copy() gives an independent frame so the 'Date' conversion below does not
    # write into a selection of the freshly read CSV.
    price_data = pd.read_csv('./price_data/%s.csv' % ticker)[['Date','Adj_Close']].copy()
    pca_data['datepll'] = pd.to_datetime(pca_data['datepll'])
    price_data['Date'] = pd.to_datetime(price_data['Date'])

    for i in range(pca_data.shape[0]):
        # Column 0 is expected to be 'datepll' (the positional access relies on that).
        period_start = pca_data.iloc[i,0]
        # Take the month length from the timestamp itself so 30/31-day months
        # and leap-year Februaries are all handled correctly.
        multiple = period_start.days_in_month
        dates = pd.date_range(start=period_start, periods=multiple, freq="D")

        repeated_values = pd.concat([pd.DataFrame(pca_data.iloc[i, :]).T]*multiple)
        repeated_values.iloc[:,0] = dates

        expanded_ticker.append(repeated_values)

    interpolated_data = pd.concat(expanded_ticker)
    # Inner join: only days that have a price observation are kept.
    merged_data = interpolated_data.merge(price_data, how='inner', left_on='datepll', right_on='Date')
    merged_data = merged_data.drop(columns=['Date'])
    merged_data.to_csv('./interpolated_pca_data/%s.csv' % ticker)
    

Unnamed: 0.1,Unnamed: 0,firstCreated,headline,compound_sentiment_value
0,0,2016-11-01,BUZZ-Apple flirts with worst session in two mo...,-0.9517
1,1,2016-11-02,SCALES CORPORATION - ENTERED INTO AN AGREEME...,0.6408
2,2,2016-11-07,APPLE HOSPITALITY REIT INC QTRLY SHR $0.07;; N...,0.2960
3,3,2016-11-09,"PROPOSED ISSUE: City of Apple Valley, MN, $148...",0.0000
4,4,2016-11-14,Apple stock extends losses after China warning...,-0.8020
5,5,2016-11-15,Apple considering expansion into wearable glas...,0.0000
6,6,2016-11-17,"Taiwan stocks rise; TSMC, Hon Hai track Apple ...",0.2960
7,7,2016-11-21,APPLE STARTED DISPERSING ENGINEERS TO OTHER P...,-0.6523
8,8,2016-11-22,BRIEF-Kcell board approves agreement with Appl...,0.7096
9,9,2016-11-28,ACCC PROPOSES TO DENY AUTHORISATION FOR BANKS ...,-0.7626
