In [1]:
from google.cloud import storage
import configparser
import pandas as pd

# Read local settings from the credentials INI file. Note that
# ConfigParser.read() silently skips files it cannot open, in which case the
# ['gcp'] lookup below raises KeyError.
config = configparser.ConfigParser()
config.read('../../config/credentials.ini')

# Path passed as `credentials_path` to the GCP helper functions in this notebook.
gcp_path = config['gcp']['credentials_path']

In [2]:
# The CSV has no header row; every cell is flattened into one list of values
# that is later passed as `ticker_list` to the upload helper.
nasdaq100 = pd.read_csv('../../raw_data/nasdaq100.csv', header=None)
nasdaq100_list = list(nasdaq100.values.flatten())

## Upload Data

In [84]:
def upload_processed_files_to_gcp(ticker_list=None,
                                  bucket_name='mlchartist-project',
                                  folder='nasdaq_100_processed/',
                                  file_path='../../raw_data/processed/',
                                  credentials_path=None):
    """Upload one processed CSV per ticker to a Google Cloud Storage bucket.

    For every ticker, the local file ``file_path + <ticker>.csv`` is uploaded
    to the blob ``folder + <ticker>.csv``, where ``<ticker>`` is the ticker
    stripped of surrounding whitespace and lower-cased. One status line is
    printed per uploaded file.

    Args:
        ticker_list: Iterable of ticker strings. ``None`` or an empty iterable
            uploads nothing (and no storage client is created).
        bucket_name: Name of the destination bucket.
        folder: Blob-name prefix inside the bucket (include the trailing '/').
        file_path: Local directory prefix (include the trailing '/').
        credentials_path: Path to a service-account JSON key. When ``None``
            the client is created with ``storage.Client()``.

    Returns:
        None.
    """
    tickers = list(ticker_list) if ticker_list is not None else []
    if not tickers:
        return

    # The client and bucket handle do not depend on the ticker, so build them
    # once instead of once per uploaded file.
    if credentials_path is not None:
        storage_client = storage.Client.from_service_account_json(credentials_path)
    else:
        storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    for ticker in tickers:
        file_name = ticker.strip().lower() + '.csv'
        source_file = file_path + file_name
        destination_blob_name = folder + file_name
        bucket.blob(destination_blob_name).upload_from_filename(source_file)
        print(f"File {source_file} uploaded to {destination_blob_name}")

In [85]:
# Upload the processed CSV of every NASDAQ-100 ticker using the configured credentials.
upload_processed_files_to_gcp(ticker_list=nasdaq100_list, credentials_path=gcp_path)

File ../../raw_data/processed/atvi.csv uploaded to nasdaq_100_processed/atvi.csv.
File ../../raw_data/processed/adbe.csv uploaded to nasdaq_100_processed/adbe.csv.
File ../../raw_data/processed/amd.csv uploaded to nasdaq_100_processed/amd.csv.
File ../../raw_data/processed/algn.csv uploaded to nasdaq_100_processed/algn.csv.
File ../../raw_data/processed/alxn.csv uploaded to nasdaq_100_processed/alxn.csv.
File ../../raw_data/processed/amzn.csv uploaded to nasdaq_100_processed/amzn.csv.
File ../../raw_data/processed/amgn.csv uploaded to nasdaq_100_processed/amgn.csv.
File ../../raw_data/processed/aal.csv uploaded to nasdaq_100_processed/aal.csv.
File ../../raw_data/processed/adi.csv uploaded to nasdaq_100_processed/adi.csv.
File ../../raw_data/processed/aapl.csv uploaded to nasdaq_100_processed/aapl.csv.
File ../../raw_data/processed/amat.csv uploaded to nasdaq_100_processed/amat.csv.
File ../../raw_data/processed/asml.csv uploaded to nasdaq_100_processed/asml.csv.
File ../../raw_data/pr

## Access Files From Bucket

In [20]:
def download_file(bucket_name='mlchartist-project', source_blob_name=None, destination_file_name=None, credentials_path=None):
    """Download a single blob from a bucket to a local file.

    Args:
        bucket_name: Name of the bucket to read from.
        source_blob_name: Full blob name inside the bucket.
        destination_file_name: Local path the blob content is written to.
        credentials_path: Path to a service-account JSON key. When ``None``
            the client is created with ``storage.Client()``.

    Returns:
        None. A confirmation line is printed after the download.

    Raises:
        FileNotFoundError: If the bucket has no blob named ``source_blob_name``.
    """
    if credentials_path is not None:
        storage_client = storage.Client.from_service_account_json(credentials_path)
    else:
        storage_client = storage.Client()

    bucket = storage_client.bucket(bucket_name)

    # `Bucket.get_blob` returns None when the object does not exist. Check for
    # that explicitly so a missing blob is reported clearly instead of failing
    # on the next line with an AttributeError on None.
    blob = bucket.get_blob(source_blob_name)
    if blob is None:
        raise FileNotFoundError(
            f"Blob {source_blob_name} not found in bucket {bucket_name}."
        )
    blob.download_to_filename(destination_file_name)

    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")


In [21]:
# Smoke test: fetch a single processed CSV from the bucket into a local test folder.
download_file(bucket_name='mlchartist-project', 
              source_blob_name='nasdaq_100_processed/UAL.csv', 
              destination_file_name='../../raw_data/gcp_test/UAL.csv', 
              credentials_path=gcp_path)

Blob nasdaq_100_processed/UAL.csv downloaded to ../../raw_data/gcp_test/UAL.csv.


## List all Blobs in Bucket Folder

In [23]:
def list_blobs_with_prefix(bucket_name=None, prefix=None, delimiter=None, credentials_path=None):
    """Print the name of every blob in a bucket whose name starts with `prefix`.

    A prefix such as "public/" acts like a folder path. Without a delimiter
    the whole tree below the prefix is printed. With a delimiter (typically
    '/') the listing stops at the first delimiter after the prefix, and the
    collapsed "sub-folders" are printed afterwards under a "Prefixes:"
    heading (taken from the `prefixes` attribute of the listing).

    Example, for a bucket holding

        a/1.txt
        a/b/2.txt

    prefix='a/' prints both names; prefix='a/' with delimiter='/' prints
    only a/1.txt under "Blobs:" and a/b/ under "Prefixes:".

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Blob-name prefix to filter on.
        delimiter: Optional delimiter used to emulate a directory level.
        credentials_path: Path to a service-account JSON key. When ``None``
            the client is created with ``storage.Client()``.

    Returns:
        None. Everything is written to stdout.
    """
    client = (
        storage.Client()
        if credentials_path is None
        else storage.Client.from_service_account_json(credentials_path)
    )

    # Note: Client.list_blobs requires at least package version 1.17.0.
    listing = client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

    print("Blobs:")
    for item in listing:
        print(item.name)

    if not delimiter:
        return

    # `prefixes` is read only after the listing has been iterated above.
    print("Prefixes:")
    for sub_prefix in listing.prefixes:
        print(sub_prefix)


In [25]:
# Print every blob stored under the processed-data folder of the project bucket.
list_blobs_with_prefix(bucket_name='mlchartist-project', 
                       prefix='nasdaq_100_processed/', delimiter=None, credentials_path=gcp_path)

Blobs:
nasdaq_100_processed/
nasdaq_100_processed/AAL.csv
nasdaq_100_processed/AAPL.csv
nasdaq_100_processed/ADBE.csv
nasdaq_100_processed/ADI.csv
nasdaq_100_processed/ADP.csv
nasdaq_100_processed/ADSK.csv
nasdaq_100_processed/ALGN.csv
nasdaq_100_processed/ALXN.csv
nasdaq_100_processed/AMAT.csv
nasdaq_100_processed/AMD.csv
nasdaq_100_processed/AMGN.csv
nasdaq_100_processed/AMZN.csv
nasdaq_100_processed/ASML.csv
nasdaq_100_processed/ATVI.csv
nasdaq_100_processed/AVGO.csv
nasdaq_100_processed/BIDU.csv
nasdaq_100_processed/BIIB.csv
nasdaq_100_processed/BKNG.csv
nasdaq_100_processed/BMRN.csv
nasdaq_100_processed/CDNS.csv
nasdaq_100_processed/CERN.csv
nasdaq_100_processed/CHKP.csv
nasdaq_100_processed/CHTR.csv
nasdaq_100_processed/CMCSA.csv
nasdaq_100_processed/COST.csv
nasdaq_100_processed/CSCO.csv
nasdaq_100_processed/CSX.csv
nasdaq_100_processed/CTAS.csv
nasdaq_100_processed/CTSH.csv
nasdaq_100_processed/CTXS.csv
nasdaq_100_processed/DLTR.csv
nasdaq_100_processed/EA.csv
nasdaq_100_proces

In [35]:
def array_of_files_in_folder(bucket_name=None, prefix=None, delimiter=None, credentials_path=None):
    """Return the blob names under `prefix`, with the prefix removed.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Blob-name prefix ("folder") to list. ``None`` or '' lists
            with no prefix and returns the names unchanged.
        delimiter: Optional delimiter forwarded to ``Client.list_blobs``.
        credentials_path: Path to a service-account JSON key. When ``None``
            the client is created with ``storage.Client()``.

    Returns:
        list[str]: One entry per listed blob, e.g. 'AAL.csv' for the blob
        '<prefix>AAL.csv'. A blob whose name equals the prefix itself (the
        folder placeholder) is skipped.
    """
    if credentials_path is not None:
        storage_client = storage.Client.from_service_account_json(credentials_path)
    else:
        storage_client = storage.Client()

    blobs = storage_client.list_blobs(
        bucket_name, prefix=prefix, delimiter=delimiter
    )

    # Strip only the leading prefix. Splitting on the prefix would truncate
    # names that contain the same text again further along (and fails for a
    # None or empty prefix).
    leading = prefix or ''
    file_list = []
    for blob in blobs:
        name = blob.name
        if leading and name.startswith(leading):
            name = name[len(leading):]
        if name != '':
            file_list.append(name)
    return file_list


In [36]:
# Show the file names (prefix stripped) found under the processed-data folder.
array_of_files_in_folder(bucket_name='mlchartist-project', 
                       prefix='nasdaq_100_processed/', delimiter=None, credentials_path=gcp_path)

['AAL.csv',
 'AAPL.csv',
 'ADBE.csv',
 'ADI.csv',
 'ADP.csv',
 'ADSK.csv',
 'ALGN.csv',
 'ALXN.csv',
 'AMAT.csv',
 'AMD.csv',
 'AMGN.csv',
 'AMZN.csv',
 'ASML.csv',
 'ATVI.csv',
 'AVGO.csv',
 'BIDU.csv',
 'BIIB.csv',
 'BKNG.csv',
 'BMRN.csv',
 'CDNS.csv',
 'CERN.csv',
 'CHKP.csv',
 'CHTR.csv',
 'CMCSA.csv',
 'COST.csv',
 'CSCO.csv',
 'CSX.csv',
 'CTAS.csv',
 'CTSH.csv',
 'CTXS.csv',
 'DLTR.csv',
 'EA.csv',
 'EBAY.csv',
 'EXC.csv',
 'EXPE.csv',
 'FAST.csv',
 'FB.csv',
 'FISV.csv',
 'GILD.csv',
 'GOOG.csv',
 'GOOGL.csv',
 'HAS.csv',
 'HSIC.csv',
 'IDXX.csv',
 'ILMN.csv',
 'INCY.csv',
 'INTC.csv',
 'INTU.csv',
 'ISRG.csv',
 'JBHT.csv',
 'JD.csv',
 'KHC.csv',
 'KLAC.csv',
 'LBTYA.csv',
 'LBTYK.csv',
 'LRCX.csv',
 'LULU.csv',
 'MAR.csv',
 'MCHP.csv',
 'MDLZ.csv',
 'MELI.csv',
 'MNST.csv',
 'MSFT.csv',
 'MU.csv',
 'MXIM.csv',
 'MYL.csv',
 'NFLX.csv',
 'NLOK.csv',
 'NTAP.csv',
 'NTES.csv',
 'NVDA.csv',
 'NXPI.csv',
 'ORLY.csv',
 'PAYX.csv',
 'PCAR.csv',
 'PEP.csv',
 'PYPL.csv',
 'QCOM.csv',
 

In [31]:
# Scratch check of how str.split(prefix) separates a folder prefix from a
# blob name. `t1` is assigned but not used in this cell.
t1 = 'nasdaq_100_processed/'
t2 = 'nasdaq_100_processed/AAL.csv'
prefix = 'nasdaq_100_processed/'
output = t2.split(prefix)
# Element 0 is the (empty) text before the prefix; element 1 is the file name.
output[1]

'AAL.csv'

In [42]:
from google.cloud import storage
from io import BytesIO

# Ad-hoc check: read one CSV blob from the bucket straight into a DataFrame
# without writing it to disk first.
storage_client = storage.Client.from_service_account_json(gcp_path)
bucket_name = 'mlchartist-project'
source_blob_name='nasdaq_100_processed/UAL.csv'

bucket = storage_client.bucket(bucket_name)
blob = bucket.get_blob(source_blob_name)
# The downloaded bytes are wrapped in BytesIO so pandas can parse them like a file.
content = blob.download_as_string()
df = pd.read_csv(BytesIO(content))

In [43]:
# Preview the first rows of the frame parsed from the downloaded blob.
df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return,1D_past_return,5D_past_return,10D_past_return
0,UAL,2020-12-23,48.757321,18.918919,13.805684,56364750.0,243929457,2.131804,25.020812,19.116085,21.623755,0.845981,-0.900063,1.746044,-0.019497,-0.01859,-0.084561,0.02653,-0.041087,-0.081424
1,UAL,2020-12-22,44.55829,6.081081,8.579397,52092620.0,228644728,2.163482,26.472004,20.285129,22.946155,1.045551,-0.925508,1.97106,0.039097,0.015127,-0.019316,-0.024628,-0.094606,-0.123777
2,UAL,2020-12-21,48.03713,16.417051,14.428601,61097490.0,245247459,2.148365,28.034825,20.674438,24.885123,1.40114,-0.801297,2.202437,-0.00715,-0.01169,-0.033254,-0.015311,-0.057042,-0.104573
3,UAL,2020-12-18,50.341257,3.240059,17.395069,48141160.0,267126545,2.151316,29.480415,22.234237,19.861636,1.724148,-0.678613,2.402761,-0.018328,-0.029951,0.009835,-0.024847,-0.074855,-0.091389
4,UAL,2020-12-17,54.371488,23.628692,32.894684,60244210.0,283459777,2.209879,31.314587,23.310021,17.689838,2.0416,-0.530814,2.572414,-0.057105,-0.092633,-0.024629,-0.002609,-0.075745,-0.080745


In [7]:
from io import BytesIO

def load_file_from_gcp(bucket_name='mlchartist-project', source_blob_name=None, credentials_path=None):
    """Read a CSV blob from a bucket into a pandas DataFrame, in memory.

    Args:
        bucket_name: Name of the bucket to read from.
        source_blob_name: Full blob name of the CSV inside the bucket.
        credentials_path: Path to a service-account JSON key. When ``None``
            the client is created with ``storage.Client()``.

    Returns:
        pandas.DataFrame: The blob content parsed with ``pd.read_csv``.

    Raises:
        FileNotFoundError: If the bucket has no blob named ``source_blob_name``.
    """
    if credentials_path is not None:
        storage_client = storage.Client.from_service_account_json(credentials_path)
    else:
        storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # `Bucket.get_blob` returns None for a missing object; report that
    # explicitly rather than failing with an AttributeError on None.
    blob = bucket.get_blob(source_blob_name)
    if blob is None:
        raise FileNotFoundError(
            f"Blob {source_blob_name} not found in bucket {bucket_name}."
        )

    # Wrap the downloaded bytes in BytesIO so pandas can parse them like a file.
    content = blob.download_as_string()
    return pd.read_csv(BytesIO(content))

In [8]:
import glob

def check_nrows(nrows=None, df=None):
    """Return `nrows` as given, or `len(df)` when `nrows` is None."""
    return nrows if nrows is not None else len(df)

def load_processed_data(nrows=10000, local=False, ticker_list=None, min_length=500, nasdaq100=False, gcp_credentials_path=None):
    """Load processed per-ticker CSVs and stack them into one DataFrame.

    Files are read either from the local processed-data folder or from the
    'nasdaq_100_processed/' folder of the 'mlchartist-project' bucket.

    Args:
        nrows: Maximum number of leading rows kept per file. ``None`` keeps
            every row of every file.
        local: Truthy reads from '../../raw_data/processed/'; falsy reads
            from the GCP bucket.
        ticker_list: Tickers to load. Each is stripped and lower-cased to
            build '<ticker>.csv'. When ``None`` (and `nasdaq100` is falsy),
            every CSV found in the source folder is loaded.
        min_length: A file is kept only if it has more than this many rows.
            For local files the check runs on the frame already limited to
            `nrows` by ``pd.read_csv``; for GCP files it runs on the full
            file before truncation.
        nasdaq100: When truthy and `ticker_list` is ``None``, the tickers are
            read from '../../raw_data/nasdaq100.csv'.
        gcp_credentials_path: Service-account JSON key path forwarded to the
            GCP helpers; unused for local loading.

    Returns:
        pandas.DataFrame: The kept frames concatenated in load order with
        their original row indexes, or an empty DataFrame if nothing
        qualified.
    """
    bucket_name = 'mlchartist-project'
    gcp_folder = 'nasdaq_100_processed/'
    local_folder = '../../raw_data/processed/'

    if ticker_list is None and nasdaq100:
        tickers = pd.read_csv('../../raw_data/nasdaq100.csv', header=None)
        ticker_list = list(tickers.values.flatten())

    # Collect frames and concatenate once at the end: cheaper than growing a
    # DataFrame inside the loop, and DataFrame.append no longer exists in
    # pandas 2.x.
    frames = []

    if local:
        ## load from local machine
        if ticker_list is not None:
            paths = [local_folder + ticker.strip().lower() + '.csv' for ticker in ticker_list]
        else:
            paths = glob.glob(local_folder + '*.csv')
        for path in paths:
            df = pd.read_csv(path, nrows=nrows)
            if len(df) > min_length:
                frames.append(df)
    else:
        ## load from GCP
        if ticker_list is not None:
            blob_names = [gcp_folder + ticker.strip().lower() + '.csv' for ticker in ticker_list]
        else:
            # The listing already yields file names with their extension and
            # exact case, so they are appended to the folder unchanged.
            file_names = array_of_files_in_folder(bucket_name=bucket_name,
                                                  prefix=gcp_folder,
                                                  credentials_path=gcp_credentials_path)
            blob_names = [gcp_folder + file_name for file_name in file_names]
        for blob_name in blob_names:
            print(blob_name)
            df = load_file_from_gcp(bucket_name=bucket_name,
                                    source_blob_name=blob_name,
                                    credentials_path=gcp_credentials_path)
            if len(df) > min_length:
                # Slice with the caller's nrows directly (None keeps all rows)
                # so one file's length never caps the files that follow.
                frames.append(df.iloc[:nrows].copy())

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [9]:
# Example run: load two tickers from the GCP bucket and display the stacked frame.
ticker_list = ['AAPL', 'TSLA']

output_df = load_processed_data(nrows=10000, 
                                local=False, 
                                ticker_list=ticker_list, 
                                min_length=500, 
                                nasdaq100=False, 
                                gcp_credentials_path=gcp_path)

output_df

nasdaq_100_processed/aapl.csv
blob <Blob: mlchartist-project, nasdaq_100_processed/aapl.csv, 1613644078617859>
nasdaq_100_processed/tsla.csv
blob <Blob: mlchartist-project, nasdaq_100_processed/tsla.csv, 1613644264465982>


Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return,1D_past_return,5D_past_return,10D_past_return
0,AAPL,2020-12-23,66.468844,75.833041,81.267982,6.954094e+10,257617492699,3.203636,19.725858,34.524874,18.680217,3.169309,0.648044,2.521265,0.013210,0.008323,0.091326,-0.006976,0.024646,0.075382
1,AAPL,2020-12-22,69.105683,82.286917,79.135547,6.960991e+10,257705716391,3.323147,18.952436,35.843505,19.393683,3.011667,0.652413,2.359254,0.013952,-0.007279,0.054519,0.028464,0.031279,0.060299
2,AAPL,2020-12-21,63.817814,85.683987,82.742858,6.962042e+10,257536364566,3.103773,18.119520,26.223295,22.361683,2.672979,0.476828,2.196151,0.051782,-0.012712,0.067379,0.012435,0.052964,0.036202
3,AAPL,2020-12-18,61.153688,69.435737,85.978511,6.950316e+10,257415113013,2.968602,18.901932,29.526363,18.257190,2.573448,0.496504,2.076944,0.079231,0.034385,0.042438,-0.015890,0.034679,0.036033
4,AAPL,2020-12-17,67.111474,93.108849,96.114229,6.962657e+10,257607654509,2.967725,18.541790,31.807016,14.677821,2.569341,0.616523,1.952818,0.025408,0.005517,-0.006760,0.006963,0.044304,0.046852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,TSLA,2010-08-19,46.539004,29.227557,31.593598,-3.071737e+07,-64503725,0.256057,19.384521,31.069975,24.961201,-0.045987,-0.040981,-0.005006,0.051091,0.120809,0.076637,0.001066,0.067614,-0.081174
2605,TSLA,2010-08-18,46.406465,28.810021,31.524008,-3.071737e+07,-67399135,0.261600,20.036987,32.697077,24.731478,-0.043685,-0.048925,0.005239,0.060202,0.089505,0.115610,-0.019843,0.048603,-0.117121
2606,TSLA,2010-08-17,48.529118,36.743215,28.392484,-2.874330e+07,-64392475,0.266492,20.511336,33.442802,26.097094,-0.039705,-0.057175,0.017471,0.002611,0.017232,0.147781,0.019702,0.006306,-0.127563
2607,TSLA,2010-08-16,46.308683,29.018789,17.606124,-2.917673e+07,-66631880,0.277453,21.140097,31.341715,26.962893,-0.041785,-0.073549,0.031764,0.071885,0.058040,0.124601,0.025109,-0.041837,-0.102294
