# Project: Predicting Stock Price & Return  
## 2. Extract, Transform, Load (ETL)
Data sources include the Alpha Vantage free API, the Bank of Canada, and Google Trends.

In this notebook, I clean the data collected in the first notebook and combine them into a single CSV file.

Ver. 1.0  
Remove unnecessary content  

Ver. 0.1  
The ETL pipeline, including interpolating and cleaning

## Table of Contents  
[0. Import packages](#0)  
[1. Extract Data](#1)  
[2. Transform the Data](#2)  
[3. Load the Data](#3)  

<a class="anchor" id="0"></a>
#### 0. Import packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Let the notebook display every column and the full content of each cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# `None` is the supported way to disable column-width truncation; the old
# `-1` sentinel has been deprecated since pandas 1.0. Fall back to `-1` only
# on pandas versions whose validator does not accept `None` yet.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

<a class="anchor" id="1"></a>
#### 1. Extract Data

##### Define extract functions  
These functions can be easily changed if we want to change the data acquisition methods.

In [3]:
def extract_stock_data(name):
    '''
    Load one stock series from ``data/<name>.csv``.

    The ``date`` column is parsed as datetimes and becomes the index of the
    returned DataFrame.
    '''
    csv_path = "data/" + name + ".csv"
    return pd.read_csv(csv_path, index_col='date', parse_dates=['date'])

In [4]:
def extract_indecis_data(name):
    '''
    Load one index series (e.g. an economic indicator) from
    ``data/<name>.csv``.

    The ``date`` column is parsed as datetimes and becomes the index of the
    returned DataFrame.
    '''
    source_file = "data/" + name + ".csv"
    index_frame = pd.read_csv(
        source_file,
        parse_dates=['date'],
        index_col='date',
    )
    return index_frame

In [5]:
def extract_exchange_data(name):
    '''
    Load one exchange-rate series from ``data/<name>.csv``.

    The ``date`` column is parsed as datetimes and becomes the index of the
    returned DataFrame.
    '''
    read_options = {'parse_dates': ['date'], 'index_col': 'date'}
    return pd.read_csv("data/" + name + ".csv", **read_options)

In [6]:
def extract_trend_data(name):
    '''
    Load one trend series from ``data/trend_<name>.csv``.

    The ``date`` column is parsed as datetimes and becomes the index of the
    returned DataFrame.
    '''
    trend_file = "data/" + "trend_" + name + ".csv"
    trend_frame = pd.read_csv(trend_file, index_col='date', parse_dates=['date'])
    return trend_frame

In [7]:
def extract_data(stock_list=['loblaw', 'metro', 'empa', 'gwl', 'atd', 'tsx', 'sp500'],
                 indecis_list=['BCPI', 'CPI', 'bank_interest'],
                 exchange_list=['CEER'],
                 trend_list=['grocery_store', 'loblaws', 'stock']):
    '''
    Read every source series and outer-join them into one DataFrame.

    Each name is loaded through the extract function of its category, its
    columns are renamed, and the result is outer-joined on the index, so a
    row exists for every index value present in at least one source and
    values missing from a source are NaN.

    Parameters
    ----------
    stock_list : list of str
        Names passed to ``extract_stock_data``. Each file must yield exactly
        two columns, renamed to ``<name>_price`` and ``<name>_volume``
        (assumes the file order is price then volume -- TODO confirm).
    indecis_list : list of str
        Names passed to ``extract_indecis_data``. Each file must yield
        exactly one column, renamed to ``<name>``.
    exchange_list : list of str
        Names passed to ``extract_exchange_data``. Each file must yield
        exactly one column, renamed to ``<name>``.
    trend_list : list of str
        Names passed to ``extract_trend_data``. Each file must yield exactly
        one column, renamed to ``trend_<name>``.

    Returns
    -------
    pandas.DataFrame
        The joined frame, with columns in the order the names were given.

    Notes
    -----
    The list defaults are only iterated, never mutated.
    '''
    data = pd.DataFrame()

    for stock in stock_list:
        stock_data = extract_stock_data(stock)
        stock_data.columns = [stock + '_price', stock + '_volume']
        data = data.join(stock_data, how='outer')

    for indicator in indecis_list:
        indecis_data = extract_indecis_data(indicator)
        indecis_data.columns = [indicator]
        data = data.join(indecis_data, how='outer')

    for exchange in exchange_list:
        # Use the dedicated exchange loader (previously the index loader was
        # called here), so changing how exchange data is acquired only
        # requires editing extract_exchange_data.
        exchange_data = extract_exchange_data(exchange)
        exchange_data.columns = [exchange]
        data = data.join(exchange_data, how='outer')

    for trend in trend_list:
        trend_data = extract_trend_data(trend)
        trend_data.columns = ["trend_" + trend]
        data = data.join(trend_data, how='outer')

    return data


##### Extract the data using the functions  
Inspecting the data reveals many missing values. This is because the series have different start and end dates, and some are only available at a lower frequency (weekly or monthly).

In [8]:
# Merge every source into one date-indexed frame. The lists are spelled out
# explicitly here; they are the same values as extract_data's defaults.
data_extracted = extract_data(stock_list=['loblaw', 'metro', 'empa', 'gwl', 'atd', 'tsx', 'sp500'],
                              indecis_list=['BCPI', 'CPI', 'bank_interest'], 
                              exchange_list=['CEER'], 
                              trend_list=['grocery_store', 'loblaws', 'stock'])

In [9]:
data_extracted.head()

Unnamed: 0_level_0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-01-01,,,,,,,,,,,,,,,,,,95.43,,,
1999-01-04,,,,,,,,,,,,,,,,,,95.47,,,
1999-01-05,,,,,,,,,,,,,,,,,,95.84,,,
1999-01-06,,,,,,,,,,,,,,,,,,96.66,,,
1999-01-07,,,,,,,,,,,,,,,,,,96.46,,,


In [10]:
data_extracted.tail()

Unnamed: 0_level_0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-10-21,71.42,455500.0,55.927,320200.0,34.92,491852.0,109.16,99100.0,39.27,748298.0,16418.5,155105700.0,3006.72,3271620000.0,,,,,,,
2019-10-22,70.77,613100.0,55.6879,247800.0,35.02,469645.0,108.15,104800.0,39.03,1083171.0,16391.5,171979500.0,2995.99,3523890000.0,,,,,,,
2019-10-23,69.85,595400.0,55.1,516000.0,34.45,467906.0,106.7,106000.0,39.53,1228126.0,16335.9004,181906700.0,3004.52,3392870000.0,,,,,,,
2019-10-24,69.22,785600.0,54.95,296500.0,34.46,368114.0,106.01,80300.0,39.08,1151289.0,16369.2998,167159100.0,3010.29,3692600000.0,,,,,,,
2019-10-25,69.6,1068100.0,54.92,338100.0,34.75,352352.0,106.06,73000.0,39.81,1262531.0,16404.5,173518300.0,3022.55,3370370000.0,,,,,,,


In [11]:
# Persist the merged, still-uncleaned frame so it can be inspected or reused.
data_extracted.to_csv("data/data_extracted.csv")

In [12]:
# Sanity-check the written file. It is read without parse_dates/index_col,
# so `date` comes back as an ordinary column next to a fresh integer index.
pd.read_csv("data/data_extracted.csv").head()

Unnamed: 0,date,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
0,1999-01-01,,,,,,,,,,,,,,,,,,95.43,,,
1,1999-01-04,,,,,,,,,,,,,,,,,,95.47,,,
2,1999-01-05,,,,,,,,,,,,,,,,,,95.84,,,
3,1999-01-06,,,,,,,,,,,,,,,,,,96.66,,,
4,1999-01-07,,,,,,,,,,,,,,,,,,96.46,,,


In [13]:
data_extracted.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5500 entries, 1999-01-01 to 2019-10-25
Data columns (total 21 columns):
loblaw_price           5024 non-null float64
loblaw_volume          5024 non-null float64
metro_price            5025 non-null float64
metro_volume           5025 non-null float64
empa_price             5025 non-null float64
empa_volume            5025 non-null float64
gwl_price              5024 non-null float64
gwl_volume             5024 non-null float64
atd_price              5025 non-null float64
atd_volume             5025 non-null float64
tsx_price              5025 non-null float64
tsx_volume             5025 non-null float64
sp500_price            4986 non-null float64
sp500_volume           4986 non-null float64
BCPI                   1030 non-null float64
CPI                    237 non-null float64
bank_interest          1030 non-null float64
CEER                   5421 non-null float64
trend_grocery_store    190 non-null float64
trend_loblaws         

In [14]:
data_extracted.describe()

Unnamed: 0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
count,5024.0,5024.0,5025.0,5025.0,5025.0,5025.0,5024.0,5024.0,5025.0,5025.0,5025.0,5025.0,4986.0,4986.0,1030.0,237.0,1030.0,5421.0,190.0,190.0,190.0
mean,35.614063,529489.7,16.226961,702097.8,13.328838,228355.1,71.211612,115664.2,9.980139,2031178.0,11839.875345,172009500.0,1560.950916,3087643000.0,474.326641,115.605907,4.006359,119.529555,51.421053,38.878947,26.221053
std,12.903062,591971.3,15.04459,804787.1,8.344615,297398.1,21.421331,103384.5,12.29385,2336902.0,2969.689206,75723800.0,569.467685,1478108000.0,133.807475,11.783501,1.397609,13.244301,15.005169,8.555519,20.481081
min,0.0,0.0,1.0977,0.0,0.0,0.0,0.0,0.0,0.2445,0.0,0.0,0.0,676.53,356070000.0,222.7,93.5,2.25,95.43,34.0,24.0,11.0
25%,26.301225,251200.0,4.0492,346200.0,5.5121,60750.0,56.00395,58400.0,1.5268,953298.0,9278.5,123891500.0,1149.667575,1662200000.0,365.9825,105.4,3.0,110.3,41.25,35.0,14.0
50%,32.5264,414450.0,8.9606,545000.0,11.5014,148048.0,68.72695,92000.0,2.7502,1560404.0,12268.2998,171979500.0,1356.7,3212905000.0,451.6,115.6,3.7,118.33,46.0,38.0,17.0
75%,44.075475,644500.0,23.1487,853800.0,20.089,302466.0,88.391075,139300.0,17.6148,2368018.0,14209.5996,213323100.0,1969.28,3917825000.0,580.9875,125.7,4.75,131.31,60.0,41.0,26.75
max,75.77,14825200.0,58.3084,31572000.0,37.2337,5316156.0,118.5684,1635000.0,43.7946,65852270.0,16899.6992,858888100.0,3025.8601,11456230000.0,911.94,137.0,7.5,150.48,100.0,100.0,100.0


<a class="anchor" id="2"></a>
    
#### 2. Transform the data

In [15]:
def transform_data(data_extracted,
                              stock_list=['loblaw', 'metro', 'empa', 'gwl', 'atd', 'tsx', 'sp500'],
                              indecis_list=['BCPI', 'CPI', 'bank_interest'], 
                              exchange_list=['CEER'], 
                              trend_list=['grocery_store', 'loblaws', 'stock']):
    '''
    Transform the extracted data into a usable form.

    Steps, in order:

    1. Put the frame on a business-day calendar and fill gaps by
       interpolation (pandas' default method).
    2. Keep rows from 2003 onwards, then discard the first remaining row.
    3. Drop every row in which any column whose name contains ``price`` or
       ``volume`` equals 0.

    Parameters
    ----------
    data_extracted : pandas.DataFrame
        Frame with a DatetimeIndex (required by ``resample``). It is not
        modified; a new frame is returned.
    stock_list, indecis_list, exchange_list, trend_list : list of str
        Accepted for interface compatibility; not used by the current
        implementation.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    '''
    # NOTE(review): interpolation runs before the zero rows are removed, so a
    # 0 that sits next to a gap is still used as an interpolation endpoint.
    # Confirm whether zeros should be masked before interpolating.
    data_daily = data_extracted.resample('B').interpolate()

    # Keep 2003 onwards, then skip the first remaining row.
    # Presumably that row is 2003-01-01, a market holiday whose values are
    # purely interpolated -- TODO confirm.
    data_window = data_daily.loc['2003':].iloc[1:]

    market_columns = [column for column in data_window.columns
                      if ('price' in column) or ('volume' in column)]

    # An earlier approach replaced a 0 with the mean of the neighbouring
    # non-zero values; the decision now is to drop such rows instead. One
    # boolean mask removes them all at once, rather than dropping row by row
    # in place on a slice of the frame.
    has_zero = (data_window[market_columns] == 0).any(axis=1)

    return data_window.loc[~has_zero].copy()


In [16]:
# Clean the merged frame. The result is bound to a new name, so the raw
# `data_extracted` stays available for comparison.
data_transformed = transform_data(data_extracted)

In [17]:
# Summary of the raw frame again, for comparison with the transformed frame.
data_extracted.describe()

Unnamed: 0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
count,5024.0,5024.0,5025.0,5025.0,5025.0,5025.0,5024.0,5024.0,5025.0,5025.0,5025.0,5025.0,4986.0,4986.0,1030.0,237.0,1030.0,5421.0,190.0,190.0,190.0
mean,35.614063,529489.7,16.226961,702097.8,13.328838,228355.1,71.211612,115664.2,9.980139,2031178.0,11839.875345,172009500.0,1560.950916,3087643000.0,474.326641,115.605907,4.006359,119.529555,51.421053,38.878947,26.221053
std,12.903062,591971.3,15.04459,804787.1,8.344615,297398.1,21.421331,103384.5,12.29385,2336902.0,2969.689206,75723800.0,569.467685,1478108000.0,133.807475,11.783501,1.397609,13.244301,15.005169,8.555519,20.481081
min,0.0,0.0,1.0977,0.0,0.0,0.0,0.0,0.0,0.2445,0.0,0.0,0.0,676.53,356070000.0,222.7,93.5,2.25,95.43,34.0,24.0,11.0
25%,26.301225,251200.0,4.0492,346200.0,5.5121,60750.0,56.00395,58400.0,1.5268,953298.0,9278.5,123891500.0,1149.667575,1662200000.0,365.9825,105.4,3.0,110.3,41.25,35.0,14.0
50%,32.5264,414450.0,8.9606,545000.0,11.5014,148048.0,68.72695,92000.0,2.7502,1560404.0,12268.2998,171979500.0,1356.7,3212905000.0,451.6,115.6,3.7,118.33,46.0,38.0,17.0
75%,44.075475,644500.0,23.1487,853800.0,20.089,302466.0,88.391075,139300.0,17.6148,2368018.0,14209.5996,213323100.0,1969.28,3917825000.0,580.9875,125.7,4.75,131.31,60.0,41.0,26.75
max,75.77,14825200.0,58.3084,31572000.0,37.2337,5316156.0,118.5684,1635000.0,43.7946,65852270.0,16899.6992,858888100.0,3025.8601,11456230000.0,911.94,137.0,7.5,150.48,100.0,100.0,100.0


In [18]:
data_transformed.describe()

Unnamed: 0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
count,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4109.0,4109.0,4109.0
mean,37.14731,569575.7,18.913607,719495.6,15.287601,248284.7,73.409281,120961.0,11.813733,2196316.0,12595.170847,186047000.0,1625.598647,3407853000.0,506.35046,118.998076,3.682573,124.28216,51.306065,38.841198,26.564944
std,13.070197,578920.5,15.011572,825824.2,7.716913,284012.4,20.817372,103977.3,12.626393,2226329.0,2532.315722,71194530.0,589.046743,1348808000.0,119.141573,9.753498,1.123584,10.066851,14.789069,8.27557,20.758822
min,16.836,24500.0,2.9753,11400.0,3.8082,2385.0,33.3835,5000.0,0.5754,65598.0,6228.6001,182400.0,676.53,487220000.0,277.38,102.00625,2.25,96.7,34.0,24.0,11.0
25%,26.4889,292900.0,6.6967,372500.0,8.6472,78990.0,56.6802,63800.0,2.1679,1179834.0,11424.5996,146611300.0,1182.76995,2567540000.0,407.79,111.609302,3.0,116.29,41.325581,35.581395,14.05
50%,34.1875,455500.0,12.06665,563100.0,14.2318,177021.0,70.0101,96200.0,3.7536,1680192.0,12816.0,184423400.0,1400.5,3400350000.0,498.164,119.872727,3.0,123.32,46.0,38.023256,16.619048
75%,48.5461,682000.0,33.3836,859700.0,21.2685,326556.0,93.3011,144300.0,25.9386,2493000.0,14595.6499,221256000.0,2061.72,4065230000.0,604.926,127.004651,4.25,133.83,60.069767,40.625,26.043478
max,75.77,14825200.0,58.3084,31572000.0,37.2337,4790751.0,118.5684,1635000.0,43.7946,65852270.0,16899.6992,858888100.0,3025.8601,11456230000.0,911.94,137.0,6.25,150.48,100.0,100.0,100.0


In [19]:
data_transformed.head()

Unnamed: 0_level_0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-01-02,31.6037,134900.0,3.1212,101400.0,5.1538,17982.0,65.7853,133000.0,0.7302,296784.0,6740.1001,76406700.0,909.03,1229200000.0,305.898,102.00625,4.5,96.7,,,
2003-01-03,31.3404,48500.0,3.1387,273600.0,5.1797,34761.0,65.7062,9300.0,0.7091,2904300.0,6772.7002,74780400.0,908.59,1130800000.0,306.836,102.0125,4.5,97.08,,,
2003-01-06,31.5452,232700.0,3.1247,161700.0,5.1971,64512.0,65.8931,99400.0,0.7036,541584.0,6837.2998,142266300.0,929.01,1435900000.0,307.774,102.01875,4.5,97.34,,,
2003-01-07,31.6037,842600.0,3.1387,945900.0,5.2317,366657.0,66.1089,54400.0,0.7004,685632.0,6802.7998,150351200.0,922.93,1545200000.0,308.712,102.025,4.5,97.42,,,
2003-01-08,31.5452,423900.0,3.1212,301800.0,5.1624,179112.0,65.8212,227400.0,0.6759,1697196.0,6723.1001,145587400.0,909.93,1467600000.0,309.65,102.03125,4.5,97.21,,,


In [20]:
data_transformed.tail()

Unnamed: 0_level_0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-10-21,71.42,455500.0,55.927,320200.0,34.92,491852.0,109.16,99100.0,39.27,748298.0,16418.5,155105700.0,3006.72,3271620000.0,432.33,136.8,3.95,117.75,80.0,34.0,64.0
2019-10-22,70.77,613100.0,55.6879,247800.0,35.02,469645.0,108.15,104800.0,39.03,1083171.0,16391.5,171979500.0,2995.99,3523890000.0,432.33,136.8,3.95,117.75,80.0,34.0,64.0
2019-10-23,69.85,595400.0,55.1,516000.0,34.45,467906.0,106.7,106000.0,39.53,1228126.0,16335.9004,181906700.0,3004.52,3392870000.0,432.33,136.8,3.95,117.75,80.0,34.0,64.0
2019-10-24,69.22,785600.0,54.95,296500.0,34.46,368114.0,106.01,80300.0,39.08,1151289.0,16369.2998,167159100.0,3010.29,3692600000.0,432.33,136.8,3.95,117.75,80.0,34.0,64.0
2019-10-25,69.6,1068100.0,54.92,338100.0,34.75,352352.0,106.06,73000.0,39.81,1262531.0,16404.5,173518300.0,3022.55,3370370000.0,432.33,136.8,3.95,117.75,80.0,34.0,64.0


In [21]:
# Write the cleaned frame to disk; this is the file load_data reads back below.
data_transformed.to_csv('data/data_transformed.csv')

<a class="anchor" id="3"></a>

#### 3. Load the Data

In [22]:
def load_data(data_transformed_filename):
    '''
    Read the transformed data back from a csv file.

    The ``date`` column is parsed as datetimes and becomes the index of the
    returned DataFrame.
    '''
    return pd.read_csv(
        data_transformed_filename,
        index_col='date',
        parse_dates=['date'],
    )

In [23]:
# Read the saved file back to confirm the round trip through csv.
data_loaded = load_data("data/data_transformed.csv")

In [24]:
data_loaded.head()

Unnamed: 0_level_0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-01-02,31.6037,134900.0,3.1212,101400.0,5.1538,17982.0,65.7853,133000.0,0.7302,296784.0,6740.1001,76406700.0,909.03,1229200000.0,305.898,102.00625,4.5,96.7,,,
2003-01-03,31.3404,48500.0,3.1387,273600.0,5.1797,34761.0,65.7062,9300.0,0.7091,2904300.0,6772.7002,74780400.0,908.59,1130800000.0,306.836,102.0125,4.5,97.08,,,
2003-01-06,31.5452,232700.0,3.1247,161700.0,5.1971,64512.0,65.8931,99400.0,0.7036,541584.0,6837.2998,142266300.0,929.01,1435900000.0,307.774,102.01875,4.5,97.34,,,
2003-01-07,31.6037,842600.0,3.1387,945900.0,5.2317,366657.0,66.1089,54400.0,0.7004,685632.0,6802.7998,150351200.0,922.93,1545200000.0,308.712,102.025,4.5,97.42,,,
2003-01-08,31.5452,423900.0,3.1212,301800.0,5.1624,179112.0,65.8212,227400.0,0.6759,1697196.0,6723.1001,145587400.0,909.93,1467600000.0,309.65,102.03125,4.5,97.21,,,


In [25]:
data_loaded.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4361 entries, 2003-01-02 to 2019-10-25
Data columns (total 21 columns):
loblaw_price           4361 non-null float64
loblaw_volume          4361 non-null float64
metro_price            4361 non-null float64
metro_volume           4361 non-null float64
empa_price             4361 non-null float64
empa_volume            4361 non-null float64
gwl_price              4361 non-null float64
gwl_volume             4361 non-null float64
atd_price              4361 non-null float64
atd_volume             4361 non-null float64
tsx_price              4361 non-null float64
tsx_volume             4361 non-null float64
sp500_price            4361 non-null float64
sp500_volume           4361 non-null float64
BCPI                   4361 non-null float64
CPI                    4361 non-null float64
bank_interest          4361 non-null float64
CEER                   4361 non-null float64
trend_grocery_store    4109 non-null float64
trend_loblaws       

In [26]:
data_loaded.describe()

Unnamed: 0,loblaw_price,loblaw_volume,metro_price,metro_volume,empa_price,empa_volume,gwl_price,gwl_volume,atd_price,atd_volume,tsx_price,tsx_volume,sp500_price,sp500_volume,BCPI,CPI,bank_interest,CEER,trend_grocery_store,trend_loblaws,trend_stock
count,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4109.0,4109.0,4109.0
mean,37.14731,569575.7,18.913607,719495.6,15.287601,248284.7,73.409281,120961.0,11.813733,2196316.0,12595.170847,186047000.0,1625.598647,3407853000.0,506.35046,118.998076,3.682573,124.28216,51.306065,38.841198,26.564944
std,13.070197,578920.5,15.011572,825824.2,7.716913,284012.4,20.817372,103977.3,12.626393,2226329.0,2532.315722,71194530.0,589.046743,1348808000.0,119.141573,9.753498,1.123584,10.066851,14.789069,8.27557,20.758822
min,16.836,24500.0,2.9753,11400.0,3.8082,2385.0,33.3835,5000.0,0.5754,65598.0,6228.6001,182400.0,676.53,487220000.0,277.38,102.00625,2.25,96.7,34.0,24.0,11.0
25%,26.4889,292900.0,6.6967,372500.0,8.6472,78990.0,56.6802,63800.0,2.1679,1179834.0,11424.5996,146611300.0,1182.76995,2567540000.0,407.79,111.609302,3.0,116.29,41.325581,35.581395,14.05
50%,34.1875,455500.0,12.06665,563100.0,14.2318,177021.0,70.0101,96200.0,3.7536,1680192.0,12816.0,184423400.0,1400.5,3400350000.0,498.164,119.872727,3.0,123.32,46.0,38.023256,16.619048
75%,48.5461,682000.0,33.3836,859700.0,21.2685,326556.0,93.3011,144300.0,25.9386,2493000.0,14595.6499,221256000.0,2061.72,4065230000.0,604.926,127.004651,4.25,133.83,60.069767,40.625,26.043478
max,75.77,14825200.0,58.3084,31572000.0,37.2337,4790751.0,118.5684,1635000.0,43.7946,65852270.0,16899.6992,858888100.0,3025.8601,11456230000.0,911.94,137.0,6.25,150.48,100.0,100.0,100.0
