## Reading Financial Data From Different Sources

### The Data Set

In [4]:
# Relative path to the AAPL sample CSV that the cells below read.
fn = "../data/AAPL.csv"

In [5]:
# Peek at the raw text: grab the first five lines (header + four rows),
# then echo them unchanged. Each line keeps its own trailing newline,
# so nothing extra is appended when printing.
with open(fn, 'r') as raw_file:
    preview_lines = [raw_file.readline() for _ in range(5)]
print(''.join(preview_lines), end='')

Date,HIGH,CLOSE,LOW,OPEN,COUNT,VOLUME
2020-04-01,248.72,240.91,239.13,246.5,460606.0,44054638.0
2020-04-02,245.15,244.93,236.9,240.34,380294.0,41483493.0
2020-04-03,245.7,241.41,238.9741,242.8,293699.0,32470017.0
2020-04-06,263.11,262.47,249.38,250.9,486681.0,50455071.0


### Reading from a CSV File with Python

In [19]:
import csv

In [20]:
# Read the whole CSV into a list of rows (each row is a list of strings).
# The file is opened in a `with` block so the handle is closed
# deterministically; the reader must therefore be consumed while the file
# is still open. `newline=''` is the setting the csv module documents for
# files passed to csv.reader.
with open(fn, 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    data = list(csv_reader)

In [22]:
data[:5]

[['Date', 'HIGH', 'CLOSE', 'LOW', 'OPEN', 'COUNT', 'VOLUME'],
 ['2020-04-01',
  '248.72',
  '240.91',
  '239.13',
  '246.5',
  '460606.0',
  '44054638.0'],
 ['2020-04-02',
  '245.15',
  '244.93',
  '236.9',
  '240.34',
  '380294.0',
  '41483493.0'],
 ['2020-04-03',
  '245.7',
  '241.41',
  '238.9741',
  '242.8',
  '293699.0',
  '32470017.0'],
 ['2020-04-06',
  '263.11',
  '262.47',
  '249.38',
  '250.9',
  '486681.0',
  '50455071.0']]

In [26]:
# Read the CSV into a list of dicts keyed by the header row (all values
# remain strings). The file is opened in a `with` block so the handle is
# closed deterministically; the reader is consumed while the file is still
# open. `newline=''` is the setting the csv module documents for files
# passed to its readers.
with open(fn, 'r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    data = list(csv_reader)

In [28]:
data[:5]

[{'Date': '2020-04-01',
  'HIGH': '248.72',
  'CLOSE': '240.91',
  'LOW': '239.13',
  'OPEN': '246.5',
  'COUNT': '460606.0',
  'VOLUME': '44054638.0'},
 {'Date': '2020-04-02',
  'HIGH': '245.15',
  'CLOSE': '244.93',
  'LOW': '236.9',
  'OPEN': '240.34',
  'COUNT': '380294.0',
  'VOLUME': '41483493.0'},
 {'Date': '2020-04-03',
  'HIGH': '245.7',
  'CLOSE': '241.41',
  'LOW': '238.9741',
  'OPEN': '242.8',
  'COUNT': '293699.0',
  'VOLUME': '32470017.0'},
 {'Date': '2020-04-06',
  'HIGH': '263.11',
  'CLOSE': '262.47',
  'LOW': '249.38',
  'OPEN': '250.9',
  'COUNT': '486681.0',
  'VOLUME': '50455071.0'},
 {'Date': '2020-04-07',
  'HIGH': '271.7',
  'CLOSE': '259.43',
  'LOW': '259.0',
  'OPEN': '270.8',
  'COUNT': '467375.0',
  'VOLUME': '50721831.0'}]

In [29]:
# Average closing price in plain Python. Every CSV field is a string, so
# each CLOSE value is converted to float before it is summed.
mean = sum(float(row['CLOSE']) for row in data) / len(data)
mean

272.38619047619045

### Reading from a CSV File with pandas

In [30]:
import pandas as pd

In [31]:
# Load the same CSV with pandas: the first column (Date) becomes the index
# and is parsed into datetimes, giving the frame a DatetimeIndex.
# Note: this rebinds `data`, which until now held the list of dicts
# produced by csv.DictReader.
data = pd.read_csv(fn, index_col=0, parse_dates=True)

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 21 entries, 2020-04-01 to 2020-04-30
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   HIGH    21 non-null     float64
 1   CLOSE   21 non-null     float64
 2   LOW     21 non-null     float64
 3   OPEN    21 non-null     float64
 4   COUNT   21 non-null     float64
 5   VOLUME  21 non-null     float64
dtypes: float64(6)
memory usage: 1.1 KB


In [33]:
data.tail()

Unnamed: 0_level_0,HIGH,CLOSE,LOW,OPEN,COUNT,VOLUME
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-24,283.01,282.97,277.0,277.2,306176.0,31627183.0
2020-04-27,284.54,283.17,279.95,281.8,300771.0,29271893.0
2020-04-28,285.83,278.58,278.2,285.08,285384.0,28001187.0
2020-04-29,289.67,287.73,283.89,284.73,324890.0,34320204.0
2020-04-30,294.53,293.8,288.35,289.96,471129.0,45765968.0


In [34]:
data.index

DatetimeIndex(['2020-04-01', '2020-04-02', '2020-04-03', '2020-04-06',
               '2020-04-07', '2020-04-08', '2020-04-09', '2020-04-13',
               '2020-04-14', '2020-04-15', '2020-04-16', '2020-04-17',
               '2020-04-20', '2020-04-21', '2020-04-22', '2020-04-23',
               '2020-04-24', '2020-04-27', '2020-04-28', '2020-04-29',
               '2020-04-30'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [35]:
# Same average as the pure-Python calculation earlier. The two results differ
# only in the last digits (...045 vs ...056), presumably because pandas
# accumulates the floats differently; this is ordinary floating-point noise.
data['CLOSE'].mean()

272.38619047619056

### Exporting to Excel and JSON

In [37]:
# NOTE(review): the Excel export is disabled. Presumably the legacy .xls
# writer engine was not available in this environment (TODO confirm).
# The target path ('data/...') also differs from the '../data/' directory
# the CSV is read from.
#data.to_excel('data/aapl.xls', 'AAPL')

In [41]:
# Serialize the DataFrame to a JSON file in the current working directory;
# the file is read back with pd.read_json in the next section.
data.to_json('./aapl.json')

### Reading from Excel and JSON

In [42]:
# NOTE(review): disabled together with the Excel export. This notebook never
# writes 'data/aapl.xls' (the to_excel call is commented out), so reading it
# would fail unless the file already exists from an earlier run.
#excel_data = pd.read_excel('data/aapl.xls', 'AAPL', index_col=0)
#excel_data.head()

In [44]:
# Round trip: load the JSON file written above back into a DataFrame.
# In the displayed result COUNT and VOLUME come back as integers, whereas
# the CSV-based frame held them as float64.
json_data = pd.read_json('./aapl.json')
json_data.head()

Unnamed: 0,HIGH,CLOSE,LOW,OPEN,COUNT,VOLUME
2020-04-01,248.72,240.91,239.13,246.5,460606,44054638
2020-04-02,245.15,244.93,236.9,240.34,380294,41483493
2020-04-03,245.7,241.41,238.9741,242.8,293699,32470017
2020-04-06,263.11,262.47,249.38,250.9,486681,50455071
2020-04-07,271.7,259.43,259.0,270.8,467375,50721831


## Working with Open Data Sources

In [47]:
%load_ext dotenv
%dotenv
import os

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [48]:
# Read the Quandl API key from the process environment (presumably filled
# in from a .env file by the %dotenv magic). The value is None when the
# variable is not set.
quandl_api_key = os.getenv("QUANDL_API_KEY")

In [49]:
import quandl as q

In [50]:
# Download the full history of the 'BCHAIN/MKPRU' dataset via the quandl
# client. As data.info() shows below, the result is a DataFrame with a
# DatetimeIndex and a single float64 column named 'Value'.
data = q.get('BCHAIN/MKPRU', api_key=quandl_api_key)

In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5361 entries, 2009-01-02 to 2023-09-06
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Value   5361 non-null   float64
dtypes: float64(1)
memory usage: 83.8 KB


In [53]:
# Last available observation in each calendar-year bin ('A' is the year-end
# frequency alias; the result is labelled "Freq: A-DEC").
# NOTE(review): newer pandas releases deprecate the 'A' alias in favour of
# 'YE'. Confirm against the pandas version in use before changing it.
data['Value'].resample('A').last()

Date
2009-12-31        0.000000
2010-12-31        0.299998
2011-12-31        4.470000
2012-12-31       13.570000
2013-12-31      746.900000
2014-12-31      315.700000
2015-12-31      428.230000
2016-12-31      958.120000
2017-12-31    14165.575000
2018-12-31     3791.545833
2019-12-31     7219.600000
2020-12-31    28856.590000
2021-12-31    47132.960000
2022-12-31    16599.690000
2023-12-31    25785.300000
Freq: A-DEC, Name: Value, dtype: float64

In [55]:
# Fetch the 'FSE/SAP_X' dataset restricted to a fixed date window; the
# arguments are listed one per line for readability.
data = q.get(
    'FSE/SAP_X',
    start_date='2018-1-1',
    end_date='2020-05-01',
    api_key=quandl_api_key,
)

In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 579 entries, 2018-01-02 to 2020-04-30
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Open                   257 non-null    float64
 1   High                   579 non-null    float64
 2   Low                    579 non-null    float64
 3   Close                  579 non-null    float64
 4   Change                 0 non-null      object 
 5   Traded Volume          533 non-null    float64
 6   Turnover               533 non-null    float64
 7   Last Price of the Day  0 non-null      object 
 8   Daily Traded Units     0 non-null      object 
 9   Daily Turnover         0 non-null      object 
dtypes: float64(6), object(4)
memory usage: 49.8+ KB


In [57]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Change,Traded Volume,Turnover,Last Price of the Day,Daily Traded Units,Daily Turnover
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-02,93.35,93.95,92.12,92.8,,2334984.0,216571734.0,,,
2018-01-03,93.0,94.65,92.63,94.07,,2346852.0,219877195.0,,,
2018-01-11,,93.74,91.04,91.43,,4274991.0,393498865.0,,,
2018-01-12,,91.47,90.27,90.73,,3437210.0,312013502.0,,,
2018-01-15,,90.61,89.71,90.06,,2053812.0,185176806.0,,,


In [58]:
# Register the key once on the client's module-level config so that later
# calls (e.g. q.get_table below) do not need an explicit api_key argument.
q.ApiConfig.api_key = quandl_api_key

In [61]:
# Query the 'QUANTCHA/VOL' datatable, filtered to a single date and ticker.
# No api_key is passed here, so the call relies on q.ApiConfig.api_key.
vol = q.get_table('QUANTCHA/VOL', date='2018-12-31', ticker='MSFT')

In [62]:
vol.iloc[:, :10].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ticker  1 non-null      object        
 1   date    1 non-null      datetime64[ns]
 2   hv10    1 non-null      float64       
 3   hv20    1 non-null      float64       
 4   hv30    1 non-null      float64       
 5   hv60    1 non-null      float64       
 6   hv90    1 non-null      float64       
 7   hv120   1 non-null      float64       
 8   hv150   1 non-null      float64       
 9   hv180   1 non-null      float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 208.0+ bytes


In [64]:
vol[['ivmean30', 'ivmean60', 'ivmean90']].tail()

Unnamed: 0_level_0,ivmean30,ivmean60,ivmean90
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.376,0.3519,0.331
