Clean up stock sample data. #7645

Merged
merged 1 commit into from Jan 13, 2017
@@ -0,0 +1,13 @@
+Cleanup of stock sample data
+````````````````````````````
+
+The sample data of stocks has been cleaned up to remove redundancies and
+increase portability. The ``AAPL.dat.gz``, ``INTC.dat.gz`` and ``aapl.csv``
+files have been removed entirely and will also no longer be available from
+`matplotlib.cbook.get_sample_data`. If a CSV file is required, we suggest using
+the ``msft.csv`` that continues to be shipped in the sample data. If a NumPy
+binary file is acceptable, we suggest using one of the following two new files.
+The ``aapl.npy.gz`` and ``goog.npy`` files have been replaced by ``aapl.npz``
+and ``goog.npz``, wherein the first column's type has changed from
+`datetime.date` to `np.datetime64` for better portability across Python
+versions. Note that Matplotlib does not yet fully support `np.datetime64`.
View
@@ -87,28 +87,38 @@ gracefully, and here are some tricks to help you work around them.
We'll load up some sample date data which contains ``np.datetime64`` dates
in a numpy record array::
- In [63]: datafile = cbook.get_sample_data('goog.npy')
+ In [63]: datafile = cbook.get_sample_data('goog.npz')
- In [64]: r = np.load(datafile).view(np.recarray)
+ In [64]: r = np.load(datafile)['price_data'].view(np.recarray)
In [65]: r.dtype
- Out[65]: dtype([('date', '|O4'), ('', '|V4'), ('open', '<f8'),
+ Out[65]: dtype([('date', '<M8[D]'), ('', '|V4'), ('open', '<f8'),
('high', '<f8'), ('low', '<f8'), ('close', '<f8'),
('volume', '<i8'), ('adj_close', '<f8')])
In [66]: r.date
Out[66]:
- array([2004-08-19, 2004-08-20, 2004-08-23, ..., 2008-10-10, 2008-10-13,
- 2008-10-14], dtype=object)
+ array(['2004-08-19', '2004-08-20', '2004-08-23', ..., '2008-10-10',
+ '2008-10-13', '2008-10-14'], dtype='datetime64[D]')
-The dtype of the numpy record array for the field ``date`` is ``|O4``
-which means it is a 4-byte python object pointer; in this case the
-objects are datetime.date instances, which we can see when we print
-some samples in the ipython terminal window.
+The dtype of the NumPy record array for the field ``date`` is ``datetime64[D]``
+which means it is a 64-bit `np.datetime64` in 'day' units. While this format is
+more portable, Matplotlib cannot plot this format natively yet. We can plot
+this data by changing the dates to `datetime.date` instances instead, which can
+be achieved by converting to an object array::
+
+ In [67]: r.date.astype('O')
+ array([datetime.date(2004, 8, 19), datetime.date(2004, 8, 20),
+ datetime.date(2004, 8, 23), ..., datetime.date(2008, 10, 10),
+ datetime.date(2008, 10, 13), datetime.date(2008, 10, 14)],
+ dtype=object)
+
+The dtype of this converted array is now ``object`` and it is filled with
+datetime.date instances instead.
If you plot the data, ::
- In [67]: plot(r.date, r.close)
+ In [67]: plot(r.date.astype('O'), r.close)
Out[67]: [<matplotlib.lines.Line2D object at 0x92a6b6c>]
you will see that the x tick labels are all squashed together.
@@ -117,18 +127,12 @@ you will see that the x tick labels are all squashed together.
:context:
import matplotlib.cbook as cbook
- datafile = cbook.get_sample_data('goog.npy')
- try:
- # Python3 cannot load python2 .npy files with datetime(object) arrays
- # unless the encoding is set to bytes. Hovever this option was
- # not added until numpy 1.10 so this example will only work with
- # python 2 or with numpy 1.10 and later.
- r = np.load(datafile, encoding='bytes').view(np.recarray)
- except TypeError:
- # Old Numpy
- r = np.load(datafile).view(np.recarray)
+ with cbook.get_sample_data('goog.npz') as datafile:
+ r = np.load(datafile)['price_data'].view(np.recarray)
+ # Matplotlib prefers datetime instead of np.datetime64.
+ date = r.date.astype('O')
plt.figure()
- plt.plot(r.date, r.close)
+ plt.plot(date, r.close)
plt.title('Default date handling can cause overlapping labels')
Another annoyance is that if you hover the mouse over the window and
@@ -149,7 +153,7 @@ a number of date formatters built in, so we'll use one of those.
plt.close('all')
fig, ax = plt.subplots(1)
- ax.plot(r.date, r.close)
+ ax.plot(date, r.close)
# rotate and align the tick labels so they look better
fig.autofmt_xdate()
@@ -186,22 +190,17 @@ right.
import matplotlib.cbook as cbook
# load up some sample financial data
- datafile = cbook.get_sample_data('goog.npy')
- try:
- # Python3 cannot load python2 .npy files with datetime(object) arrays
- # unless the encoding is set to bytes. Hovever this option was
- # not added until numpy 1.10 so this example will only work with
- # python 2 or with numpy 1.10 and later.
- r = np.load(datafile, encoding='bytes').view(np.recarray)
- except TypeError:
- r = np.load(datafile).view(np.recarray)
+ with cbook.get_sample_data('goog.npz') as datafile:
+ r = np.load(datafile)['price_data'].view(np.recarray)
+ # Matplotlib prefers datetime instead of np.datetime64.
+ date = r.date.astype('O')
# create two subplots with the shared x and y axes
fig, (ax1, ax2) = plt.subplots(1,2, sharex=True, sharey=True)
pricemin = r.close.min()
- ax1.plot(r.date, r.close, lw=2)
- ax2.fill_between(r.date, pricemin, r.close, facecolor='blue', alpha=0.5)
+ ax1.plot(date, r.close, lw=2)
+ ax2.fill_between(date, pricemin, r.close, facecolor='blue', alpha=0.5)
for ax in ax1, ax2:
ax.grid(True)
View
@@ -24,31 +24,26 @@
months = mdates.MonthLocator() # every month
yearsFmt = mdates.DateFormatter('%Y')
-# load a numpy record array from yahoo csv data with fields date,
-# open, close, volume, adj_close from the mpl-data/example directory.
-# The record array stores python datetime.date as an object array in
-# the date column
-datafile = cbook.get_sample_data('goog.npy')
-try:
- # Python3 cannot load python2 .npy files with datetime(object) arrays
- # unless the encoding is set to bytes. However this option was
- # not added until numpy 1.10 so this example will only work with
- # python 2 or with numpy 1.10 and later.
- r = np.load(datafile, encoding='bytes').view(np.recarray)
-except TypeError:
- r = np.load(datafile).view(np.recarray)
+# Load a numpy record array from yahoo csv data with fields date, open, close,
+# volume, adj_close from the mpl-data/example directory. The record array
+# stores the date as an np.datetime64 with a day unit ('D') in the date column.
+with cbook.get_sample_data('goog.npz') as datafile:
+ r = np.load(datafile)['price_data'].view(np.recarray)
+# Matplotlib works better with datetime.date than np.datetime64, but the
+# latter is more portable.
+date = r.date.astype('O')
fig, ax = plt.subplots()
-ax.plot(r.date, r.adj_close)
+ax.plot(date, r.adj_close)
# format the ticks
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
ax.xaxis.set_minor_locator(months)
-datemin = datetime.date(r.date.min().year, 1, 1)
-datemax = datetime.date(r.date.max().year + 1, 1, 1)
+datemin = datetime.date(date.min().year, 1, 1)
+datemax = datetime.date(date.max().year + 1, 1, 1)
ax.set_xlim(datemin, datemax)
@@ -10,22 +10,23 @@
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
-import matplotlib.mlab as mlab
import matplotlib.cbook as cbook
import matplotlib.ticker as ticker
-datafile = cbook.get_sample_data('aapl.csv', asfileobj=False)
-print('loading %s' % datafile)
-r = mlab.csv2rec(datafile)
-
-r.sort()
+# Load a numpy record array from yahoo csv data with fields date, open, close,
+# volume, adj_close from the mpl-data/example directory. The record array
+# stores the date as an np.datetime64 with a day unit ('D') in the date column.
+with cbook.get_sample_data('goog.npz') as datafile:
+ r = np.load(datafile)['price_data'].view(np.recarray)
r = r[-30:] # get the last 30 days
-
+# Matplotlib works better with datetime.date than np.datetime64, but the
+# latter is more portable.
+date = r.date.astype('O')
# first we'll do it the default way, with gaps on weekends
fig, axes = plt.subplots(ncols=2, figsize=(8, 4))
ax = axes[0]
-ax.plot(r.date, r.adj_close, 'o-')
+ax.plot(date, r.adj_close, 'o-')
ax.set_title("Default")
fig.autofmt_xdate()
@@ -36,7 +37,7 @@
def format_date(x, pos=None):
thisind = np.clip(int(x + 0.5), 0, N - 1)
- return r.date[thisind].strftime('%Y-%m-%d')
+ return date[thisind].strftime('%Y-%m-%d')
ax = axes[1]
ax.plot(ind, r.adj_close, 'o-')
@@ -3,7 +3,7 @@
import matplotlib.mlab as mlab
import matplotlib.cbook as cbook
-datafile = cbook.get_sample_data('aapl.csv', asfileobj=False)
+datafile = cbook.get_sample_data('msft.csv', asfileobj=False)
print('loading', datafile)
r = mlab.csv2rec(datafile)
r.sort()
@@ -3,7 +3,7 @@
import matplotlib.mlab as mlab
import matplotlib.cbook as cbook
-datafile = cbook.get_sample_data('aapl.csv', asfileobj=False)
+datafile = cbook.get_sample_data('msft.csv', asfileobj=False)
print('loading', datafile)
r = mlab.csv2rec(datafile)
@@ -19,20 +19,15 @@
import matplotlib.pyplot as plt
# load some financial data; apple's stock price
-fh = cbook.get_sample_data('aapl.npy.gz')
-try:
- # Python3 cannot load python2 .npy files with datetime(object) arrays
- # unless the encoding is set to bytes. However this option was
- # not added until numpy 1.10 so this example will only work with
- # python 2 or with numpy 1.10 and later.
- r = np.load(fh, encoding='bytes')
-except TypeError:
- r = np.load(fh)
-fh.close()
+with cbook.get_sample_data('aapl.npz') as fh:
+ r = np.load(fh)['price_data'].view(np.recarray)
r = r[-250:] # get the last 250 days
+# Matplotlib works better with datetime.date than np.datetime64, but the
+# latter is more portable.
+date = r.date.astype('O')
fig, ax = plt.subplots()
-ax.plot(r.date, r.adj_close)
+ax.plot(date, r.adj_close)
ax.xaxis.set_major_locator(dates.MonthLocator())
ax.xaxis.set_minor_locator(dates.MonthLocator(bymonthday=15))
@@ -46,5 +41,5 @@
tick.label1.set_horizontalalignment('center')
imid = len(r)//2
-ax.set_xlabel(str(r.date[imid].year))
+ax.set_xlabel(str(date[imid].year))
plt.show()
@@ -5,19 +5,11 @@
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
-# Load a numpy record array from yahoo csv data with fields date,
-# open, close, volume, adj_close from the mpl-data/example directory.
-# The record array stores python datetime.date as an object array in
-# the date column
-datafile = cbook.get_sample_data('goog.npy')
-try:
- # Python3 cannot load python2 .npy files with datetime(object) arrays
- # unless the encoding is set to bytes. However this option was
- # not added until numpy 1.10 so this example will only work with
- # python 2 or with numpy 1.10 and later
- price_data = np.load(datafile, encoding='bytes').view(np.recarray)
-except TypeError:
- price_data = np.load(datafile).view(np.recarray)
+# Load a numpy record array from yahoo csv data with fields date, open, close,
+# volume, adj_close from the mpl-data/example directory. The record array
+# stores the date as an np.datetime64 with a day unit ('D') in the date column.
+with cbook.get_sample_data('goog.npz') as datafile:
+ price_data = np.load(datafile)['price_data'].view(np.recarray)
price_data = price_data[-250:] # get the most recent 250 trading days
delta1 = np.diff(price_data.adj_close)/price_data.adj_close[:-1]
Binary file not shown.
Binary file not shown.
Oops, something went wrong.