# CPS600 - Python Programming for Finance 
###  
<img src="https://www.syracuse.edu/wp-content/themes/g6-carbon/img/syracuse-university-seal.svg?ver=6.3.9" style="width: 200px;"/>

# Visualization (& Munging)

###  October 2, 2018

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from os import listdir
from os.path import isfile, join
import glob
import re
%matplotlib inline

## Some `DataFrame`s

First, we take a look at the `pandas` library and its `DataFrame` objects.

Well, `DataFrame`s actually start with `Series`.

In [None]:
s = pd.Series([1,3,5,np.nan,6,8])
s

Now we'll create a `DataFrame` having a datetime index. First, the index, which is an object in its own right.

In [None]:
dates = pd.date_range('20130101', periods=6)
dates

In [None]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Creating a `DataFrame` from a dictionary.

In [None]:
# Build a DataFrame from a dict: scalars are broadcast to the length of the
# index, which is taken from the Series in column 'C'.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})
df2

Note that these are different from arrays in at least one important way:

In [None]:
df2.dtypes

We can look at attributes and methods using *tab completion*

In [None]:
type(df2.A)

Let's look at a piece of it.

In [None]:
df.head(2)

In [17]:
g = df.tail(1)

In [None]:
g['A'] = 1

In [None]:
df

In [None]:
g

The *index* keeps track of the rows...

In [None]:
df.index

And the columns, for that matter

In [None]:
df.columns

In [None]:
df2.values

In [None]:
df.values

The `describe` method provides summary statistics.

In [None]:
df.describe()

Taking the transpose is easy:

In [None]:
df.T.describe()

We can sort by the labels along an axis (here `axis=1`, i.e. the column labels, in descending order)

In [None]:
df.sort_index(axis=1, ascending=False) #Returning a copy

We can sort by a specific column (that we name)

In [None]:
df.sort_values(by='B') #Returning a copy

### Selection

Here are two different ways to ask for the same column (sort of)

In [None]:
type(df['A'])

In [None]:
df.A

This slices the rows:

In [None]:
df[0:3]

In [None]:
df['20130102':'20130104']

More expressive selections are possible using `.loc`

In [None]:
df.loc[dates[0]]

In [None]:
df.loc[:,['A','B']]

In [None]:
type(df.loc['20130102':'20130104',['A','B']])

In [None]:
type(df.loc['20130102',['A','B']])

A single 'entry'

In [None]:
type(df.loc[dates[0],'A'])

Equivalently,

In [None]:
df.at[dates[0],'A']

We can also select by location via `.iloc`

In [None]:
df.iloc[3]

In [None]:
df.iloc[3:5,0:2]

In [None]:
df.iloc[[1,2,4],[0,2]]

In [None]:
df.iloc[1:3,:]

In [None]:
df.iloc[:,1:3]

Again, accessing a single entry.

In [None]:
df.iloc[1,1]

In [None]:
df.iat[1,1]

Now it gets more interesting. This is where we can do SQLesque things.

In [None]:
df[df.B > 0]

In [None]:
df[df > 0]

Another way to filter

In [None]:
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']

In [None]:
df2[df2['E'].isin(['two','four'])]

### Setting

In [67]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

In [68]:
df['F'] = s1 # Adding a column (called F)

In [69]:
df.at[dates[0],'A'] = 0 # Changing an entry

In [71]:
df.iat[0,1] = 0 # Also changing an entry

In [73]:
df.loc[:,'D'] = np.array([5] * len(df))

In [None]:
df

Here, setting values in places where a certain condition holds

In [75]:
df2 = df.copy()

In [76]:
df2[df2 > 0] = -df2

In [None]:
df2

In [78]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1

We can drop the missing values (`np.nan`).

In [None]:
df1

In [None]:
df1.dropna(how='any')

Only a single row has no missing values, so only that row remains.

Alternatively, we could fill in the missing values

In [83]:
df1.fillna(value=5, inplace=True)

Or, we might be interested in a dataframe of bools

In [None]:
pd.isna(df1)

### Statistics

In [None]:
# Column-wise mean of the frame. No trailing comma: a trailing comma would
# wrap the result in a 1-tuple and the notebook would show the tuple repr
# instead of the Series.
df.mean()
# Row-wise alternative: df.mean(1)

### Apply

We can apply a function row-wise (or column-wise) to a data frame

In [None]:
df.apply(lambda x: x.max() - x.min(),axis=1)

### Merge, join & concatenate

While we're at it, here is a handy way to generate a random data frame quickly:

In [90]:
df = pd.DataFrame(np.random.randn(10, 4))

Next we create a list of data frames - some pieces of the original.

In [91]:
pieces = [df[:3], df[3:7], df[7:]]

Then we concatenate them.

In [None]:
df

In [None]:
pd.concat(pieces) == df

We can also join two separate tables that share some kind of a column

In [98]:
# Two small tables that share a 'key' column; each carries its own value column.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})

In [None]:
pd.merge(left, right, on='key',how='inner')

Another case to consider is *appending*

In [101]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])

In [None]:
s = df.iloc[3]
s

In [None]:
s

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Turning the Series into a one-row frame and concatenating it gives the
# same result (the row is added at the end and the index is renumbered).
pd.concat([df, s.to_frame().T], ignore_index=True)

### Split-Apply-Combine

No demo of SQLesque operations would be complete without a *grouping* example

In [106]:
# Sample frame for grouping: two label columns ('A', 'B') and two columns of
# standard-normal random values ('C', 'D').
df = pd.DataFrame({
    'A': 'foo bar foo bar foo bar foo foo'.split(),
    'B': 'one one two three two two one three'.split(),
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [None]:
df.groupby('A').sum()

In [None]:
df.groupby(['A','B']).sum()

## Two-Dimensional Plotting

### One-Dimensional Data Set

In [2]:
# Fix the seed of NumPy's global random number generator so that every run
# of this cell yields the same pseudo-random sequence.
np.random.seed(1000)

# Draw 20 standard-normally distributed (pseudo)random numbers; the result
# is a one-dimensional NumPy ndarray.
y = np.random.standard_normal(size=20)

In [None]:
y

In [None]:
print (len(y))
x = range(len(y))
# x values: a list or an array containing the x coordinates 
# y values: a list or an array containing the y coordinates

plt.plot(x, y)
# title: Plot given x- and y-values

In [None]:
x

In [None]:
plt.plot(y)
# y is an ndarray; when only y is passed, its index positions serve as the x values.
# title: Plot given data as 1d-array

In [None]:
plt.plot(y.cumsum()) 
# cumulative sum
# title: Plot given a 1d-array with method attached

In [None]:
plt.plot(y.cumsum())
plt.grid(True)  # adds a grid
plt.axis('tight')  # adjusts the axis ranges
# axis : on , off, equal, scaled, tight, image, [xmin, xmax,ymin, ymax]
# title: Plot with grid and tight axes

<table> 
<tr><td> Parameter</td> <td> Description </td></tr>
<tr><td>on </td> <td>   Turns axis lines and labels on</td></tr>
<tr><td>off   </td> <td>   Turns axis lines and labels off</td></tr>
<tr><td>equal </td> <td>   Leads to equal scaling</td></tr>
<tr><td>scaled </td> <td> Equal scaling via dimension changes</td></tr>
<tr><td>tight  </td> <td>  Makes all data visible (tightens limits)</td></tr>
<tr><td>image </td> <td>   Makes all data visible (with data limits)</td></tr>
<tr><td>[xmin, xmax, ymin, ymax] </td> <td>  Sets limits to given ( list of) values</td></tr>
</table>

In [None]:
# Draw the cumulative sum before adjusting the axes; without a plot call the
# limits below would be applied to an empty set of axes.
plt.plot(y.cumsum())
plt.grid(True)
# Custom limits: one unit of padding around the data on both axes.
plt.xlim(-1, 20)
plt.ylim(np.min(y.cumsum()) - 1,
         np.max(y.cumsum()) + 1)
# title: Plot with custom axes limits

In [None]:
plt.figure(figsize=(7, 4))     # size of the figure x=7 units, y = 4 units
plt.plot(y.cumsum(), 'b', lw=1.5)  # blue, line width 1.5 points
plt.plot(y.cumsum(), 'ro')     # red circle marker
plt.grid(True)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')
# colors: b (blue), g (green), r (red), y (yellow), 
#         w (white), c (cyan), m (magenta), k (black)
# title: Plot with typical labels

### Two-Dimensional Data Set

In [11]:
# Two-dimensional sample data set: seed the generator for reproducibility,
# draw a 20x2 ndarray of standard-normally distributed pseudo-random numbers
# and take the cumulative sum along the first dimension (axis 0).
np.random.seed(2000)
y = np.cumsum(np.random.standard_normal(size=(20, 2)), axis=0)

In [None]:
y

In [None]:
# Plot both columns of y as lines, then overlay red diamond markers on the
# same data points.
plt.figure(figsize=(7, 4))
plt.plot(y, lw=1.5)

#plt.plot(y, 'ro')      # Circle marker
plt.plot(y, 'rD')       # Diamond marker
#plt.plot(y, 'r*')      # Star marker
                        # try also ^ 1 2 3 4 s p + x | and more

plt.grid(True)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')
# title: Plot with two data sets

In [14]:
#    -    Solid line style
#    --   Dashed line style
#    -.   Dash-dot line style
#    :    Dotted line style
#    .    Point marker
#    ,    Pixel marker
#    o    Circle marker
#    v    Triangle_down marker
#    ^    Triangle_up marker
#    <    Triangle_left marker
#    >    Triangle_right marker
#    1    Tri_down marker
#    2    Tri_up marker
#    3    Tri_left marker
#    4    Tri_right marker
#    s    Square marker
#    p    Pentagon marker
#    *    Star marker
#    h    Hexagon1 marker
#    H    Hexagon2 marker
#    +    Plus marker
#    x    X marker
#    D    Diamond marker
#    d    Thin diamond marker
#    |    Vline marker

In [None]:
plt.figure(figsize=(7, 4))
plt.plot(y[:, 0], lw=1.5, label='1st')  # first column
plt.plot(y[:, 1], lw=1.5, label='2nd')  # second column
plt.plot(y, 'ro')  # red circle markers
plt.grid(True)
plt.legend(loc=0)  # location - 0= best possible ; 0-10
plt.axis('tight')  # tight limits
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')
# title: Plot with labeled data sets

In [16]:
#    Empty    Automatic
#    0        Best possible
#    1        Upper right
#    2        Upper left
#    3        Lower left
#    4        Lower right
#    5        Right
#    6        Center left
#    7        Center right
#    8        Lower center
#    9        Upper center
#    10       Center

In [None]:
y[:, 0] = y[:, 0] * 100
plt.figure(figsize=(7, 4))
plt.plot(y[:, 0], lw=1.5, label='1st')
plt.plot(y[:, 1], lw=1.5, label='2nd')
plt.plot(y, 'ro')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')
# title: Plot with two differently scaled data sets

In [None]:
# Two data sets share one x-axis but each gets its own y-axis:
# the first on the left, the second on the right.
fig, ax1 = plt.subplots()  # figure plus the primary (left) axes object

# first data set, drawn on the first (left) axis
ax1.plot(y[:, 0], 'b', lw=1.5, label='1st')
ax1.plot(y[:, 0], 'ro')
ax1.grid(True)
ax1.legend(loc=8)  # location: lower center
ax1.axis('tight')
ax1.set_xlabel('index')
ax1.set_ylabel('value 1st')
ax1.set_title('A Simple Plot')

# second data set, drawn on a twin axis (right-hand side)
ax2 = ax1.twinx()
ax2.plot(y[:, 1], 'g', lw=1.5, label='2nd')
ax2.plot(y[:, 1], 'ro')
ax2.legend(loc=0)  # location: best possible
ax2.set_ylabel('value 2nd')
# title: Plot with two data sets and two y-axes

In [None]:
plt.figure(figsize=(7, 5))
plt.subplot(211)  # picture: 2 rows, 1 column, 1 -figure number
plt.plot(y[:, 0], lw=1.5, label='1st') # first data set
plt.plot(y[:, 0], 'ro')
plt.grid(True)
plt.legend(loc=0) # location - best possible
plt.axis('tight')
plt.ylabel('value')
plt.title('A Simple Plot')
plt.subplot(212)  # picture: 2 rows, 1 column, 2 -figure number
plt.plot(y[:, 1], 'g', lw=1.5, label='2nd')  #second data set
plt.plot(y[:, 1], 'ro')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
# title: Plot with two sub-plots

In [None]:
# Side-by-side sub-plots: a line/point plot of the first column on the left
# and a bar chart of the second column on the right.
plt.figure(figsize=(9, 4))
plt.subplot(121)  # layout: 1 row, 2 columns; sub-plot number 1
plt.plot(y[:, 0], lw=1.5, label='1st')
plt.plot(y[:, 0], 'ro')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('1st Data Set')

plt.subplot(122)   # layout: 1 row, 2 columns; sub-plot number 2
plt.bar(np.arange(len(y)), y[:, 1], width=0.5,
        color='g', label='2nd')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.xlabel('index')
plt.title('2nd Data Set')
# title: Plot combining line/point sub-plot with bar sub-plot
# size: 80

### Other Plot Styles

In [None]:
y = np.random.standard_normal((1000, 2))
# 2 dimensional data set

In [None]:
y

In [None]:
plt.figure(figsize=(7, 5))
plt.plot(y[:, 0], y[:, 1], 'ro')
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')
# title: Scatter plot via +plot+ function

In [None]:
plt.figure(figsize=(7, 5))
plt.scatter(y[:, 0], y[:, 1], marker='o')
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')
# title: Scatter plot via +scatter+ function

In [None]:
plt.figure(figsize=(7, 5))
plt.scatter(y[:, 0], y[:, 1], marker='o')
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')
# title: Scatter plot via +scatter+ function

In [25]:
c = np.random.randint(0, 10, len(y))
# return random integers from low (inclusive) to high (exclusive)
# returns numpy ndarray with len(y) items

In [None]:
c

In [None]:
plt.figure(figsize=(7, 5))
plt.scatter(y[:, 0], y[:, 1], c=c, marker='o') 
# c=c is a color parameter
plt.colorbar()   # add color bar
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')
# title: Scatter plot with third dimension

In [None]:
plt.figure(figsize=(7, 4))
plt.hist(y, label=['1st', '2nd'], bins=25)
plt.grid(True)
plt.legend(loc=0)
plt.xlabel('value')
plt.ylabel('frequency')
plt.title('Histogram')
# title: Histogram for two data sets

In [None]:
plt.figure(figsize=(7, 4))
plt.hist(y, label=['1st', '2nd'], color=['b', 'g'],
            stacked=True, bins=20)
plt.grid(True)
plt.legend(loc=0)
plt.xlabel('value')
plt.ylabel('frequency')
plt.title('Histogram')
# title: Stacked histogram for two data sets

In [None]:
y = np.random.standard_normal((1000, 2))
# 2 dimensional data set

fig, ax = plt.subplots(figsize=(7, 4))
plt.boxplot(y)
plt.grid(True)
plt.setp(ax, xticklabels=['1st', '2nd'])
plt.xlabel('data set')
plt.ylabel('value')
plt.title('Boxplot')

In [None]:
from matplotlib.patches import Polygon
def func(x):
    """Return ``0.5 * exp(x) + 1`` for a scalar or NumPy array *x*."""
    scaled_exp = 0.5 * np.exp(x)
    return scaled_exp + 1

# Plot func on [0, 2] and shade the area under the curve between a and b.
a, b = 0.5, 1.5  # integral limits
x = np.linspace(0, 2)
y = func(x)

fig, ax = plt.subplots(figsize=(7, 5))
plt.plot(x, y, 'b', linewidth=2)
# The `ymin` keyword was deprecated in Matplotlib 3.0 and later removed;
# `bottom` is the supported name for the lower y-limit.
plt.ylim(bottom=0)

# Illustrate the integral value, i.e. the area under the function
# between lower and upper limit
Ix = np.linspace(a, b)
Iy = func(Ix)
# Polygon outline: down to the x-axis at a, along the curve, back down at b.
verts = [(a, 0)] + list(zip(Ix, Iy)) + [(b, 0)]
poly = Polygon(verts, facecolor='0.7', edgecolor='0.5')
ax.add_patch(poly)

plt.text(0.5 * (a + b), 1, r"$\int_a^b f(x)\mathrm{d}x$",
         horizontalalignment='center', fontsize=30)

# Axis captions placed in figure coordinates.
plt.figtext(0.9, 0.075, '$x$')
plt.figtext(0.075, 0.9, '$f(x)$')

# Only the integral limits and their function values get tick marks.
ax.set_xticks((a, b))
ax.set_xticklabels(('$a$', '$b$'))
ax.set_yticks([func(a), func(b)])
ax.set_yticklabels(('$f(a)$', '$f(b)$'))
plt.grid(True)
# title: Exponential function, integral area and Latex labels
# size: 60

In [None]:
print(x)
print (y)

In [None]:
print(verts)

## Financial Plots

In [9]:
import numpy as np
import pandas as pd
import mpl_finance as mpf

In [44]:
# Date window of interest; the filter below excludes both bounds.
start = datetime.datetime(2014, 1, 1)
end = datetime.datetime(2014, 2, 26)

# Getting some quotes data
quotes = pd.read_csv('yahoofinance-AAPL.csv')

# We need datetime values for filtering. pd.to_datetime parses the date
# strings directly, so the '-' separators do not have to be stripped first.
# NOTE(review): assumes 'Date' holds 'YYYY-MM-DD' style strings, as the
# previous split('-') implied - confirm against the CSV.
quotes['Date'] = pd.to_datetime(quotes['Date'])

# Here filtering
quotes = quotes[(quotes['Date'] > start) & (quotes['Date'] < end)]

In [47]:
# Date window of interest; the filter below excludes both bounds.
start = datetime.datetime(2016, 1, 1)
end = datetime.datetime(2016, 3, 31)

quotes = pd.read_csv('yahoofinance-GOOG.csv')

# We need datetime values for filtering. pd.to_datetime parses the date
# strings directly, so the '-' separators do not have to be stripped first.
# NOTE(review): assumes 'Date' holds 'YYYY-MM-DD' style strings, as the
# previous split('-') implied - confirm against the CSV.
quotes['Date'] = pd.to_datetime(quotes['Date'])

# Here filtering
quotes = quotes[(quotes['Date'] > start) & (quotes['Date'] < end)]

# Plain ndarray copy of the filtered rows.
y = np.array(quotes)

## 3d Plotting

In [59]:
# Create a two-dimensional coordinate system: NumPy's meshgrid function
# builds it out of two one-dimensional ndarray objects (24 points each).
X, Y = np.meshgrid(np.linspace(50, 150, 24), np.linspace(0.5, 2.5, 24))

In [None]:
X[:2]

In [63]:
Z = (X - 100) ** 2 / (100 * X) / Y

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(9, 6))
# Figure.gca() no longer accepts a `projection` keyword (deprecated in
# Matplotlib 3.4, removed in 3.6); create the 3d axes explicitly instead.
ax = fig.add_subplot(111, projection='3d')

# Surface over the (X, Y) grid, coloured by the coolwarm colormap.
surf = ax.plot_surface(X, Y, Z, rstride=2, cstride=2,
                       cmap=plt.cm.coolwarm, linewidth=0.5,
                       antialiased=True)

ax.set_xlabel('strike')
ax.set_ylabel('time-to-maturity')
ax.set_zlabel('implied volatility')

fig.colorbar(surf, shrink=0.5, aspect=5)
# title: 3d surface plot for (fake) implied volatilities
# size: 70

In [None]:
#    Parameters for plot_surface
#    ===========================
#    Parameter     Description
#    ---------     -----------
#    X, Y, Z       Data values as 2D arrays
#    rstride       Array row stride (step size)
#    cstride       Array column stride (step size)
#    color         Color of the surface patches
#    cmap          A colormap for the surface patches
#    facecolors    Face colors for the individual patches
#    norm          An instance of Normalize to map values to colors
#    vmin          Minimum value to map
#    vmax          Maximum value to map
#    shade         Whether to shade the face colors

In [None]:
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(111, projection='3d')
ax.view_init(30, 60)  # viewing angles: elevation 30, azimuth 60

# c='b' gives every marker one fixed colour, so there is no data for a
# colormap to map; passing `cmap` as well is ignored and makes Matplotlib
# emit a warning, hence it is left out.
ax.scatter(X, Y, Z, zdir='z', s=25, c='b', marker='^')

ax.set_xlabel('strike')
ax.set_ylabel('time-to-maturity')
ax.set_zlabel('implied volatility')
# title: 3d scatter plot for (fake) implied volatilities
# size: 70

## Further Reading

In [None]:
# %load http://matplotlib.org/mpl_examples/pie_and_polar_charts/pie_demo_features.py
"""
Demo of a basic pie chart plus a few additional features.

In addition to the basic pie chart, this demo shows a few optional features:

    * slice labels
    * auto-labeling the percentage
    * offsetting a slice with "explode"
    * drop-shadow
    * custom start angle

Note about the custom start angle:

The default ``startangle`` is 0, which would start the "Frogs" slice on the
positive x-axis. This example sets ``startangle = 90`` such that everything is
rotated counter-clockwise by 90 degrees, and the frog slice starts on the
positive y-axis.
"""
import matplotlib.pyplot as plt

# The slices will be ordered and plotted counter-clockwise.
labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
explode = (0, 0.1, 0, 0)  # only "explode" the 2nd slice (i.e. 'Hogs')

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
# Set aspect ratio to be equal so that pie is drawn as a circle.
plt.axis('equal')

fig = plt.figure()
ax = fig.gca()
import numpy as np

ax.pie(np.random.random(4), explode=explode, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=90,
       radius=0.25, center=(0, 0), frame=True)
ax.pie(np.random.random(4), explode=explode, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=90,
       radius=0.25, center=(1, 1), frame=True)
ax.pie(np.random.random(4), explode=explode, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=90,
       radius=0.25, center=(0, 1), frame=True)
ax.pie(np.random.random(4), explode=explode, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=90,
       radius=0.25, center=(1, 0), frame=True)

ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
ax.set_xticklabels(["Sunny", "Cloudy"])
ax.set_yticklabels(["Dry", "Rainy"])
ax.set_xlim((-0.5, 1.5))
ax.set_ylim((-0.5, 1.5))

# Set aspect ratio to be equal so that pie is drawn as a circle.
ax.set_aspect('equal')

plt.show()


In [None]:
from matplotlib import image
from matplotlib import pyplot as plot

# Read an image file into an array and display it.
# 'isabelle.png' is a placeholder: replace it with an image of your own.
img = image.imread('isabelle.png')
plot.imshow(img)
plot.show()

* The home page of matplotlib is, of course, the best starting point:   http://matplotlib.org.
* There’s a gallery with many useful examples: http://matplotlib.org/gallery.html.
* A tutorial for 2D plotting is found here:    http://matplotlib.org/users/pyplot_tutorial.html.
* Another one for 3D plotting is here:         http://matplotlib.org/mpl_toolkits/mplot3d/tutorial.html.