### First let's import some libraries and get some public data

#### Imports

In [1]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
%matplotlib inline
import matplotlib.pyplot as plt
#%matplotlib notebook

#### Fetch S&P500 ETF market data

In [2]:
# Download SPY price/volume history from the 'yahoo' data source.
# The requested window opens on 1990-01-01; the rows actually returned
# (see the previews below) begin on 1993-01-29.
sp = web.DataReader('SPY', 'yahoo', start='1990-01-01', end='2016-10-31')

In [3]:
# Preview the first rows to check the columns that were downloaded.
sp.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-01-29,43.9687,43.9687,43.75,43.9375,1003200,28.165877
1993-02-01,43.9687,44.25,43.9687,44.25,480500,28.366203
1993-02-02,44.2187,44.375,44.125,44.3437,201300,28.426269
1993-02-03,44.4062,44.8437,44.375,44.8125,529400,28.726791
1993-02-04,44.9687,45.0937,44.4687,45.0,531500,28.846986


#### Now let's add some computed information

In [4]:
# Swap the space in 'Adj Close' for an underscore so the column can be
# reached with attribute access (sp.Adj_Close).
sp.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)

# Ratio of each adjusted close to the previous row's adjusted close;
# the first row has no predecessor, so its ratio (and both returns) is NaN.
price_ratio = sp['Adj_Close'] / sp['Adj_Close'].shift()

sp['p_r'] = price_ratio - 1       # simple returns
sp['l_r'] = np.log(price_ratio)   # log returns
sp.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj_Close,p_r,l_r
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-10-25,214.679993,214.979996,213.979996,214.169998,66542300,214.169998,-0.003351,-0.003356
2016-10-26,213.210007,214.419998,212.929993,213.740005,75705500,213.740005,-0.002008,-0.00201
2016-10-27,214.580002,214.619995,213.080002,213.169998,77220200,213.169998,-0.002667,-0.00267
2016-10-28,213.139999,213.929993,211.710007,212.539993,140623200,212.539993,-0.002955,-0.00296
2016-10-31,212.929993,213.190002,212.360001,212.550003,61272500,212.550003,4.7e-05,4.7e-05


In [31]:
# Summarise dtypes and non-null counts; p_r and l_r each have one fewer
# non-null value than the price columns because the first row has no prior close.
sp.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5984 entries, 1993-01-29 to 2016-10-31
Data columns (total 8 columns):
Open         5984 non-null float64
High         5984 non-null float64
Low          5984 non-null float64
Close        5984 non-null float64
Volume       5984 non-null int64
Adj_Close    5984 non-null float64
p_r          5983 non-null float64
l_r          5983 non-null float64
dtypes: float64(7), int64(1)
memory usage: 420.8 KB


In [5]:
import benford as bf

### Now to real usage
##### Benford's Analysis studies a series of numbers by recording the digits found at given positions in each number and comparing the proportions of those digits in the series with Benford's expected distributions.

##### The main class in the Benford module is 'Analysis'. It takes a sequence of numbers and creates a pandas DataFrame with columns that refer to the digits and their position in each number.

In [23]:
# Build the analysis object from the SPY daily trading volume (sp.Volume),
# not from the log returns.
# NOTE(review): sign='all' presumably keeps entries of every sign and dec=0
# presumably means no decimal places are considered - confirm against the
# benford documentation.
a = bf.Analysis(sp.Volume, sign='all', dec=0)
# Show the first 10 rows: the original value plus its extracted digit columns.
a.head(10)

Initialized sequence with 5984 registries.


Unnamed: 0_level_0,Seq,ZN,F1D,SD,F2D,F3D,L2D
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1993-01-29,1003200,1003200,1,0,10,100,0
1993-02-01,480500,480500,4,8,48,480,0
1993-02-02,201300,201300,2,0,20,201,0
1993-02-03,529400,529400,5,2,52,529,0
1993-02-04,531500,531500,5,3,53,531,0
1993-02-05,492100,492100,4,9,49,492,0
1993-02-08,596100,596100,5,9,59,596,0
1993-02-09,122100,122100,1,2,12,122,0
1993-02-10,379600,379600,3,7,37,379,0
1993-02-11,19500,19500,1,9,19,195,0


In [36]:
# Report duplicated volume entries, listing the 10 values with the highest
# repetition counts.
a.duplicates(top_Rep=10)

Found 66 duplicated entries
The entries with the 10 highest repitition counts are:
         Count
Entries       
390700       3
99300        3
389400       3
358200       3
66900        3
30800        2
344500       2
373700       2
353800       2
318500       2


In [37]:
# Inspect the `maps` attribute; in the saved output it is a dict of arrays
# keyed by 'F2D', 'L2D' and 'dup' (the 'dup' array matches the duplicates
# listed by the previous cell).
# NOTE(review): no visible cell above produces the 'F2D' and 'L2D' entries -
# they likely come from cells run out of order or outside this view; confirm
# with Restart Kernel -> Run All.
a.maps

{'F2D': array([40, 72, 29, 24]),
 'L2D': array([0]),
 'dup': array([390700,  99300, 389400, 358200,  66900,  30800, 344500, 373700,
        353800, 318500])}