Skip to content

Commit

Permalink
Merge branch 'tests' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
milcent committed Mar 27, 2020
2 parents e0ae5d4 + 758f28c commit fd3c78b
Show file tree
Hide file tree
Showing 6 changed files with 422 additions and 10 deletions.
2 changes: 1 addition & 1 deletion benford/benford.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Base(DataFrame):
TypeError: if not receiving `int` or `float` as input.
"""
def __init__(self, data, decimals, sign='all', sec_order=False):

DataFrame.__init__(self, {'Seq': data})

if (self.Seq.dtypes != 'float64') & (self.Seq.dtypes != 'int64'):
Expand Down
30 changes: 27 additions & 3 deletions benford/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from .constants import digs_dict, rev_digs, confs

def _check_digs_(digs):
"""Chhecks the possible values for the digs of the First Digits test1
"""Checks the possible values for the digs parameter of the
First Digits tests
"""
if digs not in [1, 2, 3]:
raise ValueError("The value assigned to the parameter -digs- "
Expand All @@ -30,6 +31,24 @@ def _check_test_(test):
f'values are\n {list(digs_dict.keys())} for ints and'
f'\n {list(rev_digs.keys())} for strings.')

def _check_decimals_(decimals):
""""""
if isinstance(decimals, int):
if (decimals < 0):
raise ValueError("Parameter -decimals- must be an int >= 0, or 'infer'.")
else:
if decimals != 'infer':
raise ValueError("Parameter -decimals- must be an int >= 0, or 'infer'.")
return decimals

def _check_sign_(sign):
""""""
if sign not in ['all', 'pos', 'neg']:
raise ValueError("Parameter -sign- must be one of the following: "
"'all', 'pos' or 'neg'.")
return sign


def _check_confidence_(confidence):
""""""
if confidence not in confs.keys():
Expand All @@ -55,10 +74,15 @@ def _check_num_array_(data):
except:
raise ValueError('Could not convert data. Check input.')
print('\nConversion successful.')
elif (data.dtype != int) & (data.dtype != float):
print("\n`data` type not int nor float. Trying to convert...")

try:
data = data.astype(float)
except:
raise ValueError('Could not convert data. Check input.')
else:
if data.dtype not in [int, float]:
try:
data = data.astype(float)
except:
raise ValueError('Could not convert data. Check input.')
return data
2 changes: 1 addition & 1 deletion benford/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def KS(frame, confidence, N, verbose=True):
Returns:
The Suprem, which is the greatest absolute difference between the
Found end th expected proportions, and the Kolmogorov-Smirnov
Found and the expected proportions, and the Kolmogorov-Smirnov
critical value according to the confidence level, for comparison
"""
if confidence is None:
Expand Down
74 changes: 69 additions & 5 deletions benford/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from pandas import Series, DataFrame
from numpy import array, arange, log10, ndarray
from .expected import _test_
from .constants import digs_dict
from .constants import digs_dict, rev_digs
from .stats import Z_score
from .checks import _check_num_array_, _check_sign_, _check_decimals_


def _set_N_(len_df, limit_N):
Expand Down Expand Up @@ -58,25 +59,88 @@ def input_data(given):
raise TypeError("Wrong data input type. Check docstring.")
return data, chosen

def set_sign(data, sign="all"):
    """Filters the records of a DataFrame according to their sign.

    Args:
        data: DataFrame with a `Seq` column that can be compared to zero.
        sign: 'all' keeps every record different from zero, 'pos' keeps
            only the records greater than zero and 'neg' only the ones
            smaller than zero.

    Returns:
        A new DataFrame with the selected rows, without the rows that
        have missing values. The DataFrame received is left untouched.

    Raises:
        ValueError: raised by `_check_sign_` when -sign- is invalid.
    """
    sign = _check_sign_(sign)

    if sign == 'all':
        keep = data.Seq != 0
    elif sign == 'pos':
        keep = data.Seq > 0
    else:
        keep = data.Seq < 0

    # Select whole rows with the boolean mask rather than assigning a
    # shortened Series to the column: such an assignment re-aligns on the
    # index, fills the excluded rows with NaN (turning an int64 column
    # into float64) and alters the DataFrame owned by the caller.
    return data.loc[keep].dropna()


def get_times_10_power(data, decimals=2):
    """Adds the `ZN` column, with the absolute values of `Seq` as integers.

    Args:
        data: DataFrame with a numeric `Seq` column. It receives the `ZN`
            column in place.
        decimals: only used when `Seq` is not of an integer dtype. If an
            integer, the absolute values are multiplied by 10 ** decimals
            and truncated to integers. If 'infer', the digits are read
            from the string form of each absolute value, without the
            decimal point and the leading zeros, limited to the first
            five characters.

    Returns:
        The same DataFrame received, now with the `ZN` column.

    Raises:
        ValueError: raised by `_check_decimals_` when -decimals- is
            invalid.
    """
    decimals = _check_decimals_(decimals)

    ab = data.Seq.abs()

    # Every signed or unsigned integer dtype is used as is, whatever its
    # width, so integer records are never scaled by 10 ** decimals.
    if data.Seq.dtype.kind in 'iu':
        data['ZN'] = ab
    elif decimals == 'infer':
        # regex=False: the dot is removed as a literal character, never
        # read as the "any character" regular expression.
        data['ZN'] = ab.astype(str).str\
            .replace('.', '', regex=False)\
            .str.lstrip('0')\
            .str[:5].astype(int)
    else:
        data['ZN'] = (ab * (10 ** decimals)).astype(int)
    return data


def extract_digs(data, decimals=2, sign="all"):
    """Builds a DataFrame with the digits extracted from the numbers.

    Args:
        data: sequence of numbers; it is validated (and possibly
            converted) by `_check_num_array_`.
        decimals: forwarded to `get_times_10_power`.
        sign: forwarded to `set_sign`.

    Returns:
        DataFrame with the columns `Seq`, `ZN`, `F1D`, `F2D`, `F3D`, `SD`
        and `L2D`. In the digits columns, -1 marks the records whose `ZN`
        is too small to provide that position.
    """
    df = DataFrame({'Seq': _check_num_array_(data)})

    # keep only the records with the chosen sign
    df = set_sign(df, sign=sign)

    df = get_times_10_power(df, decimals=decimals)

    # First digits
    for col in ['F1D', 'F2D', 'F3D']:
        # presumably the number of leading digits for the test (1, 2 or 3)
        # -- confirm against constants.rev_digs
        n_digs = rev_digs[col]
        temp = df.ZN.loc[df.ZN >= 10 ** (n_digs - 1)]
        df[col] = (temp // 10 ** ((log10(temp).astype(int)) -
                                  (n_digs - 1)))
        # fill NANs with -1, which is a non-usable value for digits,
        # to be discarded later.
        df[col] = df[col].fillna(-1).astype(int)
    # Second digit
    temp_sd = df.loc[df.ZN >= 10]
    df['SD'] = (temp_sd.ZN // 10**((log10(temp_sd.ZN)).astype(int) -
                                   1)) % 10
    df['SD'] = df['SD'].fillna(-1).astype(int)
    # Last two digits
    temp_l2d = df.loc[df.ZN >= 1000]
    df['L2D'] = temp_l2d.ZN % 100
    df['L2D'] = df['L2D'].fillna(-1).astype(int)
    return df


def prepare(data, digs, limit_N=None, simple=False, confidence=None):
"""Transforms the original number sequence into a DataFrame reduced
by the occurrences of the chosen digits, creating other computed
columns
"""
N = _set_N_(len(data), limit_N=limit_N)

# get the number of occurrences of the digits
v = data.value_counts()
counts = data.value_counts()
# get their relative frequencies
p = data.value_counts(normalize=True)
proportions = data.value_counts(normalize=True)
# create dataframe from them
dd = DataFrame({'Counts': v, 'Found': p}).sort_index()
dd = DataFrame({'Counts': counts, 'Found': proportions}).sort_index()
# join the dataframe with the one of expected Benford's frequencies
dd = _test_(digs).join(dd).fillna(0)
# create column with absolute differences
dd['Dif'] = dd.Found - dd.Expected
dd['AbsDif'] = dd.Dif.abs()
print(dd.Found)
if simple:
del dd['Dif']
return dd
Expand Down

0 comments on commit fd3c78b

Please sign in to comment.