Merge branch 'tests' into develop

milcent · Mar 27, 2020 · fd3c78b · fd3c78b
2 parents e0ae5d4 + 758f28c
commit fd3c78b
Show file tree

Hide file tree

Showing 6 changed files with 422 additions and 10 deletions.
diff --git a/benford/benford.py b/benford/benford.py
@@ -35,7 +35,7 @@ class Base(DataFrame):
         TypeError: if not receiving `int` or `float` as input.
     """
     def __init__(self, data, decimals, sign='all', sec_order=False):
-
+        
         DataFrame.__init__(self, {'Seq': data})
 
         if (self.Seq.dtypes != 'float64') & (self.Seq.dtypes != 'int64'):

diff --git a/benford/checks.py b/benford/checks.py
@@ -3,7 +3,8 @@
 from .constants import digs_dict, rev_digs, confs
 
 def _check_digs_(digs):
-    """Chhecks the possible values for the digs of the First Digits test1
+    """Checks the possible values for the digs parameter of the
+    First Digits tests
     """
     if digs not in [1, 2, 3]:
         raise ValueError("The value assigned to the parameter -digs- "
@@ -30,6 +31,24 @@ def _check_test_(test):
                          f'values are\n {list(digs_dict.keys())} for ints and'
                          f'\n {list(rev_digs.keys())} for strings.')
 
+def _check_decimals_(decimals):
+    """Validates the value given to the -decimals- parameter.
+
+    Args:
+        decimals: a non-negative int, or the string 'infer'.
+
+    Returns:
+        The value received, unchanged, when it is valid.
+
+    Raises:
+        ValueError: if `decimals` is a negative int, or if it is neither
+            an int nor the string 'infer'.
+    """
+    message = "Parameter -decimals- must be an int >= 0, or 'infer'."
+    if isinstance(decimals, int):
+        if decimals < 0:
+            raise ValueError(message)
+    elif decimals != 'infer':
+        raise ValueError(message)
+    return decimals
+
+def _check_sign_(sign):
+    """Validates the value given to the -sign- parameter.
+
+    Args:
+        sign: one of the strings 'all', 'pos' or 'neg'.
+
+    Returns:
+        The value received, unchanged, when it is valid.
+
+    Raises:
+        ValueError: if `sign` is not 'all', 'pos' nor 'neg'.
+    """
+    if sign in ('all', 'pos', 'neg'):
+        return sign
+    raise ValueError("Parameter -sign- must be one of the following: "
+                     "'all', 'pos' or 'neg'.")
+
+
 def _check_confidence_(confidence):
     """"""
     if confidence not in confs.keys():
@@ -55,10 +74,15 @@ def _check_num_array_(data):
         except:
             raise ValueError('Could not convert data. Check input.')
         print('\nConversion successful.')
-    elif (data.dtype != int) & (data.dtype != float):
-        print("\n`data` type not int nor float. Trying to convert...")
+
         try:
             data = data.astype(float)
         except:
             raise ValueError('Could not convert data. Check input.')
+    else:
+        if data.dtype not in [int, float]:
+            try:
+                data = data.astype(float)
+            except:
+                raise ValueError('Could not convert data. Check input.')
     return data
diff --git a/benford/stats.py b/benford/stats.py
@@ -76,7 +76,7 @@ def KS(frame, confidence, N, verbose=True):
     
     Returns:
         The Suprem, which is the greatest absolute difference between the
-            Found end th expected proportions, and the Kolmogorov-Smirnov
+            Found and the expected proportions, and the Kolmogorov-Smirnov
             critical value according to the confidence level, for comparison
     """
     if confidence is None:

diff --git a/benford/utils.py b/benford/utils.py
@@ -1,8 +1,9 @@
 from pandas import Series, DataFrame
 from numpy import array, arange, log10, ndarray
 from .expected import _test_
-from .constants import digs_dict
+from .constants import digs_dict, rev_digs
 from .stats import Z_score
+from .checks import _check_num_array_, _check_sign_, _check_decimals_
 
 
 def _set_N_(len_df, limit_N):
@@ -58,25 +59,88 @@ def input_data(given):
         raise TypeError("Wrong data input type. Check docstring.")
     return data, chosen
 
+def set_sign(data, sign="all"):
+    """Filters the records of the sequence according to their sign.
+
+    Args:
+        data: DataFrame holding the numbers to be filtered in a `Seq`
+            column.
+        sign: which records to keep: 'all' discards only the zeros; 'pos'
+            keeps only the positive records; 'neg' keeps only the negative
+            ones. Defaults to 'all'.
+
+    Returns:
+        A new DataFrame with only the selected records, and without rows
+            containing NaNs.
+
+    Raises:
+        ValueError: (from _check_sign_) if `sign` is not 'all', 'pos'
+            nor 'neg'.
+    """
+    sign = _check_sign_(sign)
+
+    if sign == 'all':
+        keep = data.Seq != 0
+    elif sign == 'pos':
+        keep = data.Seq > 0
+    else:
+        keep = data.Seq < 0
+
+    # Select whole rows with the boolean mask instead of assigning the
+    # filtered Series back to the column: that assignment re-aligns on the
+    # index and places NaN on the discarded rows, which upcasts an int64
+    # `Seq` to float64 and modifies the DataFrame received.
+    return data.loc[keep].dropna()
+
+
+def get_times_10_power(data, decimals=2):
+    """Adds the `ZN` column, with the records turned into non-negative
+    integers from which the digits can be extracted.
+
+    Args:
+        data: DataFrame holding the numbers in a `Seq` column. It is
+            modified in place, receiving the `ZN` column.
+        decimals: number of decimal places to preserve when `Seq` is not
+            int64, or 'infer'. Defaults to 2.
+
+    Returns:
+        The same DataFrame received, with the `ZN` column added:
+            - int64 `Seq`: the absolute values;
+            - otherwise, with decimals='infer': up to the five first
+              characters of the absolute values written as strings, after
+              removing the decimal point and the leading zeros;
+            - otherwise: the absolute values multiplied by 10 to the power
+              of `decimals` and cast to int.
+
+    Raises:
+        ValueError: (from _check_decimals_) if `decimals` is not an
+            int >= 0 nor 'infer'.
+    """
+    decimals = _check_decimals_(decimals)
+
+    ab = data.Seq.abs()
 
-def prepare(data, digs, limit_N, simple=False, confidence=None):
+    if data.Seq.dtypes == 'int64':
+        data['ZN'] = ab
+    else:
+        if decimals == 'infer':
+            # regex=False makes '.' a literal decimal point. Taken as a
+            # regular expression it matches any character, and the
+            # replacement would erase every digit of the string.
+            data['ZN'] = ab.astype(str).str\
+                            .replace('.', '', regex=False)\
+                            .str.lstrip('0')\
+                            .str[:5].astype(int)
+        else:
+            data['ZN'] = (ab * (10 ** decimals)).astype(int)
+    return data
+
+
+def extract_digs(data, decimals=2, sign="all"):
+    """Builds a DataFrame with the digits extracted from each record.
+
+    Args:
+        data: sequence of numbers. It is validated / converted by
+            _check_num_array_ before being used.
+        decimals: number of decimal places to consider, or 'infer'.
+            Forwarded to get_times_10_power. Defaults to 2.
+        sign: which records to keep: 'all', 'pos' or 'neg'. Forwarded to
+            set_sign. Defaults to 'all'.
+
+    Returns:
+        DataFrame with the columns `Seq`, `ZN`, `F1D`, `F2D`, `F3D`, `SD`
+            and `L2D`. A record too short for one of the digits columns
+            holds -1 in that column.
+    """
+    df = DataFrame({'Seq': _check_num_array_(data)})
+
+    df = set_sign(df, sign=sign)
+
+    df = get_times_10_power(df, decimals=decimals)
+
+    # First digits
+    # NOTE(review): rev_digs[col] is presumably the number of leading
+    # digits of each test (1, 2, 3) -- confirm in constants.
+    for col in ['F1D', 'F2D', 'F3D']:
+        temp = df.ZN.loc[df.ZN >= 10 ** (rev_digs[col] - 1)]
+        df[col] = (temp // 10 ** ((log10(temp).astype(int)) -
+                                    (rev_digs[col] - 1)))
+        # fill NANs with -1, which is a non-usable value for digits,
+        # to be discarded later.
+        df[col] = df[col].fillna(-1).astype(int)
+    # Second digit
+    temp_sd = df.loc[df.ZN >= 10]
+    df['SD'] = (temp_sd.ZN // 10**((log10(temp_sd.ZN)).astype(int) -
+                                        1)) % 10
+    df['SD'] = df['SD'].fillna(-1).astype(int)
+    # Last two digits, taken only from records with ZN >= 1000
+    temp_l2d = df.loc[df.ZN >= 1000]
+    df['L2D'] = temp_l2d.ZN % 100
+    df['L2D'] = df['L2D'].fillna(-1).astype(int)
+    return df
+
+
+def prepare(data, digs, limit_N=None, simple=False, confidence=None):
     """Transforms the original number sequence into a DataFrame reduced
     by the occurrences of the chosen digits, creating other computed
     columns
     """
     N = _set_N_(len(data), limit_N=limit_N)
 
     # get the number of occurrences of the digits
-    v = data.value_counts()
+    counts = data.value_counts()
     # get their relative frequencies
-    p = data.value_counts(normalize=True)
+    proportions = data.value_counts(normalize=True)
     # create dataframe from them
-    dd = DataFrame({'Counts': v, 'Found': p}).sort_index()
+    dd = DataFrame({'Counts': counts, 'Found': proportions}).sort_index()
     # join the dataframe with the one of expected Benford's frequencies
     dd = _test_(digs).join(dd).fillna(0)
     # create column with absolute differences
     dd['Dif'] = dd.Found - dd.Expected
     dd['AbsDif'] = dd.Dif.abs()
+    print(dd.Found)
     if simple:
         del dd['Dif']
         return dd