Merge branch 'hotfix/mad_verbose_arg' into develop

milcent · Mar 14, 2020 · fc40c61 · fc40c61
2 parents f58aa37 + 8dca432
commit fc40c61
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 105 deletions.
diff --git a/benford/benford.py b/benford/benford.py
@@ -10,12 +10,11 @@
      get_mantissas
 from .expected import First, Second, LastTwo, _test_
 from .viz import _get_plot_args, plot_digs, plot_sum, plot_ordered_mantissas,\
-    plot_mantissa_arc_test
+    plot_mantissa_arc_test, plot_roll_mse, plot_roll_mad
 from .reports import _inform_, _report_mad_, _report_summ_, _report_KS_,\
     _report_Z_, _report_chi2_, _report_test_, _deprecate_inform_,\
     _report_mantissa_
-from .stats import Z_score, chi_square, chi_square_2, KS, KS_2, \
-    mad, mse
+from .stats import Z_score, chi_square, chi_square_2, KS, KS_2
 
 class Base(DataFrame):
     """Internalizes and prepares the data for Analysis.
@@ -127,6 +126,7 @@ def __init__(self, base, digs, confidence, limit_N=None, sec_order=False):
         self.chi_square = chi_square_2(self)
         self.KS = KS_2(self)
         self.MAD = self.AbsDif.mean()
+        self.MSE = (self.AbsDif ** 2).mean()
         self.confidence = confidence
         self.digs = digs
         self.sec_order = sec_order
@@ -207,6 +207,7 @@ def __init__(self, base, test):
         self.index = self.index.astype(int)
         #: Mean Absolute Deviation for the test
         self.MAD = self.AbsDif.mean()
+        self.MSE = (self.AbsDif ** 2).mean()
         #: Confidence level to consider when setting some critical values
         self.confidence = None
         # (int): numerical representation of the test at hand 
@@ -636,11 +637,13 @@ def first_digits(self, digs, confidence=None, high_Z='pos',
 
         # Mean absolute difference
         if MAD:
-            self.MAD = mad(df, test=digs, verbose=self.verbose)
+            self.MAD = df.AbsDif.mean()
+            if self.verbose:
+                _report_mad_(digs, self.MAD)
 
         # Mean Square Error
         if MSE:
-            self.MSE = mse(df, verbose=self.verbose)
+            self.MSE = (df.AbsDif ** 2).mean()
 
         # Chi-square statistic
         if chi_square:
@@ -697,8 +700,8 @@ def second_digit(self, confidence=None, high_Z='pos',
 
         conf = confs[confidence]
 
-        temp = self.loc[self.ZN >= 10]
-        temp['SD'] = (temp.ZN // 10**((log10(temp.ZN)).astype(
+        temp = self.loc[self.ZN >= 10, :]
+        temp['SD'] = (temp.ZN // 10 ** ((log10(temp.ZN)).astype(
                       int) - 1)) % 10
 
         if simple:
@@ -718,11 +721,12 @@ def second_digit(self, confidence=None, high_Z='pos',
 
         # Mean absolute difference
         if MAD:
-            self.MAD = mad(df, test=22, verbose=self.verbose)
-
+            self.MAD = df.AbsDif.mean()
+            if self.verbose:
+                _report_mad_(22, self.MAD)  # 22: Second Digit test code
         # Mean Square Error
         if MSE:
-            self.MSE = mse(df, verbose=self.verbose)
+            self.MSE = (df.AbsDif ** 2).mean()
 
         # Chi-square statistic
         if chi_square:
@@ -794,11 +798,12 @@ def last_two_digits(self, confidence=None, high_Z='pos',
 
         # Mean absolute difference
         if MAD:
-            self.MAD = mad(df, test=-2, verbose=self.verbose)
-
+            self.MAD = df.AbsDif.mean()
+            if self.verbose:
+                _report_mad_(-2, self.MAD)
         # Mean Square Error
         if MSE:
-            self.MSE = mse(df, verbose=self.verbose)
+            self.MSE = (df.AbsDif ** 2).mean()
 
         # Chi-square statistic
         if chi_square:
@@ -977,7 +982,7 @@ def arc_test(self, grid=True, figsize=12):
         plot_mantissa_arc_test(self, stats['gravity_center'], figsize=figsize)
 
 
-class Roll_mad(Series):
+class Roll_mad(object):
     """Applies the MAD to sequential subsets of the Series, returning another
     Series.
 
@@ -1000,37 +1005,29 @@ class Roll_mad(Series):
 
     def __init__(self, data, test, window, decimals=2, sign='all'):
 
-        test = _check_test_(test)
+        #: the test (F1D, SD, F2D...) used for the MAD calculation and critical values
+        self.test = _check_test_(test)
 
         if not isinstance(data, Source):
             start = Source(data, sign=sign, decimals=decimals, verbose=False)
 
-        Exp, ind = prep_to_roll(start, test)
+        Exp, ind = prep_to_roll(start, self.test)
 
-        Series.__init__(self, start[digs_dict[test]].rolling(
-            window=window).apply(mad_to_roll, args=(Exp, ind), raw=False))
-
-        self.dropna(inplace=True)
-        #: the test (F1D, SD, F2D...) used for the MAD calculation and critical values
-        self.test = test
+        self.roll_series = start[digs_dict[self.test]].rolling(
+                                window=window).apply(mad_to_roll, 
+                                    args=(Exp, ind), raw=False)
+        self.roll_series.dropna(inplace=True)
 
     def show_plot(self, figsize=(15, 8)):
         """Shows the rolling MAD plot
         
         Args:
             figsize: the figure dimensions.
         """
-        fig, ax = plt.subplots(figsize=figsize)
-        ax.set_facecolor(colors['b'])
-        ax.plot(self, color=colors['m'])
-        if self.test != -2:
-            plt.axhline(y=mad_dict[self.test][0], color=colors['af'], linewidth=3)
-            plt.axhline(y=mad_dict[self.test][1], color=colors['h2'], linewidth=3)
-            plt.axhline(y=mad_dict[self.test][2], color=colors['s'], linewidth=3)
-        plt.show(block=False)
+        plot_roll_mad(self, figsize=figsize)
 
 
-class Roll_mse(Series):
+class Roll_mse(object):
     """Applies the MSE to sequential subsets of the Series, returning another
     Series.
 
@@ -1059,21 +1056,19 @@ def __init__(self, data, test, window, decimals=2, sign='all'):
 
         Exp, ind = prep_to_roll(start, test)
 
-        Series.__init__(self, start[digs_dict[test]].rolling(
-            window=window).apply(mse_to_roll, args=(Exp, ind), raw=False))
-
-        self.dropna(inplace=True)
+        self.roll_series = start[digs_dict[test]].rolling(
+                                window=window).apply(mse_to_roll, 
+                                    args=(Exp, ind), raw=False)
+        self.roll_series.dropna(inplace=True)
 
     def show_plot(self, figsize=(15, 8)):
         """Shows the rolling MSE plot
         
         Args:
             figsize: the figure dimensions.
         """
-        fig, ax = plt.subplots(figsize=figsize)
-        ax.set_facecolor(colors['b'])
-        ax.plot(self, color=colors['m'])
-        plt.show(block=False)
+        plot_roll_mse(self.roll_series, figsize=figsize)
+
 
 
 def first_digits(data, digs, decimals=2, sign='all', verbose=True,
@@ -1338,7 +1333,7 @@ def summation(data, digs=2, decimals=2, sign='all', top=20, verbose=True,
         return data
 
 
-def mad(data, test, decimals=2, sign='all'):
+def mad(data, test, decimals=2, sign='all', verbose=False):
     """Calculates the Mean Absolute Deviation of the Series
 
     Args:
@@ -1356,8 +1351,9 @@ def mad(data, test, decimals=2, sign='all'):
     Returns:
         float: the Mean Absolute Deviation of the Series
     """
-    _check_test_(test)
-    start = Source(data.values, sign=sign, decimals=decimals, verbose=False)
+    data = _check_num_array_(data)
+    test = _check_test_(test)
+    start = Source(data, sign=sign, decimals=decimals, verbose=verbose)
     if test in [1, 2, 3]:
         start.first_digits(digs=test, MAD=True, MSE=True, simple=True)
     elif test == 22:
@@ -1367,7 +1363,7 @@ def mad(data, test, decimals=2, sign='all'):
     return start.MAD
 
 
-def mse(data, test, decimals=2, sign='all'):
+def mse(data, test, decimals=2, sign='all', verbose=False):
     """Calculates the Mean Squared Error of the Series
 
     Args:
@@ -1385,8 +1381,9 @@ def mse(data, test, decimals=2, sign='all'):
     Returns:
         float: the Mean Squared Error of the Series
     """
+    data = _check_num_array_(data)
     test = _check_test_(test)
-    start = Source(data, sign=sign, decimals=decimals, verbose=False)
+    start = Source(data, sign=sign, decimals=decimals, verbose=verbose)
     if test in [1, 2, 3]:
         start.first_digits(digs=test, MAD=False, MSE=True, simple=True)
     elif test == 22:
@@ -1396,7 +1393,7 @@ def mse(data, test, decimals=2, sign='all'):
     return start.MSE
 
 
-def mad_summ(data, test, decimals=2, sign='all'):
+def mad_summ(data, test, decimals=2, sign='all', verbose=False):
     """Calculate the Mean Absolute Deviation of the Summation Test
 
     Args:
@@ -1415,9 +1412,10 @@ def mad_summ(data, test, decimals=2, sign='all'):
     Returns:
         float: the Mean Absolute Deviation of the Summation Test
     """
-    _check_digs_(test)
+    data = _check_num_array_(data)
+    test = _check_digs_(test)
 
-    start = Source(data, sign=sign, decimals=decimals, verbose=False)
+    start = Source(data, sign=sign, decimals=decimals, verbose=verbose)
     temp = start.loc[start.ZN >= 10 ** (test - 1)]
     temp[digs_dict[test]] = (temp.ZN // 10 ** ((log10(temp.ZN).astype(
                                                 int)) - (test - 1))).astype(
@@ -1450,11 +1448,11 @@ def rolling_mad(data, test, window, decimals=2, sign='all', show_plot=False):
     Returns:
         Series with sequentially computed MADs.
     """
-    test = _check_test_(test)
+    data = _check_num_array_(data)
     r_mad = Roll_mad(data, test, window, decimals, sign)
     if show_plot:
-        r_mad.show_plot(test)
-    return r_mad
+        r_mad.show_plot()
+    return r_mad.roll_series
 
 
 def rolling_mse(data, test, window, decimals=2, sign='all', show_plot=False):
@@ -1479,10 +1477,11 @@ def rolling_mse(data, test, window, decimals=2, sign='all', show_plot=False):
     Returns:
         Series with sequentially computed MSEs.
     """
+    data = _check_num_array_(data)
     r_mse = Roll_mse(data, test, window, decimals, sign)
     if show_plot:
         r_mse.show_plot()
-    return r_mse
+    return r_mse.roll_series
 
 
 def duplicates(data, top_Rep=20, verbose=True, inform=None):

diff --git a/benford/stats.py b/benford/stats.py
@@ -110,52 +110,3 @@ def KS_2(frame):
     ks_frame = frame.sort_index()[['Found', 'Expected']].cumsum()
     # finding the supremum - the largest cumul dist difference
     return ((ks_frame.Found - ks_frame.Expected).abs()).max()
-
-
-def mad(frame, test, verbose=True):
-    """Computes the Mean Absolute Deviation (MAD) between the found and the
-    expected proportions.
-
-    Args:
-        frame: DataFrame with the Absolute Deviations already calculated.
-        test: Test to compute the MAD from (F1D, SD, F2D...)
-        verbose: prints the MAD result and compares to limit values of
-            conformity. Defaults to True.
-    
-    Returns:
-        The Mean of the Absolute Deviations between the found and expected
-            proportions. 
-    """
-    mad = frame.AbsDif.mean()
-
-    if verbose:
-        print(f"\nThe Mean Absolute Deviation is {mad}")
-
-        if test != -2:
-            print(f"For the {mad_dict[digs_dict[test]]}:\n\
-            - 0.0000 to {mad_dict[test][0]}: Close Conformity\n\
-            - {mad_dict[test][0]} to {mad_dict[test][1]}: Acceptable Conformity\n\
-            - {mad_dict[test][1]} to {mad_dict[test][2]}: Marginally Acceptable Conformity\n\
-            - Above {mad_dict[test][2]}: Nonconformity")
-        else:
-            pass
-    return mad
-
-
-def mse(frame, verbose=True):
-    """Computes the test's Mean Square Error
-
-    Args:
-        frame: DataFrame with the already computed Absolute Deviations between
-            the found and expected proportions
-        verbose: Prints the MSE. Defaults to True.
-    
-    Returns:
-        Mean of the squared differences between the found and the expected proportions.
-    """
-    mse = (frame.AbsDif ** 2).mean()
-
-    if verbose:
-        print(f"\nMean Square Error = {mse}")
-
-    return mse
diff --git a/benford/utils.py b/benford/utils.py
@@ -130,8 +130,7 @@ def prep_to_roll(start, test):
 def mad_to_roll(arr, Exp, ind):
     """Mean Absolute Deviation used in the rolling function
     """
-    prop = Series(arr)
-    prop = prop.value_counts(normalize=True).sort_index()
+    prop = arr.value_counts(normalize=True).sort_index()
 
     if len(prop) < len(Exp):
         prop = prop.reindex(ind).fillna(0)
@@ -141,8 +140,7 @@ def mad_to_roll(arr, Exp, ind):
 def mse_to_roll(arr, Exp, ind):
     """Mean Squared Error used in the rolling function
     """
-    prop = Series(arr)
-    temp = prop.value_counts(normalize=True).sort_index()
+    temp = arr.value_counts(normalize=True).sort_index()
 
     if len(temp) < len(Exp):
         temp = temp.reindex(ind).fillna(0)

diff --git a/benford/viz.py b/benford/viz.py
@@ -1,7 +1,7 @@
 from numpy import array, arange, maximum, sqrt, ones
 import matplotlib.pyplot as plt
 from matplotlib.text import Annotation
-from .constants import colors
+from .constants import colors, mad_dict
 
 
 def plot_expected(df, digs):
@@ -194,3 +194,29 @@ def plot_mantissa_arc_test(df, gravity_center, grid=True, figsize=12):
     ax.legend(loc = 'lower left')
     ax.set_title("Mantissas Arc Test")
     plt.show(block=False);
+
+def plot_roll_mse(roll_series, figsize):
+    """Shows the rolling MSE plot
+    
+    Args:
+        figsize: the figure dimensions.
+    """
+    fig, ax = plt.subplots(figsize=figsize)
+    ax.set_facecolor(colors['b'])
+    ax.plot(roll_series, color=colors['m'])
+    plt.show(block=False)
+
+def plot_roll_mad(roll_mad, figsize):
+    """Shows the rolling MAD plot
+    
+    Args:
+        figsize: the figure dimensions.
+    """
+    fig, ax = plt.subplots(figsize=figsize)
+    ax.set_facecolor(colors['b'])
+    ax.plot(roll_mad.roll_series, color=colors['m'])
+    if roll_mad.test != -2:
+        plt.axhline(y=mad_dict[roll_mad.test][0], color=colors['af'], linewidth=3)
+        plt.axhline(y=mad_dict[roll_mad.test][1], color=colors['h2'], linewidth=3)
+        plt.axhline(y=mad_dict[roll_mad.test][2], color=colors['s'], linewidth=3)
+    plt.show(block=False)