## Data import

In [1]:
import pandas as pd

In [2]:
df_accounts_payable = pd.read_csv("case_auditing_data/case1_accountspayable.csv", sep=";", encoding="UTF-8")
df_accounts_payable

Unnamed: 0,Creditor_ID,Firm,Street,HouseNr.,City,Postcode,Country
0,1,Workflow GmbH,Berlinerstraße,12,Berlin,10115,Germany
1,2,Consulting_solutions,Elysees,11,Montord,33452,France
2,3,Consulting_regulations,Prugne,73,Cesset,24532,France
3,4,IT_Solutions,Maximilianstraße,11,Munich,80331,Germany
4,5,Governance GmbH,Frankstraße,75,Munich,80332,Germany
5,6,Business_Partnership,Rue de Grivats,60,Lyon,53522,France
6,7,Clera,Chemin,59,Paris,75000,France
7,8,Carbon,Hornerstraße,52,Hamburg,20090,Germany
8,9,Bon app',Gouzol,2,Beauron,67976,France
9,10,Bottom-Dollar Marketse,Robin,51,Lyon,53454,France


In [3]:
df_transactions = pd.read_csv("case_auditing_data/case1_transactions.csv", sep=";", decimal=",", encoding="UTF-8")

df_accounts_receivable = pd.read_csv("case_auditing_data/case1_accountsreceivable.csv", sep=";", encoding="UTF-8")

df_bank_accounts = pd.read_csv("case_auditing_data/case1_bankaccounts.csv", sep=";", encoding="UTF-8")

## Exploratory Data Analysis

In [4]:
type(df_accounts_payable)

pandas.core.frame.DataFrame

In [5]:
df_accounts_payable.head(5)

Unnamed: 0,Creditor_ID,Firm,Street,HouseNr.,City,Postcode,Country
0,1,Workflow GmbH,Berlinerstraße,12,Berlin,10115,Germany
1,2,Consulting_solutions,Elysees,11,Montord,33452,France
2,3,Consulting_regulations,Prugne,73,Cesset,24532,France
3,4,IT_Solutions,Maximilianstraße,11,Munich,80331,Germany
4,5,Governance GmbH,Frankstraße,75,Munich,80332,Germany


In [6]:
df_accounts_payable.describe()  # for a summary of the data set

Unnamed: 0,Creditor_ID,HouseNr.,Postcode
count,21.0,21.0,21.0
mean,11.0,40.095238,53375.380952
std,6.204837,25.125495,29396.611579
min,1.0,2.0,10115.0
25%,6.0,14.0,24532.0
50%,11.0,42.0,54677.0
75%,16.0,59.0,78654.0
max,21.0,83.0,94469.0


In [7]:
df_accounts_payable.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Creditor_ID  21 non-null     int64 
 1   Firm         21 non-null     object
 2   Street       21 non-null     object
 3   HouseNr.     21 non-null     int64 
 4   City         21 non-null     object
 5   Postcode     21 non-null     int64 
 6   Country      21 non-null     object
dtypes: int64(3), object(4)
memory usage: 1.3+ KB


In [8]:
df_accounts_payable.shape

(21, 7)

In [9]:
df_accounts_payable.columns

Index(['Creditor_ID', 'Firm', 'Street', 'HouseNr.', 'City', 'Postcode',
       'Country'],
      dtype='object')

In [10]:
df_transactions.Value

0        33.40
1       122.00
2        22.44
3       182.40
4       243.20
        ...   
102      22.44
103      17.56
104      49.99
105    1123.00
106      49.99
Name: Value, Length: 107, dtype: float64

In [11]:
df_bank_accounts

Unnamed: 0,Bank_ID,Creditor_ID,Bankname,CountryKey,VerifCode,BankKey,SectorCode,BankAccNr,CheckDigit
0,1,1,Sparkasse,DE,44,85708525,-,9092573071,-
1,2,1,Sparkasse,DE,27,34010441,-,1517137596,-
2,3,2,BGFI,FR,29,53954085,12914,93097,31
3,4,3,BGFI,FR,92,29603999,98110,90295,6
4,5,3,Banco Frances,FR,63,67849978,16309,91904,9
5,6,4,Hypovereinsbank,DE,68,26188756,-,4469843889,-
6,7,4,Raiffeisen,DE,33,58846743,-,3216797033,-
7,8,5,Sparkasse,DE,83,87524535,-,3682779983,-
8,9,6,Banco Frances,FR,91,18824792,98919,56212,32
9,10,6,Banco Frances,FR,22,13305079,79522,53775,43


In [12]:
df_transactions

Unnamed: 0,Booking_ID,Debitor_ID,Creditor_ID,Product_ID,AccNr_Debit,AccNr_Credit,Value,Date,Time,PaymentTarget,Paid,Reminders
0,1,3,-,8,1200,8000,33.40,01.01.2018,11:00,15.01.2018,no,1
1,2,5,-,4,1200,8000,122.00,01.01.2018,20:56,15.01.2018,yes,-
2,3,18,-,10,1200,8000,22.44,03.01.2018,17:31,17.01.2018,yes,-
3,4,9,-,5,1000,8000,182.40,04.01.2018,00:26,-,yes,-
4,5,18,-,5,1200,8000,243.20,04.01.2018,06:56,18.01.2018,yes,-
...,...,...,...,...,...,...,...,...,...,...,...,...
102,103,19,-,10,1200,8000,22.44,04.04.2018,22:39,18.04.2018,yes,-
103,104,6,-,2,1200,8000,17.56,05.04.2018,16:37,19.04.2018,yes,-
104,105,11,-,6,1000,8000,49.99,06.04.2018,00:08,-,yes,-
105,106,6,-,1,1200,8000,1123.00,07.04.2018,06:59,21.04.2018,yes,-


## Data Preparation

Data preparation is often the most complex process step in a BDA (Big Data Analytics) project, yet this step is skipped in many educational examples of digital data analysis. This section is intended to fill this gap and provide you with some basics on data preparation.

> By using brackets '[]', we are creating a subset.

### Conversion of data types

In [13]:
df_transactions["Date"].dtypes

dtype('O')

In [14]:
df_transactions["Date"]

0      01.01.2018
1      01.01.2018
2      03.01.2018
3      04.01.2018
4      04.01.2018
          ...    
102    04.04.2018
103    05.04.2018
104    06.04.2018
105    07.04.2018
106    08.04.2018
Name: Date, Length: 107, dtype: object

In [15]:
# the variable 'Date' must be converted to the class of the same name

df_transactions["Date"] = pd.to_datetime(df_transactions["Date"], format="%d.%m.%Y")
df_transactions["Date"]

0     2018-01-01
1     2018-01-01
2     2018-01-03
3     2018-01-04
4     2018-01-04
         ...    
102   2018-04-04
103   2018-04-05
104   2018-04-06
105   2018-04-07
106   2018-04-08
Name: Date, Length: 107, dtype: datetime64[ns]

In [16]:
# additional column to assign the day of the week for the date

df_transactions["DayOfTheWeek"] = df_transactions["Date"].dt.day_name() # this function converts a character string 14-03-2012 into a date
df_transactions["DayOfTheWeek"]

0         Monday
1         Monday
2      Wednesday
3       Thursday
4       Thursday
         ...    
102    Wednesday
103     Thursday
104       Friday
105     Saturday
106       Sunday
Name: DayOfTheWeek, Length: 107, dtype: object

In [17]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Booking_ID     107 non-null    int64         
 1   Debitor_ID     107 non-null    object        
 2   Creditor_ID    107 non-null    object        
 3   Product_ID     107 non-null    object        
 4   AccNr_Debit    107 non-null    int64         
 5   AccNr_Credit   107 non-null    int64         
 6   Value          107 non-null    float64       
 7   Date           107 non-null    datetime64[ns]
 8   Time           107 non-null    object        
 9   PaymentTarget  107 non-null    object        
 10  Paid           107 non-null    object        
 11  Reminders      107 non-null    object        
 12  DayOfTheWeek   107 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(8)
memory usage: 11.0+ KB


In [18]:
df_transactions["Reminders"] = df_transactions["Reminders"].replace("-", "0") # replacing a string by another string

In [19]:
df_transactions["Reminders"] = df_transactions["Reminders"].astype(str).astype(int)
df_transactions["Reminders"].head(3)

0    1
1    0
2    0
Name: Reminders, dtype: int64

### Filtering

In [53]:
df_transactions.AccNr_Debit == 1000 # boolean result

0      False
1      False
2      False
3       True
4      False
       ...  
102    False
103    False
104     True
105    False
106    False
Name: AccNr_Debit, Length: 107, dtype: bool

In [20]:
# postings that are not part of the cash account (number 1000) must be eliminated from the data set

df_cash_book = df_transactions[(df_transactions.AccNr_Debit == 1000) | (df_transactions.AccNr_Credit == 1000)]
df_cash_book

Unnamed: 0,Booking_ID,Debitor_ID,Creditor_ID,Product_ID,AccNr_Debit,AccNr_Credit,Value,Date,Time,PaymentTarget,Paid,Reminders,DayOfTheWeek
3,4,9,-,5,1000,8000,182.4,2018-01-04,00:26,-,yes,0,Thursday
5,6,-,11,-,5100,1000,193.33,2018-01-05,13:14,-,yes,0,Friday
6,7,13,-,5,1000,8000,364.8,2018-01-06,10:57,-,yes,0,Saturday
9,10,-,11,-,5100,1000,245.56,2018-01-07,15:21,-,yes,0,Sunday
13,14,18,-,1,1000,8000,1123.12,2018-01-09,22:23,-,yes,0,Tuesday
19,20,-,11,-,5100,1000,1500.0,2018-01-14,16:33,-,yes,0,Sunday
21,22,2,-,8,1000,8000,33.4,2018-01-16,22:46,-,yes,0,Tuesday
24,25,11,-,2,1000,8000,17.56,2018-01-18,03:32,-,yes,0,Thursday
33,34,10,-,9,1000,8000,3.8,2018-01-28,16:24,-,yes,0,Sunday
40,41,15,-,7,1000,8000,846.6,2018-02-04,15:04,-,yes,0,Sunday


In [27]:
# How many transactions were made between 22:00 and 04:00

df_transactions[(df_transactions.Time > "22:00") | (df_transactions.Time < "04:00")]

Unnamed: 0,Booking_ID,Debitor_ID,Creditor_ID,Product_ID,AccNr_Debit,AccNr_Credit,Value,Date,Time,PaymentTarget,Paid,Reminders,DayOfTheWeek
3,4,9,-,5,1000,8000,182.4,2018-01-04,00:26,-,yes,0,Thursday
8,9,-,18,-,3200,1600,3231.0,2018-01-06,23:06,20.01.2018,yes,0,Saturday
13,14,18,-,1,1000,8000,1123.12,2018-01-09,22:23,-,yes,0,Tuesday
21,22,2,-,8,1000,8000,33.4,2018-01-16,22:46,-,yes,0,Tuesday
24,25,11,-,2,1000,8000,17.56,2018-01-18,03:32,-,yes,0,Thursday
28,29,-,18,-,3200,1600,452.0,2018-01-22,00:18,05.02.2018,yes,0,Monday
29,30,-,3,-,3200,1600,1452.2,2018-01-24,03:00,07.02.2018,no,0,Wednesday
35,36,-,4,-,3200,1600,5312.0,2018-01-30,22:03,13.02.2018,yes,0,Tuesday
36,37,-,8,-,3200,1600,43.11,2018-01-31,22:06,-,yes,0,Wednesday
38,39,-,18,-,3200,1600,521.31,2018-02-02,00:53,16.02.2018,yes,0,Friday


### Merge data records

The central point here is the choice of column(s) according to which the two data records are linked (columns with same data).

In [21]:
df_accounts_receivable.columns

Index(['Creditor_ID', 'Firm', 'Street', 'HouseNr.', 'City', 'Postcode',
       'Country'],
      dtype='object')

In [22]:
df_bank_accounts.columns

Index(['Bank_ID', 'Creditor_ID', 'Bankname', 'CountryKey', 'VerifCode',
       'BankKey', 'SectorCode', 'BankAccNr', 'CheckDigit'],
      dtype='object')

In [23]:
df_accounts_receivable_banks = df_bank_accounts.merge(df_accounts_receivable, how="inner", left_on="Creditor_ID", right_on="Creditor_ID")

df_accounts_receivable_banks.head(3)

Unnamed: 0,Bank_ID,Creditor_ID,Bankname,CountryKey,VerifCode,BankKey,SectorCode,BankAccNr,CheckDigit,Firm,Street,HouseNr.,City,Postcode,Country
0,1,1,Sparkasse,DE,44,85708525,-,9092573071,-,Workflow GmbH,Berlinerstraße,12,Berlin,10115,Germany
1,2,1,Sparkasse,DE,27,34010441,-,1517137596,-,Workflow GmbH,Berlinerstraße,12,Berlin,10115,Germany
2,3,2,BGFI,FR,29,53954085,12914,93097,31,Consulting_solutions,Elysees,11,Montord,33452,France


# Modeling

This section examines the prepared data with regard to individual conspicuous transaction records. The evaluation steps introduced in subsection 'Theoretical foundations of the digital audit' are applied:

### Retrieving weekend bookings

In [24]:
# To select the booking records that were processed in the system on a Sunday, it is now sufficient to use the syntax 'df[]' introduced above

df_transactions_sundays = df_transactions[df_transactions["DayOfTheWeek"] == "Sunday"]
# If this logical query returns a 'TRUE', the corresponding line is included in df_transactions_sundays.

df_transactions_sundays # as a result a total of 15 of 107 bookings were made on a Sunday

Unnamed: 0,Booking_ID,Debitor_ID,Creditor_ID,Product_ID,AccNr_Debit,AccNr_Credit,Value,Date,Time,PaymentTarget,Paid,Reminders,DayOfTheWeek
9,10,-,11,-,5100,1000,245.56,2018-01-07,15:21,-,yes,0,Sunday
10,11,-,9,-,3200,1600,432.0,2018-01-07,04:13,21.01.2018,yes,0,Sunday
18,19,-,11,-,3200,1600,24541.0,2018-01-14,14:31,28.01.2018,no,0,Sunday
19,20,-,11,-,5100,1000,1500.0,2018-01-14,16:33,-,yes,0,Sunday
26,27,-,15,-,3200,1600,6743.0,2018-01-21,19:48,04.02.2018,yes,0,Sunday
33,34,10,-,9,1000,8000,3.8,2018-01-28,16:24,-,yes,0,Sunday
40,41,15,-,7,1000,8000,846.6,2018-02-04,15:04,-,yes,0,Sunday
55,56,2,-,8,1200,8000,33.4,2018-02-18,01:22,04.03.2018,yes,0,Sunday
62,63,17,-,7,1000,8000,507.96,2018-02-25,01:30,-,yes,0,Sunday
70,71,-,17,-,3200,1600,76443.6,2018-03-04,11:58,18.03.2018,yes,0,Sunday


### Cash minus inspection

In [25]:
# The following lines of code sum up the first five entries in df_transactions.Value

d = 0
for i in range(0,5):
    d = d + df_transactions.Value[i]
    print(d)

33.4
155.4
177.84
360.24
603.44


In [26]:
# alternatively

df_transactions.Value.cumsum()

0          33.40
1         155.40
2         177.84
3         360.24
4         603.44
         ...    
102    445945.44
103    445963.00
104    446012.99
105    447135.99
106    447185.98
Name: Value, Length: 107, dtype: float64

In [28]:
df_cash_book = df_cash_book.reset_index(drop=True) # because we want to iterate over the index without any breaks between them
df_cash_book

Unnamed: 0,Booking_ID,Debitor_ID,Creditor_ID,Product_ID,AccNr_Debit,AccNr_Credit,Value,Date,Time,PaymentTarget,Paid,Reminders,DayOfTheWeek
0,4,9,-,5,1000,8000,182.4,2018-01-04,00:26,-,yes,0,Thursday
1,6,-,11,-,5100,1000,193.33,2018-01-05,13:14,-,yes,0,Friday
2,7,13,-,5,1000,8000,364.8,2018-01-06,10:57,-,yes,0,Saturday
3,10,-,11,-,5100,1000,245.56,2018-01-07,15:21,-,yes,0,Sunday
4,14,18,-,1,1000,8000,1123.12,2018-01-09,22:23,-,yes,0,Tuesday
5,20,-,11,-,5100,1000,1500.0,2018-01-14,16:33,-,yes,0,Sunday
6,22,2,-,8,1000,8000,33.4,2018-01-16,22:46,-,yes,0,Tuesday
7,25,11,-,2,1000,8000,17.56,2018-01-18,03:32,-,yes,0,Thursday
8,34,10,-,9,1000,8000,3.8,2018-01-28,16:24,-,yes,0,Sunday
9,41,15,-,7,1000,8000,846.6,2018-02-04,15:04,-,yes,0,Sunday


In [29]:
c = 0

for i in range(0, len(df_cash_book)): # the final value of the iteration index 'i' is determined by the row dimension of the data set
    
    if df_cash_book.AccNr_Credit[i] == 1000:
        # If so, the corresponding posting value in the Value column is multiplied by -1, which ensures that a negative value is deducted when the total is added up
        df_cash_book.loc[i, "Value"] = df_cash_book.Value[i] * (-1) # loc[row, column]
    
    c = c + df_cash_book.Value[i]
    df_cash_book.loc[i, "cumulated"] = c # the results of these additions are stored in the just created column 'cumulated'
    
print("Cash register was ", sum(df_cash_book.cumulated < 0), "times in minus.")

Cash register was  5 times in minus.


In [30]:
# The negative entries in the column 'df_cash_book.cumulated' can be used to identify the posting records that have put the cash register in the red

df_cash_book[df_cash_book.cumulated < 0]

Unnamed: 0,Booking_ID,Debitor_ID,Creditor_ID,Product_ID,AccNr_Debit,AccNr_Credit,Value,Date,Time,PaymentTarget,Paid,Reminders,DayOfTheWeek,cumulated
1,6,-,11,-,5100,1000,-193.33,2018-01-05,13:14,-,yes,0,Friday,-10.93
5,20,-,11,-,5100,1000,-1500.0,2018-01-14,16:33,-,yes,0,Sunday,-268.57
6,22,2,-,8,1000,8000,33.4,2018-01-16,22:46,-,yes,0,Tuesday,-235.17
7,25,11,-,2,1000,8000,17.56,2018-01-18,03:32,-,yes,0,Thursday,-217.61
8,34,10,-,9,1000,8000,3.8,2018-01-28,16:24,-,yes,0,Sunday,-213.81


### Checking duplicates

In [31]:
# This function checks the column 'BankAccNr' for duplicate entries and assigns them to the vector 'BankAccNr_duplicates'

BankAccNr_duplicates = df_accounts_receivable_banks.loc[df_accounts_receivable_banks.BankAccNr.duplicated(), "BankAccNr"]
BankAccNr_duplicates

30    9092573071
Name: BankAccNr, dtype: int64

In [32]:
# the rows are selected from the data set whose account number matches an account number in the vector 'BankAccNr_duplicates'

df_accounts_receivable_banks["BankAccNr"].isin(BankAccNr_duplicates)

0      True
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30     True
Name: BankAccNr, dtype: bool

In [33]:
df_accounts_receivable_banks[df_accounts_receivable_banks["BankAccNr"].isin(BankAccNr_duplicates)]

Unnamed: 0,Bank_ID,Creditor_ID,Bankname,CountryKey,VerifCode,BankKey,SectorCode,BankAccNr,CheckDigit,Firm,Street,HouseNr.,City,Postcode,Country
0,1,1,Sparkasse,DE,44,85708525,-,9092573071,-,Workflow GmbH,Berlinerstraße,12,Berlin,10115,Germany
30,31,21,Sparkasse,DE,44,85708525,-,9092573071,-,Work-flow_GmbH,Berlinerstraße,12,Berlin,10115,Germany


The output shows that the two creditors are in reality one and the same, “Workflow GmbH” from Berlin; an underscore was once used in the company name but has since been removed.

In [34]:
# alternatively (indicating only the duplicate)

df_accounts_receivable_banks[df_accounts_receivable_banks["BankAccNr"].duplicated()]

Unnamed: 0,Bank_ID,Creditor_ID,Bankname,CountryKey,VerifCode,BankKey,SectorCode,BankAccNr,CheckDigit,Firm,Street,HouseNr.,City,Postcode,Country
30,31,21,Sparkasse,DE,44,85708525,-,9092573071,-,Work-flow_GmbH,Berlinerstraße,12,Berlin,10115,Germany


In [35]:
help(pd.Series.isin)

Help on function isin in module pandas.core.series:

isin(self, values) -> 'Series'
    Whether elements in Series are contained in `values`.
    
    Return a boolean Series showing whether each element in the Series
    matches an element in the passed sequence of `values` exactly.
    
    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.
    
    Returns
    -------
    Series
        Series of booleans indicating if each element is in values.
    
    Raises
    ------
    TypeError
      * If `values` is a string
    
    See Also
    --------
    DataFrame.isin : Equivalent method on DataFrame.
    
    Examples
    --------
    >>> s = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama',
    ...                'hippo'], name='animal')
    >>> s.isin(['cow', 'llama'])
    0     True
    1     True
    2  

### Checking the pre-decimal digits for equal distribution

> A *Chi-square test* can be employed to determine whether a data series/random variable sufficiently corresponds to a given distribution or whether there are deviations (caused, for example, by errors or manipulations).

If the booking values have arisen by chance without the intentional intervention of a human being, the distribution of the pre-decimal digits of the booking values should not systematically deviate from the equal distribution.

In [36]:
# Conversion float to integer

INT = df_transactions.Value.astype(int) # 'INT' ensures that the irrelevant decimal places are truncated and the last digit of the integer represents the variable to be examined
INT

0        33
1       122
2        22
3       182
4       243
       ... 
102      22
103      17
104      49
105    1123
106      49
Name: Value, Length: 107, dtype: int64

In [37]:
# filter pre-comma digit

DigitBeforeComma = pd.DataFrame(map(int, [str(x)[-1] for x in INT])) # The function 'str()[-1]' extracts a defined section from a string, here the last character [-1] from a string.
DigitBeforeComma.head()

Unnamed: 0,0
0,3
1,2
2,2
3,2
4,3


In [38]:
# Count absolute frequencies of the values (0, 1, 2, …, 9)

H_DigitBeforeComma = DigitBeforeComma.value_counts()
H_DigitBeforeComma

3    31
2    23
6    12
7    11
9    11
5     6
1     5
4     3
8     3
0     2
Name: count, dtype: int64

In [39]:
# Carry out Chi-square-test for an equal distribution
from scipy.stats import chisquare

chisquare(H_DigitBeforeComma)

Power_divergenceResult(statistic=76.08411214953271, pvalue=9.650328851395323e-13)

The output shows that the null hypothesis can be rejected at any significance level alpha > 9.65e−13, meaning that the pre-comma digits of the booking values follow a distribution significantly different from the equal distribution.

Consider booking values in 'df_transactions.Value'. Test the null hypothesis "H0: The distribution of the first digit after the decimal point follows an equal distribution" with a Chi square test.

In [40]:
# filter after-comma digit

DigitAfterComma = pd.DataFrame(int(x * 10) % 10 for x in df_transactions.Value)
DigitAfterComma.head(20)

Unnamed: 0,0
0,4
1,0
2,4
3,4
4,2
5,3
6,8
7,5
8,0
9,5


In [41]:
# Count absolute frequencies of the values (0, 1, 2, …, 9)

H_DigitAfterComma = DigitAfterComma.value_counts()
H_DigitAfterComma

0    41
4    18
9    14
5     9
8     8
2     7
1     3
3     3
6     3
7     1
Name: count, dtype: int64

In [42]:
chisquare(H_DigitAfterComma)

Power_divergenceResult(statistic=119.44859813084113, pvalue=1.732664729994546e-21)

### Check the leading digit for a Benford distribution

> Benford’s law describes the probability of occurrence of leading digits in numbers.

For randomly generated posting records, the distribution of the first digits of the posting values should deviate from the equal distribution in accordance with Benford’s law and the distribution according to Benford.

In [54]:
import math 
import numpy as np # a package containing arrays

# it creates an array from the values 1 up to 9, meaning that 10 is not included
numbers = np.arange(1,10)
numbers

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [55]:
# this creates a list

Probs = []

# 'append' adds everything within its parentheses to the list Prob
# given the value for distribution, 'math.log10' calculates the theoretic frequencies of getting 1, 2, ..., 9, e.g. the probability of getting a 1 is 0.30, the probability of getting a 2 is 0.17, and so on. 

for x in numbers: Probs.append(math.log10(1 + 1/x))

Probs

[0.3010299956639812,
 0.17609125905568124,
 0.12493873660829993,
 0.09691001300805642,
 0.07918124604762482,
 0.06694678963061322,
 0.05799194697768673,
 0.05115252244738129,
 0.04575749056067514]

In [56]:
np.arange(len(numbers)) # length = 9

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [57]:
import plotly.express as px

fig = px.histogram(x = numbers, y = Probs, nbins = 9, histnorm ="probability density",
                   category_orders=dict(day=["1", "2", "3", "4", "5", "6", "7", "8", "9"]))
fig.update_layout(bargap=0.1) # it adds a gap between the bins
fig

In [45]:
# The chi-square test can be used to verify whether the first digits of the booking records of the Benford distribution are sufficiently adjusted. The procedure is analogous to examining the first digits for equal distribution.

FirstDigit = pd.DataFrame(map(int, [str(x)[0] for x in INT]))
FirstDigit

Unnamed: 0,0
0,3
1,1
2,2
3,1
4,2
...,...
102,2
103,1
104,4
105,1


In [60]:
H_FirstDigit = pd.DataFrame(FirstDigit.value_counts().sort_index())
H_FirstDigit.loc[9,"count"] = 0 # manually add a 0 for the row 9 that has no observation (NaN)

H_FirstDigit

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
1,30.0
2,12.0
3,15.0
4,15.0
5,12.0
6,9.0
7,10.0
8,4.0
9,0.0


In [61]:
Probs = []

for x in numbers:
    Probs.append(math.log10(1 + 1/x)) # create the expected probabilities

for i in range(0, len(Probs)):
    Probs[i] = Probs[i] * len(FirstDigit)

Probs

[32.21020953604599,
 18.841764718957894,
 13.368444817088093,
 10.369371391862037,
 8.472393327095855,
 7.163306490475614,
 6.20513832661248,
 5.473319901869798,
 4.8960514899922405]

In [62]:
chisquare(f_obs=H_FirstDigit["count"], f_exp=Probs)

Power_divergenceResult(statistic=14.45619805497751, pvalue=0.07062274443468691)

The output shows that the null hypothesis cannot be rejected at a significance level of alpha=5%, since 0.05 < 0.07062. Thus, at the 5% significance level no deviation of the distribution of the first digit from the Benford distribution can be substantiated, though at the 10% significance level it can be.

Consider booking values in 'df_transactions.Value'. Test the null hypothesis "H0: The distribution of the first digit of transaction values greater than 100 follows a Benford distribution" with a Chi square test.

In [49]:
TransactionsFirstDigit = pd.DataFrame(int(x / pow(10, math.floor(math.log10(x)))) for x in df_transactions.Value[df_transactions.Value > 100])
TransactionsFirstDigit.head()

Unnamed: 0,0
0,1
1,1
2,2
3,1
4,3


In [50]:
H_TransactionsFirstDigit = pd.DataFrame(TransactionsFirstDigit.value_counts().sort_index())
H_TransactionsFirstDigit.loc[9,] = 0

H_TransactionsFirstDigit

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
1,22.0
2,6.0
3,4.0
4,4.0
5,12.0
6,8.0
7,2.0
8,4.0
9,0.0


In [51]:
Probs = []

for x in numbers:
    Probs.append(math.log10(1 + 1/x))
for i in range(0, len(Probs)):
    Probs[i] = Probs[i] * len(TransactionsFirstDigit)

Probs

[18.663859731166834,
 10.917658061452236,
 7.746201669714596,
 6.008420806499498,
 4.909237254952739,
 4.15070095709802,
 3.595500712616577,
 3.17145639173764,
 2.8369644147618587]

In [52]:
chisquare(f_obs=H_TransactionsFirstDigit["count"], f_exp=Probs)

Power_divergenceResult(statistic=22.867382781613838, pvalue=0.0035387498663053355)