### Examining payments (within the vendor payments dataset) of the same amount, to the same vendor, on the same day, with the same commitment item name and the same department description.

In [70]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as ticker
import seaborn as sns
import datetime
import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# plt.rcParams.update(plt.rcParamsDefault)

In [None]:
# import csv data: read every cleaned CSV and stack them into one frame
# sorted() makes the row order independent of the order glob happens to return
files = sorted(glob.glob('data/cleaned_data/*.csv'))

# force the three text columns to be read as strings
text_columns = {'commitmt_item_name': 'str', 'dept._descrptn': 'str', 'vendor_name': 'str'}

# sort=True orders the columns alphabetically; ignore_index=True gives the combined
# frame a fresh 0..n-1 index instead of repeating each file's own row labels
payments = pd.concat([pd.read_csv(f, dtype=text_columns) for f in files],
                     sort=True, ignore_index=True)

In [None]:
# examine data types that have been imported
# (the three text columns were forced to str when the CSVs were read)
payments.dtypes

In [None]:
# preview the first rows of the combined payments data
payments.head()

In [None]:
# convert check/payment date into datetime, it was imported as an object
# assign by column name rather than `.iloc[:, 0] = ...`: on recent pandas a positional
# write is attempted in place and can leave the column as object dtype, which would
# break the `.dt` accessor used on 'check/payment_date' in later cells
payments['check/payment_date'] = pd.to_datetime(payments['check/payment_date'])

In [None]:
# examine date range of data: earliest payment date on the first line, latest on the second
print(payments['check/payment_date'].min(),
      payments['check/payment_date'].max(),
      sep='\n')

In [None]:
# create Day_of_Year column to perform time series analysis over the year
payments['day_of_year'] = payments['check/payment_date'].dt.dayofyear

# create Year column to perform annual time series analysis
payments['year'] = payments['check/payment_date'].dt.year

# create Year/Month column as a 'YYYY.MM' string (month zero-padded to two digits)
# the vectorised .dt.strftime replaces a row-by-row lambda; a missing date stays
# missing here instead of turning into the literal text 'nan.nan'
payments['year_month'] = payments['check/payment_date'].dt.strftime('%Y.%m')

In [None]:
# preview the frame again, now including the day_of_year, year and year_month columns
payments.head()

### Find identical payments that are larger than 5 dollars and smaller than 1,000 dollars.  My reasoning is that if fraud is taking place, it is probably within this range, because amounts greater than 1,000 dollars often require many levels of approval.

In [None]:
# keep only payments above $5, then count how many rows share the same
# date / commitment item / department / amount / vendor combination;
# after reset_index() the count sits in a column named 0
identical_pay_group = (
    payments
    .loc[payments['invoice_net_amt'].gt(5)]
    .groupby(['check/payment_date', 'commitmt_item_name', 'dept._descrptn',
              'invoice_net_amt', 'vendor_name'])
    .size()
    .reset_index()
)

In [None]:
# how many groups contain more than 20 identical payments (same amount, same day)
# with an amount under $1,000? (a few extreme outliers were proving troublesome)
identical_pay_group.loc[
    identical_pay_group[0].gt(20) & identical_pay_group['invoice_net_amt'].lt(1000)
].shape

In [None]:
# preview the grouped counts; the column named 0 holds the size of each group
identical_pay_group.head()

In [None]:
# histogram of the repeated payment amount, for groups of more than 20
# identical payments with an amount under $1,000
fig, ax = plt.subplots(figsize=(19, 5))
identical_pay_group.loc[
    identical_pay_group[0].gt(20) & identical_pay_group['invoice_net_amt'].lt(1000),
    'invoice_net_amt',
].hist(bins=100, ax=ax)

In [None]:
plt.figure(figsize=(15, 8))
# other styles tried: 'ggplot', 'seaborn-talk'
plt.style.use('bmh')

# groups of more than 20 identical payments whose amount is under $1,000
frequent_small_groups = identical_pay_group[(identical_pay_group[0] > 20)
                                            & (identical_pay_group['invoice_net_amt'] < 1000)]

# x axis: number of identical payments in the group (column 0)
# y axis: the repeated payment amount
plt.scatter(frequent_small_groups[0],
            frequent_small_groups['invoice_net_amt'],
            color='maroon',
            s=250,
            alpha=.1)

plt.xlabel("Count of Payments")
plt.ylabel("Payment Amount($)")

# plt.savefig("visualizations/identical_payments_less_than_1000.png")

In [None]:
plt.figure(figsize=(15, 8))
plt.style.use('bmh')

# same selection as the full scatter: more than 20 identical payments, amount under $1,000
frequent_small_groups = identical_pay_group[(identical_pay_group[0] > 20)
                                            & (identical_pay_group['invoice_net_amt'] < 1000)]

# x axis: number of identical payments in the group (column 0)
# y axis: the repeated payment amount
plt.scatter(frequent_small_groups[0],
            frequent_small_groups['invoice_net_amt'],
            color='maroon',
            s=250,
            alpha=.1)

plt.xlabel("Count of Payments")
plt.ylabel("Payment Amount($)")

# zoom in: amounts up to $450 and group sizes from 20 to 75
plt.ylim(0,450)
plt.xlim(20,75)

# plt.savefig("visualizations/identical_payments_less_than_1000_reduced_$450.png")

### For the top 30 vendors with duplicate payments between 5 and 1000 dollars, how are the duplicate payments distributed?

In [None]:
# rank vendors by how many groups of more than 10 identical payments (amount under
# $1,000) they appear in, and keep the 30 vendors with the most such groups
top_30 = (
    identical_pay_group
    .loc[identical_pay_group[0].gt(10) & identical_pay_group['invoice_net_amt'].lt(1000),
         'vendor_name']
    .value_counts()
    .head(30)
)

In [None]:
# the index of the value_counts result holds the 30 vendor names
top_30.index

In [None]:
# preview the groups belonging to the top 30 vendors: more than 10 identical
# payments per group, with an amount greater than 5 and less than 1000
identical_pay_group.loc[
    identical_pay_group[0].gt(10)
    & identical_pay_group['invoice_net_amt'].lt(1000)
    & identical_pay_group['vendor_name'].isin(top_30.index)
].head()

In [None]:
plt.figure(figsize=(15, 15))

plt.style.use('bmh')
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and the
# old names were later removed, so use whichever spelling this installation provides
talk_style = 'seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available else 'seaborn-talk'
plt.style.use(talk_style)

# groups of more than 10 identical payments under $1,000 that belong to a top-30 vendor
# (built once and reused for both axes)
top_30_groups = identical_pay_group[(identical_pay_group[0] > 10)
                                    & (identical_pay_group['invoice_net_amt'] < 1000)
                                    & (identical_pay_group['vendor_name'].isin(top_30.index))]

# graph payment amount on x axis
# graph vendor name on y axis (one row of dots per vendor)
plt.scatter(top_30_groups['invoice_net_amt'],
            top_30_groups['vendor_name'],
            alpha=.05,
            s=500,
            color='darkblue')

plt.xlabel("Payment Amount")
plt.ylabel("Vendor Name")
plt.tight_layout()
# plt.savefig("visualizations/identical_payments_less_than_1000_top_30_vendors.png")

### Find the top 5 vendors, how are their payments distributed?

In [None]:
# the 5 vendors that appear in the most groups of more than 20 identical
# payments under $1,000, together with how many such groups each one has
identical_pay_group.loc[
    identical_pay_group[0].gt(20) & identical_pay_group['invoice_net_amt'].lt(1000),
    'vendor_name',
].value_counts().head(5)

In [None]:
# same ranking as the previous cell, keeping only the 5 vendor names (the index)
identical_pay_group.loc[
    identical_pay_group[0].gt(20) & identical_pay_group['invoice_net_amt'].lt(1000),
    'vendor_name',
].value_counts().index[:5]

In [None]:
# names of the 5 vendors with the most groups of more than 20 identical payments under $1,000
top_5_vendor_names = (
    identical_pay_group
    .loc[(identical_pay_group[0] > 20) & (identical_pay_group['invoice_net_amt'] < 1000),
         'vendor_name']
    .value_counts()
    .index[:5]
)

# every payment (not only the identical ones) made to those 5 vendors
id_payments_top_vendors = payments.loc[payments['vendor_name'].isin(top_5_vendor_names)]

In [None]:
# how many rows (and columns) are in the subset of payments to the top 5 vendors?
id_payments_top_vendors.shape

### Use dot plots to show the dispersion of all payments for the top 5 vendors with identical payments

In [None]:
# one dot plot per vendor: (vendor name, marker colour)
# a colour of None lets matplotlib choose its default colour, as in the first panel
vendor_colours = [
    ('CREATIVE COMMUNICATIONS', None),
    ('WESTERN TOWING', 'red'),
    ('CITY WIDE PEST CONTROL', 'green'),
    ('EUROFINS EATON ANALYTICAL INC', 'brown'),
    ('COMMUNITY TIRE PROS & AUTO REPAIR', 'orange'),
]

fig, ax = plt.subplots(nrows=len(vendor_colours), ncols=1, figsize=(15, 17))

for axis, (vendor, colour) in zip(ax.flat, vendor_colours):
    # all payment amounts for this vendor
    amounts = payments.loc[payments['vendor_name'] == vendor, 'invoice_net_amt']

    # x is the payment amount; y is a constant 1 so every payment sits on one
    # horizontal line and overlapping dots show where amounts repeat
    axis.scatter(amounts, np.ones(len(amounts)), c=colour, alpha=.1, s=200)
    axis.set_title(vendor)

    # remove tick marks on the y-axis
    axis.yaxis.set_major_locator(plt.NullLocator())

plt.xlabel("Payment Amount($)")
plt.tight_layout()

# plt.savefig("visualizations/identical_payments_top_5_vendors_distributions.png")

### Graph all payments over time to the top 5 vendors who were paid identical payments.  Limit to only payments under 1,000.

In [None]:
plt.figure(figsize=(15, 8))
plt.style.use('bmh')

# select columns by name instead of by position so the plot does not depend on the
# column order of the CSV files; the names match the axis labels set below
ax = sns.scatterplot(data=id_payments_top_vendors,
                     x='year_month',
                     y='invoice_net_amt',
                     hue='vendor_name',
                     alpha=.1,
                     s=50)

plt.xlabel("Year/Month")
plt.ylabel("Payment Amount($)")

# show only every 5th year/month label on the x axis
ax.xaxis.set_major_locator(ticker.MultipleLocator(5))

plt.ylim(0, 1000)

# NOTE(review): some seaborn versions put the hue variable name in the legend as a
# first "subtitle" entry and others do not -- confirm against the installed version.
# Dropping that entry by label, rather than slicing off the first handle, keeps every
# vendor in the legend in both cases.
handles, labels = ax.get_legend_handles_labels()
vendor_entries = [(handle, label) for handle, label in zip(handles, labels)
                  if label != 'vendor_name']
ax.legend(handles=[handle for handle, _ in vendor_entries],
          labels=[label for _, label in vendor_entries],
          loc='upper center', bbox_to_anchor=(0.5, -0.10), ncol=5)

plt.tight_layout()
# plt.savefig("visualizations/identical_payments_top_5_vendors_all_payments_graph_under_$1000.png")

### Graph count of identical payments under 1,000 against the payment amount - limit to only the top 5 vendors with identical payments.

In [None]:
# groups of more than 20 identical payments whose amount is under $1,000
id_payments_less_1000 = identical_pay_group.loc[
    identical_pay_group[0].gt(20) & identical_pay_group['invoice_net_amt'].lt(1000)
]

In [None]:
# preview the filtered groups (count over 20, amount under 1,000)
id_payments_less_1000.head()

In [None]:
# names of the 5 vendors with the most groups of more than 20 identical payments under $1,000
top_5_vendor_names = (
    identical_pay_group
    .loc[(identical_pay_group[0] > 20) & (identical_pay_group['invoice_net_amt'] < 1000),
         'vendor_name']
    .value_counts()
    .index[:5]
)

# restrict the filtered groups to those 5 vendors
top_5_id_payments_less_1000 = id_payments_less_1000.loc[
    id_payments_less_1000['vendor_name'].isin(top_5_vendor_names)
]

In [None]:
# preview the filtered groups that belong to the top 5 vendors
top_5_id_payments_less_1000.head()

In [None]:
# how many of the filtered groups belong to the top 5 vendors?
id_payments_less_1000.loc[
    id_payments_less_1000['vendor_name'].isin(
        identical_pay_group
        .loc[identical_pay_group[0].gt(20) & identical_pay_group['invoice_net_amt'].lt(1000),
             'vendor_name']
        .value_counts()
        .index[:5]
    )
].shape

In [None]:
plt.figure(figsize=(15, 8))

# x: number of identical payments in the group (column 0)
# y: the repeated payment amount
# colour: vendor
sns.scatterplot(x=top_5_id_payments_less_1000[0],
                y=top_5_id_payments_less_1000['invoice_net_amt'],
                hue=top_5_id_payments_less_1000['vendor_name'],
                s=200,
                alpha=.3)

plt.xlabel("Count of Payments")
plt.ylabel("Payment Amount($)")

plt.tight_layout()
# plt.savefig("visualizations/identical_payments_top_5_vendors_graph_under_$1000.png")

#### next steps
~~exclude invoices with leading digit of 0~~
1. which departments are most associated with duplicate payments? uniform cleaning?
1. are certain times of the year associated with high volumes of payments to certain vendors?
1. for vendors with more than 50 payments of the same amount on the same day . . .
    - do the leading digits correspond to Benford's law?
    - do they provide similar goods/services?
1. for vendors with a high number of negative payments . . .
    - year over year, is the volume of negative payments about the same?
    - negative payments as a percentage of total payments
1. vendors with payments at specific times of the year(similar to what I did with the City of Scottsdale vendor payments).
1. vendors with payments within a tight range(essentially a low variance in payment amount).
1. group by dept. description and commitment item name, within this combination
which vendors make up the largest share of the spending?
1. 