## Load Packages

In [None]:
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


ModuleNotFoundError: No module named 'ace_tools'

## Set Configs

In [2]:
# Directory that holds the purchase-card CSV.
# Set the CREDIT_CARD_DATA_DIR environment variable to point at a different
# location; when it is unset the original local path is used.
# os.path.join(..., "") guarantees a trailing path separator, which matters
# because the load cell builds the file path by plain string concatenation.
data_dir = os.path.join(
    os.environ.get(
        "CREDIT_CARD_DATA_DIR",
        "/Users/mrla/Documents/Projects/data/credit_card/",
    ),
    "",
)

## Load Data

This dataset contains information on purchases made through the purchase card (P-Card) programs administered by the state and higher-education institutions. The purchase card data are updated monthly, after the end of each month; for example, July transactions are added in August.

The data can be downloaded from the [Purchase Card (PCard) Fiscal Year 2014 dataset page on data.ok.gov](https://data.ok.gov/dataset/purchase-card-pcard-fiscal-year-2014).

In [8]:
# Read the purchase-card transactions from the configured data directory
purchase_csv_path = data_dir + "purchase_credit_card.csv"
df = pd.read_csv(purchase_csv_path)

In [9]:
# Quick overview of the loaded frame: dimensions, column names and a preview.
# Each entry is printed on its own line, in this order.
overview_lines = (
    f"Shape of data: {df.shape[0]:,} rows, {df.shape[1]:,} columns",
    f"Columns in data: {df.columns.tolist()}",
    f"First 5 rows of data:\n{df.head()}",
)
for overview_line in overview_lines:
    print(overview_line)


Shape of data: 442,458 rows, 11 columns
Columns in data: ['Year-Month', 'Agency Number', 'Agency Name', 'Cardholder Last Name', 'Cardholder First Initial', 'Description', 'Amount', 'Vendor', 'Transaction Date', 'Posted Date', 'Merchant Category Code (MCC)']
First 5 rows of data:
   Year-Month  Agency Number                Agency Name Cardholder Last Name  \
0      201307           1000  OKLAHOMA STATE UNIVERSITY                Mason   
1      201307           1000  OKLAHOMA STATE UNIVERSITY                Mason   
2      201307           1000  OKLAHOMA STATE UNIVERSITY               Massey   
3      201307           1000  OKLAHOMA STATE UNIVERSITY               Massey   
4      201307           1000  OKLAHOMA STATE UNIVERSITY        Mauro-Herrera   

  Cardholder First Initial                   Description  Amount  \
0                        C              GENERAL PURCHASE  890.00   
1                        C                  ROOM CHARGES  368.96   
2                        J         

## Exploratory Data Analysis

### Total Amount per Year Month

In [36]:
def boxplot_stats(group):
    """Summarise a numeric Series with the statistics behind a box plot.

    Parameters
    ----------
    group : pd.Series
        Numeric values (e.g. the amounts belonging to one group).

    Returns
    -------
    pd.Series
        Ten entries whose labels carry a numeric prefix so they keep their
        order when sorted: count, min, Q1, median, Q3, max, IQR, the lower
        and upper 1.5 * IQR fences, and the number of values lying strictly
        outside those fences.
    """
    first_quartile = group.quantile(0.25)
    third_quartile = group.quantile(0.75)
    spread = third_quartile - first_quartile

    # Fences: values strictly beyond these are counted as outliers
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    outside_fences = group.lt(low_fence) | group.gt(high_fence)

    labels = [
        '01.count', '02.min', '03.Q1', '04.median', '05.Q3',
        '06.max', '07.IQR', '08.lower_bound', '09.upper_bound',
        '10.num_outliers',
    ]
    values = [
        group.count(), group.min(), first_quartile, group.median(),
        third_quartile, group.max(), spread, low_fence, high_fence,
        outside_fences.sum(),
    ]
    return pd.Series(dict(zip(labels, values)))

# Step 2: compute the box-plot statistics for every month, then reshape so
# that each statistic is a row and each month is a column.
# groupby(...).apply(...) returning a Series produces an unnamed inner index
# level, which reset_index() exposes as the 'level_1' column used below.
summary_df = df.groupby('Year-Month')['Amount'].apply(boxplot_stats).reset_index()
summary_pivot = summary_df.pivot(index='level_1', columns='Year-Month', values='Amount')

# Format column headers as YYYY-MM.
# The CSV stores 'Year-Month' as a YYYYMM integer (e.g. 201307), which has no
# .strftime(); only datetime-like labels do. Handle both so this cell works on
# a fresh kernel and also when the column has already been parsed to dates.
summary_pivot.columns = [
    col.strftime('%Y-%m') if hasattr(col, 'strftime')
    else pd.to_datetime(str(col), format='%Y%m').strftime('%Y-%m')
    for col in summary_pivot.columns
]

# Round for readability
summary_pivot = summary_pivot.round(2)

# Final result
display(summary_pivot)

Unnamed: 0_level_0,2013-07,2013-08,2013-09,2013-10,2013-11,2013-12,2014-01,2014-02,2014-03,2014-04,2014-05,2014-06
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
01.count,37635.0,39314.0,38762.0,40266.0,34275.0,26969.0,37230.0,35831.0,38188.0,39249.0,36784.0,37955.0
02.min,-38506.87,-18899.0,-33075.32,-7188.61,-7860.22,-3718.0,-27864.0,-34108.0,-42863.04,-10140.0,-4889.36,-21000.0
03.Q1,30.32,32.64,30.85,30.0,31.08,29.99,30.0,30.74,31.79,31.42,31.25,32.36
04.median,105.0,107.02,102.35,100.0,100.0,103.0,101.98,105.12,110.0,107.95,106.98,108.8
05.Q3,350.0,346.31,342.99,325.0,316.0,338.04,337.49,339.0,361.83,360.5,353.84,359.82
06.max,343148.5,1750379.98,1903858.37,1089180.0,335197.99,281185.0,306143.75,306165.68,855343.0,373150.26,348053.75,132790.14
07.IQR,319.68,313.67,312.14,295.0,284.92,308.05,307.49,308.26,330.04,329.08,322.59,327.45
08.lower_bound,-449.19,-437.86,-437.35,-412.5,-396.3,-432.08,-431.24,-431.64,-463.27,-462.2,-452.64,-458.81
09.upper_bound,829.51,816.81,811.19,767.5,743.38,800.12,798.73,801.38,856.89,854.12,837.73,850.99
10.num_outliers,3923.0,4137.0,4028.0,4210.0,3768.0,3006.0,3759.0,3556.0,3855.0,4032.0,3908.0,4148.0


In [21]:
# Aggregate monthly data: total spend and number of non-null amounts per month
monthly_summary = df.groupby('Year-Month').agg(
    total_amount=('Amount', 'sum'),
    transaction_count=('Amount', 'count')
).reset_index()

# x-axis values for both traces.
# The CSV stores 'Year-Month' as a YYYYMM integer (e.g. 201307). Plotted as a
# number, the axis would show a large jump between 201312 and 201401, so
# integer labels are converted to real dates. Any other dtype (for example an
# already-parsed datetime column) is passed through unchanged.
month_axis = monthly_summary['Year-Month']
if pd.api.types.is_integer_dtype(month_axis):
    month_axis = pd.to_datetime(month_axis.astype(str), format='%Y%m')

# Create the dual-axis chart
fig = go.Figure()

# Bars for total amount (primary, left y-axis)
fig.add_trace(go.Bar(
    x=month_axis,
    y=monthly_summary['total_amount'],
    name='Total Amount ($)',
    yaxis='y',
    marker_color='steelblue'
))

# Line for transaction count (secondary, right y-axis)
fig.add_trace(go.Scatter(
    x=month_axis,
    y=monthly_summary['transaction_count'],
    name='Transaction Count',
    yaxis='y2',
    mode='lines+markers',
    line=dict(color='darkorange')
))

# Layout with two y-axes; the secondary axis overlays the primary one and
# starts at zero with 10% headroom above the largest monthly count
fig.update_layout(
    title='Monthly Total Amount and Transaction Count',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Total Amount ($)', side='left'),
    yaxis2=dict(
        title='Transaction Count',
        overlaying='y',
        side='right',
        showgrid=False,
        range=[0, monthly_summary['transaction_count'].max() * 1.1]
    ),
    legend=dict(x=0.01, y=0.99),
    template='plotly_white'
)

fig.show()

### Amount per Agency

In [53]:
# Total spend per agency, ordered from smallest to largest total
agency_totals = (
    df.groupby(['Agency Number', 'Agency Name'])['Amount']
      .sum()
      .reset_index()
      .sort_values('Amount', ascending=True)
)

# Human-readable label: "Agency Number – Agency Name"
agency_number_text = agency_totals['Agency Number'].astype(str)
agency_totals['Agency Label'] = agency_number_text + ' – ' + agency_totals['Agency Name']

# With ascending order, the last 20 rows are the 20 largest totals
top_agencies = agency_totals.tail(20)

# Horizontal bars: agency labels on the y-axis, amounts on the x-axis
fig = px.bar(
    top_agencies,
    x='Amount',
    y='Agency Label',
    orientation='h',
    title='Top 20 Total Purchase Amount per Agency',
    labels={'Amount': 'Total Amount (USD)', 'Agency Label': 'Agency'},
    height=600
)

# Tilt and shrink the amount-axis tick labels; set explicit figure margins
fig.update_layout(
    xaxis=dict(tickangle=-45, tickfont=dict(size=10)),
    margin=dict(l=40, r=40, t=60, b=60),
)

fig.show()

### Amount per Description

In [56]:

# Total spend per transaction description, largest first
amount_by_description = df.groupby('Description')['Amount'].sum()
desc_totals = amount_by_description.reset_index().sort_values('Amount', ascending=False)

# Horizontal bar chart of the 20 highest totals:
# descriptions on the y-axis, amount on the x-axis
fig = px.bar(
    desc_totals.head(20),
    x='Amount',
    y='Description',
    orientation='h',
    title='Total Purchase Amount per Description',
    labels={'Amount': 'Total Amount (USD)', 'Description': 'Transaction Description'},
    template='plotly_white',
    height=800
)

# Reverse the y-axis so the largest bars appear at the top
fig.update_yaxes(autorange='reversed')
# Wide left margin for the description labels
fig.update_layout(margin=dict(l=300, r=40, t=60, b=40))

fig.show()

### Amount per Vendor

In [58]:
# Total spend per vendor, sorted so the highest totals come first
vendor_sums = df.groupby('Vendor')['Amount'].sum().reset_index()
vendor_totals = vendor_sums.sort_values('Amount', ascending=False)

# Keep the chart readable by plotting only the first 40 vendors
leading_vendors = vendor_totals.head(40)

# Horizontal bar chart: vendors on the y-axis, total amount on the x-axis
fig = px.bar(
    leading_vendors,
    x='Amount',
    y='Vendor',
    orientation='h',
    title='Total Purchase Amount per Vendor',
    labels={'Amount': 'Total Amount (USD)', 'Vendor': 'Vendor Name'},
    template='plotly_white',
    height=800
)

# Reverse the y-axis so the highest totals are at the top
fig.update_yaxes(autorange='reversed')
# Wide left margin for the vendor names
fig.update_layout(margin=dict(l=300, r=40, t=60, b=40))

fig.show()