## **Data Exploration and Visualization**

### **Read data from the SQLite database**

In [5]:
# import libraries
import pandas as pd
import sqlite3 as sq
import numpy as np
import altair as alt
from datetime import datetime

In [2]:
# Read all travel expenses from the SQLite database file into a DataFrame,
# parsing the 'date' column as datetimes.
cnx = sq.connect("data/EXPENSES.db")
try:
    # parse_dates is documented as a list (or dict) of column names; a bare
    # string only works by accident through substring matching.
    df = pd.read_sql_query("SELECT * FROM travel_expenses",
                           cnx, parse_dates=['date'])
finally:
    # The data now lives in the DataFrame, so release the database handle.
    cnx.close()
df.head()


Unnamed: 0,date,category,subcategory,amount,account,payment_type,lat,lng,place,country
0,2022-08-31 14:33:25,Food and Drinks,Coffee,1.4,Hanseatic Visa,Credit card,12.923556,100.882455,Pattaya,Thailand
1,2022-08-31 14:33:25,Food and Drinks,Groceries,5.53,Hanseatic Visa,Credit card,12.923556,100.882455,Pattaya,Thailand
2,2022-08-31 14:02:09,Transportation,Public transport,0.55,Thai Baht cash,Cash,12.923556,100.882455,Pattaya,Thailand
3,2022-08-31 12:21:11,Transportation,Public transport,0.27,Thai Baht cash,Cash,12.923556,100.882455,Pattaya,Thailand
4,2022-08-31 11:12:21,Food and Drinks,Coffee,3.3,Thai Baht cash,Cash,12.923556,100.882455,Pattaya,Thailand


### **Handle Outliers**  
Plot the data distribution and see if there are outliers.

In [None]:
# Box plot of the expense amounts per category, used to spot outliers.
# The x channel must be built with alt.X: an alt.Y object passed to the `x`
# keyword only renders because both classes serialize to the same field
# definition, and it misleads the reader about which axis is configured.
alt.Chart(df).mark_boxplot(size=50, extent=0.5, outliers={'size': 5}).encode(
    y='category:N',
    x=alt.X('amount:Q', scale=alt.Scale(zero=False))
).properties(height=250).interactive()


In [None]:
# Create a new DataFrame without significant outliers: keep only the rows
# whose amount is strictly below the 99th percentile of all amounts.
q = df["amount"].quantile(0.99)
df_no_outl = df[df["amount"] < q]

# Same box plot as for the full data, now on the trimmed frame.
# The x channel is built with alt.X (not alt.Y) so the channel class matches
# the axis it is assigned to.
alt.Chart(df_no_outl).mark_boxplot(size=50, extent=0.5, outliers={'size': 5}).encode(
    y='category:N',
    x=alt.X('amount:Q', scale=alt.Scale(zero=False))
).properties(height=300).interactive()

### **Grouping, Aggregation, and Visualizing the data**  

#### **Calculate totals per category and place**

Total amount spent per category and per place

In [67]:
# Horizontal bar charts of the summed amounts, largest total first:
# one chart grouped by category, one grouped by place.
total_cat = (
    alt.Chart(df)
    .mark_bar(cornerRadius=1)
    .encode(
        y=alt.Y('category', sort='-x', title='Category'),
        x=alt.X('sum(amount)', title='Total Amount'),
    )
    .properties(height=300)
)

total_place = (
    alt.Chart(df)
    .mark_bar(cornerRadius=1)
    .encode(
        y=alt.Y('place', sort='-x', title='Place'),
        x=alt.X('sum(amount)', title='Total Amount'),
    )
)

# show both charts side by side
total_cat | total_place


Average expense value per place and day

In [3]:
# Aggregate per place: number of distinct days, total amount, and the
# rounded mean amount per day.

# Number of distinct calendar dates with at least one record, per place.
place_count_days = (
    df.groupby(['place'])['date']
    .apply(lambda dates: dates.dt.date.nunique())
    .reset_index()
)

# Total amount per place.
sum_place = df.groupby('place')['amount'].sum().reset_index()

# One row per place. rename() returns a new frame (no inplace mutation), so
# re-running this cell always gives the same result.
place_sum = (
    pd.merge(place_count_days, sum_place, on='place')
    .rename(columns={'date': 'days_cnt'})
)

# Mean amount per day spent at the place, rounded to whole units.
place_sum['place_avg_day'] = (
    place_sum['amount'] / place_sum['days_cnt']).round()


In [4]:
# preview the per-place summary: day count, total amount, rounded mean per day
place_sum.head()

Unnamed: 0,place,days_cnt,amount,place_avg_day
0,Bangkok,159,9860.53,62.0
1,Chiang Mai,10,428.64,43.0
2,Hua Hin,9,612.56,68.0
3,Kanchanaburi,1,30.17,30.0
4,Khao Sok,3,135.12,45.0


In [10]:
# Left chart: days spent per place, restricted to the places with the most
# days (window rank below 10). Places with more than 50 days are highlighted.
chart1 = alt.Chart(place_sum).mark_bar().encode(
    alt.Y('place', sort='-x', title='Place'),
    alt.X('days_cnt', title='Days spent'),
    color=alt.condition(
        alt.datum.days_cnt > 50,
        alt.value('orange'),
        alt.value('steelblue')
    )
).transform_window(
    rank='rank(days_cnt)',
    sort=[alt.SortField('days_cnt', order='descending')]
).transform_filter(  # the method is transform_filter; `.transform.filter` raises AttributeError
    (alt.datum.rank < 10)
)

# NOTE(review): the original condition had no comparison and no colours, so
# the cell could not run. Highlighting places whose daily mean is above the
# mean across all places is an assumption - confirm the intended threshold.
# float() keeps a plain Python number in the generated Vega expression.
avg_day_threshold = float(place_sum['place_avg_day'].mean())

# Right chart: mean expense value per day for every place.
chart2 = alt.Chart(place_sum).mark_bar().encode(
    alt.Y('place', sort='-x', axis=alt.Axis(title=None)),
    alt.X('place_avg_day', title='Mean Expense Value per Day'),
    color=alt.condition(
        alt.datum.place_avg_day > avg_day_threshold,
        alt.value('orange'),
        alt.value('steelblue')
    )
)

# show both charts side by side
chart1 | chart2

In [None]:
    alt.Chart(place_sum).mark_bar().encode(
    x='sum(yield):Q',
    y='year:O',
    color='year:N',
    row='site:N'

Monthly totals and budget

In [70]:
# Budget drawn as a reference line on every monthly bar chart. Kept as a
# named constant so it can be changed in one place.
MONTHLY_BUDGET = 1200

# Total amount per month (one bar per month).
bars = alt.Chart(df).mark_bar().encode(
    x='sum(amount):Q',
    y='month(date):O',
    # color='category:N',
)

# Vertical red rule at the budget value; its data comes from the layer below.
budget = alt.Chart().mark_rule(color='red', size=2).encode(
    x='Budget:Q'
)

# Value label next to each bar.
text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='sum(amount):Q'
)

# Layer bars, budget rule and labels on shared data, add the constant Budget
# field the rule reads, and draw one row of charts per year.
alt.layer(
    bars,
    budget,
    text,
    data=df
).transform_calculate(Budget=str(MONTHLY_BUDGET)
                      ).facet(row='year(date)')


Number of records per category & amount

In [None]:
# Heatmap of record counts: binned amount on x, category on y,
# colour = number of records in each cell (outlier-trimmed data).
alt.Chart(df_no_outl).mark_rect().encode(
    alt.X('amount', bin=alt.Bin(maxbins=50)),
    alt.Y('category'),
    alt.Color('count()'),
)

Spending pattern per weekday

In [None]:
# Heatmap of summed amounts: month on x, day on y
# (outlier-trimmed data).
alt.Chart(df_no_outl).mark_rect().encode(
    x=alt.X('month(date):O', title='Month'),
    y=alt.Y('day(date):O', title='Day'),
    color=alt.Color('sum(amount):Q', title='Expense Amount'),
)

In [None]:
# Heatmap of summed amounts per place (x) and category (y),
# on the outlier-trimmed data.
alt.Chart(df_no_outl).mark_rect().encode(
    x='place',
    y='category',
    color='sum(amount)',
)