Author: Murilo Farias

Starting: 2025-10-27

College: CCTB - Canadian College of Technology and Business

Course Title: Data Warehouse (EDW) Concepts


In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Obtain a Snowpark session through get_active_session(); the cells below
# use it to read warehouse tables via session.table(...).
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
# Dimension tables: each one is read through the Snowpark session and
# converted to a pandas DataFrame, in the order customer, location, device.
dim_customer, dim_location, dim_device = (
    session.table(table_name).to_pandas()
    for table_name in (
        "PUBLIC.DIM_CUSTOMER",
        "PUBLIC.DIM_LOCATION",
        "PUBLIC.DIM_DEVICE",
    )
)

# Fact table, read the same way.
fact_shopping = session.table("PUBLIC.FACT_SHOPPING").to_pandas()

In [None]:
# pandas display settings used for every table rendered below.
pd.options.display.max_columns = None   # no cap on the number of columns shown
pd.options.display.width = None         # let pandas detect the output width
pd.options.display.max_colwidth = None  # never truncate cell contents

In [None]:
# Preview of the fact table: a heading, then the leading five records
# (head() returns five rows when called without an argument).
print("FACT_SHOPPING - First 5 rows")
fact_preview = fact_shopping.head()
display(fact_preview)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the summary.
fact_shopping.info()

In [None]:
# Summary statistics of the fact table, rendered with display() so the
# result is shown as a rich table like the other DataFrame previews in
# this notebook (print() would fall back to a plain-text dump).
display(fact_shopping.describe())

In [None]:
# Headline KPIs computed from the fact table.
# total_transactions: number of rows in FACT_SHOPPING.
# total_purchases: sum of the PURCHASE column
#   (presumably a 0/1 flag per row -- TODO confirm against the table schema).
total_transactions = len(fact_shopping)
total_purchases = fact_shopping['PURCHASE'].sum()

# Guard the ratio: with an empty fact table there is nothing to divide by,
# so the rate is reported as 0 instead of failing or printing nan.
conversion_rate = (
    (total_purchases / total_transactions) * 100 if total_transactions else 0.0
)

print("KEY PERFORMANCE INDICATORS (KPIs)")
print(f"Total Transactions: {total_transactions:,}")
print(f"Total Purchases: {total_purchases:,}")
print(f"Conversion Rate: {conversion_rate:.2f}%")

In [None]:
# Count of null entries per column of the fact table
# (isna() is the same check as isnull()).
missing = fact_shopping.isna().sum()
print("MISSING VALUES ANALYSIS")
print(missing)

In [None]:
# For each listed column: print a heading, then the frequency of every
# distinct value, ordered by the value itself rather than by frequency.
for heading, column in (
    ("DEVICE DISTRIBUTION:", 'DEVICEID'),
    ("PURCHASE DISTRIBUTION:", 'PURCHASE'),
):
    print(heading)
    print(fact_shopping[column].value_counts().sort_index())

In [None]:
print("CORRELATION MATRIX")

# Pairwise correlations between the numeric columns of the fact table.
numeric_cols = fact_shopping.select_dtypes(include='number')
correlation = numeric_cols.corr()

# Annotated heatmap drawn on an explicitly created figure/axes pair.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Heatmap", fontsize=14, pad=10)
plt.show()


In [None]:
# Distribution of Visit Duration: 50-bin histogram of the VISITDURATION
# column of the fact table.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(fact_shopping['VISITDURATION'], bins=50, color='steelblue', edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Visit Duration', fontsize=16, fontweight='bold')
ax.set_xlabel('Visit Duration', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.grid(axis='y', alpha=0.3)

# Release the figure from pyplot's registry before handing it to Streamlit,
# exactly as the other chart cells in this notebook do; without this the
# figure stays open and accumulates every time the cell is re-run.
plt.close(fig)
st.pyplot(fig)


In [None]:
# First five customers, rendered with display() so the preview is shown as
# a rich table like the other DataFrame previews in this notebook.
display(dim_customer.head(5))

In [None]:
# Summary statistics of the customer dimension, shown as a rich table.
customer_summary = dim_customer.describe()
display(customer_summary)

In [None]:
# Number of customers per GENDER value, drawn as a pie chart with the
# percentage printed on each slice.
gender_counts = dim_customer['GENDER'].value_counts()

fig, ax = plt.subplots()
ax.pie(
    gender_counts.to_numpy(),
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set(title='Gender Distribution')

# The figure is closed in pyplot first and then rendered through Streamlit.
plt.close(fig)
st.pyplot(fig)

In [None]:
# Customers per age group, ordered with 'Under 18' first and the remaining
# groups in sorted order. The ordering is done with a sort key instead of
# prepending the label, so no empty NaN bar appears when 'Under 18' is
# absent from the data.
age_counts = dim_customer['AGEGROUP'].value_counts()
age_counts = age_counts.reindex(
    sorted(age_counts.index, key=lambda age: (age != 'Under 18', age))
)

fig, ax = plt.subplots()
bars = ax.bar(age_counts.index, age_counts.values)

# Add value labels (counts are whole numbers, so show them without decimals)
for bar in bars:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(),
            str(int(bar.get_height())), ha='center', va='bottom')

# Title and axis labels so the chart stands alone, like the other charts here
ax.set_title('Age Group Distribution')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')

plt.close(fig)
st.pyplot(fig)

In [None]:
# (rows, columns) of the device dimension
device_shape = dim_device.shape
print(device_shape)

In [None]:
# Show the device dimension with display() so it is rendered as a rich
# table like the other DataFrames in this notebook, not as plain text.
display(dim_device)

In [None]:
# (rows, columns) of the location dimension
location_shape = dim_location.shape
print(location_shape)

In [None]:
# Column labels of the location dimension
location_columns = dim_location.columns
print(location_columns)

In [None]:
# Leading five rows of the location dimension (head() returns five rows
# when called without an argument).
location_preview = dim_location.head()
display(location_preview)

In [None]:
# Rows per city, sorted ascending, and each city's share of all
# dim_location rows as a percentage rounded to two decimals.
total_locations = len(dim_location)
city_counts = dim_location['CITY'].value_counts().sort_values(ascending=True)
city_percentage = city_counts.div(total_locations).mul(100).round(2)

# Horizontal bars, one per city
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(city_counts.index, city_counts.values, color='steelblue')

# Annotate every bar with "<count> (<percentage>%)" just past its end
for bar, pct in zip(bars, city_percentage):
    label_x = bar.get_width() + 0.5
    label_y = bar.get_y() + bar.get_height() / 2
    ax.text(label_x, label_y, f"{int(bar.get_width())} ({pct}%)",
            va='center', fontsize=9)

ax.set(title='City Distribution', xlabel='Count', ylabel='City')
fig.tight_layout()

# Close the figure in pyplot, then render it through Streamlit
plt.close(fig)
st.pyplot(fig)

In [None]:
# Number of dim_location rows per REGION value, most frequent first
region_counts = dim_location['REGION'].value_counts()
print(region_counts)

In [None]:
# Rows per neighborhood (ascending) together with each neighborhood's
# percentage of all dim_location rows, rounded to two decimals.
neigh_counts = dim_location['NEIGHBORHOOD'].value_counts().sort_values(ascending=True)
neigh_percentage = neigh_counts.div(len(dim_location)).mul(100).round(2)

# One horizontal bar per neighborhood
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(
    y=neigh_counts.index,
    width=neigh_counts.values,
    color='steelblue',
)

# Write "<count> (<percentage>%)" to the right of each bar
for bar, pct in zip(bars, neigh_percentage):
    bar_length = bar.get_width()
    ax.text(
        x=bar_length + 0.5,
        y=bar.get_y() + bar.get_height() / 2,
        s=f"{int(bar_length)} ({pct}%)",
        va='center',
        fontsize=9,
    )

ax.set(title='Neighborhood Distribution', xlabel='Count', ylabel='Neighborhood')
fig.tight_layout()

# The figure is closed in pyplot before Streamlit renders it
plt.close(fig)
st.pyplot(fig)