# Data Cleaning & Preparation

> Load and Profile Data

In [None]:
import pandas as pd

# Read the raw transactions export into a DataFrame.
file_path = 'Senior Data Analyst Task Transactions.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# (row count, column count) of the raw frame.
df.shape

In [None]:
# Column labels of the raw frame.
df.columns

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Per-column dtypes and non-null counts for the raw frame.
df.info()

In [None]:
# Descriptive statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

> Data Profiling

In [None]:
from ydata_profiling import ProfileReport

# Build an exploratory profiling report for the raw frame and export it as HTML.
profile = ProfileReport(
    df,
    title="Crypto Transactions Profiling Report",
    explorative=True,
)
profile.to_file("crypto_transactions_profile.html")

# Data Cleaning

In [None]:
# Handle missing values
# Work on a copy so the raw frame `df` is left untouched.
df_clean = df.copy()

# Text (object-dtype) columns get the 'Unknown' placeholder; every other column
# gets its own median. The filled column is assigned back explicitly:
# `df_clean[col].fillna(..., inplace=True)` is chained assignment, which pandas
# 2.x warns about and which no longer updates `df_clean` under Copy-on-Write.
for col in df_clean.columns:
    if not df_clean[col].isna().any():
        # Nothing to fill, so there is no need to compute a median.
        continue
    if df_clean[col].dtype == 'object':
        df_clean[col] = df_clean[col].fillna('Unknown')
    else:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Drop duplicates
df_clean = df_clean.drop_duplicates()

# Convert date column. errors='coerce' turns anything unparseable into NaT,
# which includes the 'Unknown' placeholder if this column was filled as text above.
if 'Transaction_Date' in df_clean.columns:
    df_clean['Transaction_Date'] = pd.to_datetime(df_clean['Transaction_Date'], errors='coerce')

# Save cleaned CSV for dashboards
df_clean.to_csv('cleaned_crypto_transactions.csv', index=False)

# Show a preview rather than the whole frame.
df_clean.head()

# Summary Stats & Trends

In [None]:
# Summary statistics
# Headline figures for the cleaned data, keyed by metric name.
summary = {
    'total_transactions': len(df_clean),
    'unique_assets': df_clean['Crypto'].nunique(),
    # nunique() over Wallet_Type counts distinct wallet *types*, not individual
    # wallets, so the key is named accordingly.
    'unique_wallet_types': df_clean['Wallet_Type'].nunique(),
    # NOTE(review): Amount is summed across all assets here, while the KPI table
    # later in the notebook uses Total_Value — confirm which measure is intended.
    'total_volume': df_clean['Amount'].sum(),
    'median_txn_size': df_clean['Amount'].median(),
}

print(summary)

# Plot 1: Transaction Volume Over Time (Line Chart)

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Re-read the cleaned export written by the cleaning step.
df = pd.read_csv('cleaned_crypto_transactions.csv')

# read_csv is called without date parsing, so convert the column here.
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'])

# --- PLOT 1: Transaction Volume Over Time (USD) ---
# Sum Total_Value per calendar day, then draw the daily totals as a line.
txn_day = df['Transaction_Date'].dt.date
daily = df.groupby(txn_day)['Total_Value'].sum().reset_index()

fig_line = px.line(
    daily,
    x='Transaction_Date',
    y='Total_Value',
    title='Transaction Volume (USD) Over Time',
    labels={'Total_Value': 'Total Volume (USD)', 'Transaction_Date': 'Date'},
)
fig_line.show()

# Plot 2: Top Assets by Transaction Value

In [None]:

# Rank assets by summed Total_Value and keep the ten largest.
value_by_asset = df.groupby('Crypto')['Total_Value'].sum()
asset_totals = value_by_asset.sort_values(ascending=False).head(10)

fig_bar = px.bar(
    x=asset_totals.index,
    y=asset_totals.values,
    title='Top 10 Crypto Assets by Transaction Value (USD)',
    labels={'x': 'Crypto Asset', 'y': 'Total Transaction Value (USD)'},
)
fig_bar.show()

# Plot 3: Distribution of Transaction Values

In [None]:
# Histogram of per-transaction Total_Value, split into 50 bins.
fig_hist = px.histogram(
    df,
    x='Total_Value',
    nbins=50,
    title='Distribution of Transaction Values (USD)',
    labels={'Total_Value': 'Transaction Value (USD)'},
)
fig_hist.show()

# KPI Summary Table

In [None]:
# Each KPI is a single-element list, so it renders as one table column
# holding one row.
summary = {
    'Total Transactions': [len(df)],
    'Unique Crypto Assets': [df['Crypto'].nunique()],
    'Unique Users': [df['User_ID'].nunique()],
    'Total Volume (USD)': [df['Total_Value'].sum()],
    'Median Transaction Value (USD)': [df['Total_Value'].median()],
}

# Header cells come from the KPI names, body cells from the KPI values.
kpi_header = dict(values=list(summary.keys()))
kpi_cells = dict(values=list(summary.values()))

fig_table = go.Figure(data=[go.Table(header=kpi_header, cells=kpi_cells)])
fig_table.show()

# Outliers

In [None]:
# Define outlier threshold using IQR
# A transaction is an outlier when Total_Value falls more than 1.5 * IQR
# below the first quartile or above the third quartile.
total_value = df['Total_Value']
Q1 = total_value.quantile(0.25)
Q3 = total_value.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

is_outlier = total_value.lt(lower_bound) | total_value.gt(upper_bound)
outliers = df[is_outlier]

print(f"Outlier transactions based on Total_Value ({len(outliers)} found):")
preview_cols = ['Transaction_ID', 'Total_Value', 'Crypto', 'Transaction_Date']
display(outliers[preview_cols].head())

In [None]:
# Outlier detection using IQR for Total_Value
# Split transactions into "normal" and "outlier" groups using the 1.5 * IQR
# fences on Total_Value, then plot Total_Value against Transaction_Fee with
# the outliers highlighted.
Q1 = df['Total_Value'].quantile(0.25)
Q3 = df['Total_Value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['Total_Value'] < lower_bound) | (df['Total_Value'] > upper_bound)]
normal = df[(df['Total_Value'] >= lower_bound) & (df['Total_Value'] <= upper_bound)]

# Extra columns exposed to the hover card; their order must match the
# customdata[i] indices used in the template below.
hover_cols = ['Transaction_ID', 'Crypto', 'Platform', 'Transaction_Type', 'Transaction_Date']

normal_customdata = normal[hover_cols].values
outliers_customdata = outliers[hover_cols].values

# Both traces show the same hover card, so define it once to keep them in sync.
hover_template = (
    "<b>Total Value</b>: %{x}<br>"
    "<b>Transaction Fee</b>: %{y}<br>"
    "<b>Transaction ID</b>: %{customdata[0]}<br>"
    "<b>Crypto</b>: %{customdata[1]}<br>"
    "<b>Platform</b>: %{customdata[2]}<br>"
    "<b>Type</b>: %{customdata[3]}<br>"
    "<b>Date</b>: %{customdata[4]}<br>"
    "<extra></extra>"
)

fig_scatter = go.Figure()
fig_scatter.add_trace(go.Scatter(
    x=normal['Total_Value'],
    y=normal['Transaction_Fee'],
    mode='markers',
    name='Normal',
    marker=dict(color='blue', opacity=0.5),
    customdata=normal_customdata,
    hovertemplate=hover_template,
))
fig_scatter.add_trace(go.Scatter(
    x=outliers['Total_Value'],
    y=outliers['Transaction_Fee'],
    mode='markers',
    name='Outliers',
    marker=dict(color='red', size=10, symbol='x'),
    customdata=outliers_customdata,
    hovertemplate=hover_template,
))
fig_scatter.update_layout(
    title='Transaction Value vs Transaction Fee (Outliers Highlighted)',
    xaxis_title='Total Value (USD)',
    yaxis_title='Transaction Fee (USD)',
    # The legend lists the Normal / Outliers traces, not the Transaction_Type
    # column, so it is titled after what it actually distinguishes.
    legend_title='Outlier Status'
)
fig_scatter.show()

In [None]:
# Histogram of Transaction Fee with Outlier Threshold
# The threshold is the upper IQR fence: third quartile plus 1.5 * IQR.
fee_series = df['Transaction_Fee']
Q3 = fee_series.quantile(0.75)
Q1 = fee_series.quantile(0.25)
IQR = Q3 - Q1
outlier_threshold = Q3 + 1.5 * IQR

fig = px.histogram(
    df,
    x='Transaction_Fee',
    nbins=50,
    title='Distribution of Transaction Fees with Outlier Threshold',
    labels={'Transaction_Fee': 'Transaction Fee (USD)'},
)
# Mark the threshold with a dashed red vertical line.
fig.add_vline(
    x=outlier_threshold,
    line_dash="dash",
    line_color="red",
    annotation_text="Outlier Threshold",
)
fig.show()

In [None]:
# Mean Transaction_Fee per asset, highest first, drawn as a bar chart.
mean_fee = df.groupby('Crypto')['Transaction_Fee'].mean()
fee_by_crypto = mean_fee.sort_values(ascending=False).reset_index()

fig = px.bar(
    fee_by_crypto,
    x='Crypto',
    y='Transaction_Fee',
    title='Average Transaction Fee by Crypto Asset',
    labels={'Transaction_Fee': 'Avg Fee (USD)', 'Crypto': 'Crypto Asset'},
)
fig.show()