# Enhanced Visualization Notebook
## Meeting All QA Requirements

In [None]:
import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import TfidfVectorizer
from statsmodels.tsa.seasonal import STL
from wordcloud import WordCloud

# Select Altair's 'mimetype' renderer. This controls how charts are displayed
# by the notebook front end; writing charts to disk is done separately through
# Chart.save() in the business-impact cell.
alt.renderers.enable('mimetype')

# Load the sample dataset into the notebook-level DataFrame used by every cell
# below. The path is relative to the working directory; no date parsing is
# requested here, so any date column arrives as plain strings.
df = pd.read_csv('../data/sample_data.csv')

## 1. Activity Type EDA (5 Types)

In [None]:
def plot_activity_type_eda(df: pd.DataFrame) -> None:
    """Draw one duration box plot per activity type and export the grid as SVG.

    The distinct ``activity_type`` values are sorted and laid out three per
    row. Each panel shows ``duration_sec`` for one type, plotting individual
    points only for outliers. The figure is displayed and then written to
    ``activity_types.svg`` in the current working directory.

    Args:
        df: Data containing ``activity_type`` and ``duration_sec`` columns.
    """
    n_cols = 3

    # Missing labels can never match the equality filter below, and NaN mixed
    # with real labels makes sorted() unreliable (or raise for strings), so
    # drop them before ordering.
    activities = sorted(df['activity_type'].dropna().unique())

    # Keep the 2x3 layout for up to six types, but grow the grid when there
    # are more: a fixed two-row grid makes add_trace fail on the seventh type.
    n_rows = max(2, -(-len(activities) // n_cols))

    # Plotly interactive subplots
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[f"Type {a}" for a in activities],
    )

    for i, act in enumerate(activities):
        # Fill the grid row by row; Plotly rows/cols are 1-based.
        row_idx, col_idx = divmod(i, n_cols)
        subset = df[df['activity_type'] == act]

        fig.add_trace(
            go.Box(
                y=subset['duration_sec'],
                name=f"Type {act}",
                boxpoints='outliers',
                marker_color='#1f77b4',
            ),
            row=row_idx + 1, col=col_idx + 1,
        )

    fig.update_layout(
        title="Activity Duration Distribution by Type",
        # 400 px per row reproduces the original 800 px for the two-row case.
        height=400 * n_rows,
        showlegend=False,
    )
    fig.show()

    # Save as SVG (static export; presumably needs Plotly's image-export
    # backend installed -- verify in the target environment).
    fig.write_image("activity_types.svg", format='svg')

plot_activity_type_eda(df)

## 2. Statistical Significance Markers

In [None]:
def add_stats_annotations(
    fig,
    p_values: dict,
    x_ref: list,
    *,
    y_base: "float | None" = None,
) -> None:
    """Add significance brackets and labels to a Plotly figure in place.

    For every comparison a horizontal line is drawn between the two compared
    x positions, with ``"*"`` (p < 0.05) or ``"ns"`` centred just above it.
    Successive comparisons are stacked upward so brackets do not overlap.

    Args:
        fig: Figure-like object exposing ``add_shape`` and ``add_annotation``.
        p_values: Maps a comparison to its p-value. A comparison is either a
            pair of indices into ``x_ref`` (e.g. ``(0, 1)``) or the same pair
            written as an ``"i_j"`` string (e.g. ``"0_1"``).
        x_ref: x-axis positions, indexed by the comparison indices. The label
            is centred with ``(a + b) / 2``, so positions must be numeric.
        y_base: Height of the lowest bracket. When omitted, 5% above the
            maximum ``duration_sec`` of the notebook-level ``df`` is used.

    Raises:
        ValueError: If a string key does not split into exactly two integers.
    """
    if y_base is None:
        # Default keeps the original reliance on the notebook-level DataFrame.
        y_base = df['duration_sec'].max() * 1.05

    for level, (comparison, p) in enumerate(p_values.items()):
        if isinstance(comparison, str):
            # "0_1" -> (0, 1); unpacking the raw string would yield characters.
            left, right = (int(part) for part in comparison.split("_"))
        else:
            left, right = comparison

        x_left, x_right = x_ref[left], x_ref[right]

        # Raise each later bracket by 6% so brackets and labels stay legible;
        # the first bracket sits exactly at y_base.
        y_line = y_base * (1 + 0.06 * level)

        fig.add_shape(
            type="line",
            x0=x_left, x1=x_right,
            y0=y_line, y1=y_line,
            line=dict(color="black", width=1)
        )

        fig.add_annotation(
            x=(x_left + x_right) / 2,
            y=y_line * 1.02,
            text="*" if p < 0.05 else "ns",
            showarrow=False
        )

# Example usage with dummy p-values.
# Each key is an (i, j) pair of indices into the sorted activity-type list
# passed as x_ref, so it unpacks cleanly into the two compared positions.
fig = px.box(df, x='activity_type', y='duration_sec')
p_values = {(0, 1): 0.03, (0, 2): 0.45}  # Replace with real stats
add_stats_annotations(fig, p_values, sorted(df['activity_type'].unique()))
fig.show()

## 3. TF-IDF Text Analysis

In [None]:
def generate_text_analysis(df: pd.DataFrame, text_col: str) -> None:
    """Plot a word cloud of bigrams sized by their summed TF-IDF weight.

    Rows where ``text_col`` is missing are ignored. The vectorizer keeps at
    most 50 English-stop-word-filtered bigrams; each bigram's weight is its
    TF-IDF score summed over all documents. The cloud is drawn on a new
    matplotlib figure with the axes hidden.

    Args:
        df: Data containing the text column.
        text_col: Name of the column holding the free-text documents.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(2, 2),  # bigrams only
        max_features=50,
    )
    documents = df[text_col].dropna()
    scores = vectorizer.fit_transform(documents)

    # Column sums collapse the document axis: one aggregate weight per bigram.
    totals = scores.sum(axis=0).A1
    bigram_weights = dict(zip(vectorizer.get_feature_names_out(), totals))

    cloud = WordCloud(width=800, height=400)
    cloud = cloud.generate_from_frequencies(bigram_weights)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud)
    plt.axis('off')
    plt.title('TF-IDF Weighted Bigrams')
    
# Example (if text column exists)
# generate_text_analysis(df, 'user_comments')

## 4. Time Series Decomposition

In [None]:
def time_series_analysis(df: pd.DataFrame) -> None:
    """Decompose daily mean duration with STL and plot the components.

    ``duration_sec`` is averaged per calendar day, decomposed with a 7-day
    seasonal period, and shown as four stacked panels sharing one x axis:
    observed, trend, seasonal and residual.

    Args:
        df: Data containing a ``date`` column (datetime values or strings
            parseable by ``pandas.to_datetime``) and a ``duration_sec`` column.
    """
    # resample() needs a datetime-like index. Dates read from CSV without
    # parsing are plain strings, so convert explicitly; this is a no-op for
    # columns that are already datetime. assign() leaves the caller's frame
    # untouched.
    dated = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
    ts_data = dated['duration_sec'].resample('D').mean()

    # Days with no records come out of the daily mean as NaN. Fill the gaps
    # by time-weighted interpolation so the decomposition gets a complete,
    # evenly spaced series.
    ts_data = ts_data.interpolate(method='time', limit_direction='both')

    result = STL(ts_data, period=7).fit()  # Weekly seasonality

    components = [
        ('Observed', ts_data),
        ('Trend', result.trend),
        ('Seasonal', result.seasonal),
        ('Residual', result.resid),
    ]

    fig = make_subplots(rows=len(components), cols=1, shared_xaxes=True)
    for row, (label, series) in enumerate(components, start=1):
        fig.add_trace(
            go.Scatter(x=series.index, y=series, name=label),
            row=row, col=1
        )

    fig.update_layout(height=800, title_text="STL Decomposition")
    fig.show()
    
time_series_analysis(df)

## 5. Business Impact Visualization

In [None]:
def plot_business_impact(
    df: pd.DataFrame,
    avg_order_value: float = 89.99,
    output_path: "str | None" = 'business_impact.svg',
) -> alt.Chart:
    """Build a bar chart of estimated revenue per 1000 users by activity type.

    The conversion rate of each activity type is the mean of ``conversion``
    within that type. Revenue per 1000 users is that rate multiplied by 1000
    and by the average order value.

    Args:
        df: Data containing ``activity_type`` and ``conversion`` columns.
        avg_order_value: Revenue attributed to one conversion. Defaults to
            the previously hard-coded 89.99.
        output_path: Where to save the chart; the default keeps the original
            ``business_impact.svg``. Pass ``None`` to skip saving.

    Returns:
        The Altair bar chart, with activity type, conversion rate and
        revenue estimate in the tooltip.
    """
    # Calculate conversion rates
    conv_rates = df.groupby('activity_type')['conversion'].mean().reset_index()
    conv_rates['revenue_per_1k'] = conv_rates['conversion'] * 1000 * avg_order_value

    chart = alt.Chart(conv_rates).mark_bar().encode(
        x='activity_type:N',
        y='revenue_per_1k:Q',
        tooltip=['activity_type', 'conversion', 'revenue_per_1k']
    ).properties(
        title="Estimated Revenue per 1000 Users by Activity Type",
        width=600
    )

    # Save as SVG (the format follows the file extension of output_path)
    if output_path is not None:
        chart.save(output_path)
    return chart

plot_business_impact(df)