# Exploratory Analysis of Shiny App A/B Test Data

This notebook explores the data collected from our A/B test experiment on the Shiny web application. We'll examine the data structure, distribution of metrics, and perform preliminary analyses to understand patterns and insights.

In [None]:
import sys
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add the parent directory ('..') to sys.path so that modules located there can be imported.
# NOTE(review): the original comment referred to a "src" directory, but the path appended is '..' — confirm the intended location.
sys.path.append('..')

## 1. Data Loading and Initial Exploration

In [None]:
# Read the raw experiment export into a DataFrame
file_path = '../data/shiny_user_data.csv'
df = pd.read_csv(file_path)

# Report the (rows, columns) shape, then preview the first rows as the cell output
print("Dataset shape: {}".format(df.shape))
df.head()

In [None]:
# Check data types and missing values: info() prints each column's dtype and non-null count
df.info()

In [None]:
# Count how many rows fall into each experiment group and print the tally
group_counts = df['Group'].value_counts()
print("Group distribution:", group_counts, sep="\n")

# Show the same counts as a pie chart
fig = px.pie(
    names=group_counts.index,
    values=group_counts.values,
    title="Group Distribution",
)
fig.show()

## 2. Processing Time Spent Data

In [None]:
# Examine the Time_Spent column
print("Sample Time_Spent values:")
# head(3) yields at most three rows, so a frame with fewer than three rows
# prints what it has instead of raising IndexError from .iloc
for i, raw_value in enumerate(df['Time_Spent'].head(3)):
    print(f"[{i}] {raw_value}")

In [None]:
# Function to parse the Time_Spent column
def parse_time_spent(time_str):
    """Return the total time across all sections stored in one Time_Spent value.

    Parameters
    ----------
    time_str : str
        Text form of a dict mapping section name to time. Single quotes are
        swapped for double quotes so the text can be parsed as JSON.

    Returns
    -------
    number or float('nan')
        The sum of the dict's values, or NaN when the value cannot be turned
        into a total.
    """
    try:
        # Convert string representation of dict to actual dict
        time_dict = json.loads(time_str.replace("'", '"'))
        # Return total time across all sections
        return sum(time_dict.values())
    except (json.JSONDecodeError, AttributeError, TypeError):
        # JSONDecodeError: the text is not valid JSON after the quote swap.
        # AttributeError: the input has no .replace (e.g. NaN/None), or the
        #   parsed value is not a dict and so has no .values().
        # TypeError: the values cannot be summed (e.g. null or strings), or
        #   the input is bytes. Without this, one bad row aborts the whole
        #   .apply() instead of yielding NaN like other unparseable rows.
        return np.nan

# Apply the parser to create a Total_Time_Spent column
df['Total_Time_Spent'] = df['Time_Spent'].apply(parse_time_spent)

# Extract time spent in individual sections
sections = ['data_upload', 'data_cleaning', 'feature_engineering', 'exploratory_analysis']


def _parse_section_times(time_str):
    """Parse one Time_Spent value into a dict; return {} when it cannot be parsed.

    Non-string input, malformed text and parsed values that are not dicts all
    give an empty dict, so every section lookup for that row falls back to NaN
    instead of raising and aborting the cell.
    """
    try:
        parsed = json.loads(time_str.replace("'", '"'))
    except (json.JSONDecodeError, AttributeError, TypeError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


# Parse every row once and reuse the result for all sections
# (previously each row was re-parsed once per section)
section_times = [_parse_section_times(raw_value) for raw_value in df['Time_Spent']]

for section in sections:
    df[f'Time_{section}'] = [times.get(section, np.nan) for times in section_times]

# Display the processed data
time_cols = ['Total_Time_Spent'] + [f'Time_{s}' for s in sections]
df[['User_ID', 'Group'] + time_cols].head()

## 3. Descriptive Statistics by Group

In [None]:
# Get descriptive statistics by group
def get_group_stats(group, data=None):
    """Return describe() statistics of the numeric columns for one group.

    Parameters
    ----------
    group : object
        Value of the 'Group' column that selects the rows to summarise.
    data : pandas.DataFrame, optional
        Frame to summarise; it must contain a 'Group' column. Defaults to the
        notebook-level ``df`` so existing single-argument calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column, with the column name in 'metric' followed
        by the statistics that ``DataFrame.describe`` produces.
    """
    frame = df if data is None else data
    group_df = frame[frame['Group'] == group]
    numeric_cols = group_df.select_dtypes(include=[np.number]).columns
    # describe() puts the statistics in rows; transpose so each metric is a row
    stats_df = group_df[numeric_cols].describe().T
    return stats_df.reset_index().rename(columns={'index': 'metric'})

# Build the per-group summaries once; the Group B table is shown in the next cell
group_a_stats, group_b_stats = (get_group_stats(label) for label in ('A', 'B'))

print("Group A Statistics:")
group_a_stats.loc[:, ['metric', 'count', 'mean', 'std', 'min', 'max']]

In [None]:
# Same column selection as the Group A table so the two can be compared side by side
print("Group B Statistics:")
group_b_stats.loc[:, ['metric', 'count', 'mean', 'std', 'min', 'max']]

## 4. Exploratory Visualizations

In [None]:
# Metrics whose distributions are compared between groups
key_metrics = [
    'Task_Completion_Rate',
    'Download_Interaction_Rate',
    'Button_Click_Rate',
    'Plot_Interactions',
    'Total_Time_Spent',
]

# Options shared by every histogram: one colour per group, a marginal box plot,
# and semi-transparent overlaid bars so both groups stay visible
histogram_options = dict(color="Group", marginal="box", barmode="overlay", opacity=0.7)

for metric in key_metrics:
    chart_title = f"Distribution of {metric.replace('_', ' ')}"
    fig = px.histogram(df, x=metric, title=chart_title, **histogram_options)
    fig.show()

In [None]:
# Correlate every numeric column except User_ID
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_metrics = [col for col in numeric_cols if col != 'User_ID']
corr_matrix = df.loc[:, corr_metrics].corr()

# Render the matrix as an annotated heatmap
heatmap_options = dict(
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title="Correlation Matrix of Metrics",
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.update_layout(width=700, height=700)
fig.show()

## 5. Preliminary Comparison of Key Metrics

In [None]:
# Visualize key metrics by group
comparison_data = []

for group in ['A', 'B']:
    group_df = df[df['Group'] == group]

    for metric in key_metrics:
        values = group_df[metric]
        comparison_data.append({
            'Group': group,
            'Metric': metric.replace('_', ' '),
            'Mean': values.mean(),
            # sem() divides by the number of non-missing values. Dividing by
            # len(group_df) also counted rows where this metric is NaN (e.g. an
            # unparseable Time_Spent), which understated the standard error.
            'StdErr': values.sem()
        })

comparison_df = pd.DataFrame(comparison_data)

# Create bar chart with one bar per group and metric; error bars show the standard error
fig = px.bar(
    comparison_df,
    x="Metric",
    y="Mean",
    color="Group",
    barmode="group",
    error_y="StdErr",
    title="Comparison of Key Metrics Between Groups"
)
fig.update_layout(height=500, width=800)
fig.show()

## 6. Time Spent Analysis

In [None]:
# Analyze time spent in different sections
time_sections = [f'Time_{s}' for s in sections]

# Readable label for each Time_* column, e.g. 'Time_data_upload' -> 'Data Upload'
section_labels = {
    column: column.replace('Time_', '').replace('_', ' ').title()
    for column in time_sections
}

# One record per (group, section) holding that group's mean time in the section
time_data = []
for group in ['A', 'B']:
    group_df = df[df['Group'] == group]
    for section in time_sections:
        clean_name = section_labels[section]
        time_data.append({
            'Group': group,
            'Section': clean_name,
            'Average Time (seconds)': group_df[section].mean(),
        })

time_df = pd.DataFrame(time_data)

# Create grouped bar chart
bar_options = dict(
    x='Section',
    y='Average Time (seconds)',
    color='Group',
    barmode='group',
    title="Average Time Spent in Each Section by Group",
)
fig = px.bar(time_df, **bar_options)
fig.update_layout(width=800, height=500)
fig.show()

## 7. Initial Hypothesis Testing

In [None]:
from scipy import stats

# Function to perform t-test between groups
def run_ttest(metric, data=None, alpha=0.05):
    """Compare one metric between groups 'A' and 'B' with a two-sample t-test.

    Parameters
    ----------
    metric : str
        Name of the column to compare.
    data : pandas.DataFrame, optional
        Frame holding a 'Group' column and the metric column. Defaults to the
        notebook-level ``df`` so existing single-argument calls are unchanged.
    alpha : float, optional
        Significance threshold applied to the p-value (default 0.05).

    Returns
    -------
    dict or str
        A dict with the test statistic, p-value, significance flag, both group
        means, their difference (B minus A) and the percent change relative to
        group A. If either group has fewer than two non-missing values, the
        string "Insufficient data for <metric>" is returned instead.
    """
    frame = df if data is None else data
    group_a = frame.loc[frame['Group'] == 'A', metric].dropna()
    group_b = frame.loc[frame['Group'] == 'B', metric].dropna()

    if len(group_a) < 2 or len(group_b) < 2:
        return f"Insufficient data for {metric}"

    # equal_var=False: do not assume the two groups share a variance (Welch's t-test)
    t_stat, p_val = stats.ttest_ind(group_a, group_b, equal_var=False)

    # Compute each mean once and reuse it below
    mean_a = group_a.mean()
    mean_b = group_b.mean()
    difference = mean_b - mean_a

    return {
        'metric': metric,
        't_statistic': t_stat,
        'p_value': p_val,
        'significant': p_val < alpha,
        'group_a_mean': mean_a,
        'group_b_mean': mean_b,
        'difference': difference,
        # Percent change is undefined when the baseline (group A) mean is zero
        'percent_change': (difference / mean_a * 100) if mean_a != 0 else np.nan,
    }

# Test all metrics
test_results = []
for metric in corr_metrics:
    result = run_ttest(metric)
    if isinstance(result, dict):
        test_results.append(result)
    else:
        # run_ttest returns a message string when a group has too little data
        print(result)

# Create results dataframe
# Naming the columns explicitly keeps the frame well-formed when no metric
# could be tested; otherwise the empty frame has no 'p_value' column and
# sort_values raises KeyError.
result_columns = [
    'metric', 't_statistic', 'p_value', 'significant',
    'group_a_mean', 'group_b_mean', 'difference', 'percent_change',
]
results_df = pd.DataFrame(test_results, columns=result_columns)
results_df = results_df.sort_values('p_value')

# Display results
results_df[['metric', 'p_value', 'significant', 'group_a_mean', 'group_b_mean', 'percent_change']]

## 8. Summary of Initial Findings

In [None]:
# Filter for significant results
# astype(bool) keeps the mask boolean even if the column is empty / object-typed
significant_results = results_df[results_df['significant'].astype(bool)]

print(f"Found {len(significant_results)} significant differences between groups:")
for _, row in significant_results.iterrows():
    metric = row['metric'].replace('_', ' ')
    change = row['percent_change']
    if pd.isna(change):
        # percent_change is NaN when Group A's mean is 0; report the absolute
        # difference instead of printing a misleading "nan% lower"
        print(f"- {metric}: Group B differs from Group A by {row['difference']:+.2f} "
              f"(percent change undefined; p={row['p_value']:.4f})")
        continue
    direction = "higher" if change > 0 else "lower"
    print(f"- {metric}: Group B is {abs(change):.2f}% {direction} than Group A (p={row['p_value']:.4f})")

## 9. Next Steps

Based on the exploratory analysis, our next steps will be:

1. Refine our statistical testing approach (consider using non-parametric tests for non-normal distributions)
2. Develop more detailed visualizations for the key metrics
3. Analyze user behavior patterns and the engagement funnel
4. Prepare the final statistical report with conclusions and recommendations