# Class Performance Analysis

This notebook analyzes the performance of individual classes in terms of:
- Download success rates
- Image counts per class
- Quality metrics (a weighted performance score per class and a combined score per category)
- Problem identification

In [1]:
# Import required modules
import sys
import os
sys.path.append('..')

from visualizations.data_loader import (
    load_all_metadata,
    create_summary_dataframe,
    create_image_details_dataframe
)

from visualizations.plotters import (
    plot_class_distribution,
    plot_success_rates
)

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("✅ Modules imported successfully")

✅ Modules imported successfully


In [2]:
# Load data
# The three names below are notebook-level globals that later cells read:
#   summary_df - one row per class; later cells access its 'class_name',
#                'category', 'downloaded_images', 'download_success_rate',
#                'urls_found' and 'urls_requested' columns.
#   image_df   - passed to analyze_problem_classes() in a later cell.
# NOTE(review): the exact schemas come from visualizations.data_loader,
# which is not visible in this notebook - confirm there.
metadata_list = load_all_metadata()
summary_df = create_summary_dataframe(metadata_list)
image_df = create_image_details_dataframe(metadata_list)

# len(summary_df) is reported as the number of classes being analyzed.
print(f"📊 Analyzing {len(summary_df)} classes")

Found 100 metadata files
✅ Successfully loaded 100 metadata files
📊 Analyzing 100 classes


In [3]:
# Class distribution analysis
# Delegates to the plotters helper, limited to top_n=20 and asking for the
# Plotly backend. NOTE(review): presumably "top" means the classes with the
# most images - confirm in visualizations.plotters.
plot_class_distribution(summary_df, top_n=20, use_plotly=True)

In [4]:
# Success rate analysis
# Delegates to the plotters helper and asks for the Plotly backend.
# NOTE(review): which success-rate views are drawn is decided inside
# visualizations.plotters - confirm there.
plot_success_rates(summary_df, use_plotly=True)

In [5]:
# Identify problematic classes
def analyze_problem_classes(summary_df, image_df):
    """
    Identify classes with potential issues.

    A class is reported when at least one of these checks fires:
      - download success rate below 80
      - no downloaded images at all
      - fewer than 10 (but more than 0) downloaded images
      - fewer than half of the requested URLs were found

    Args:
        summary_df: DataFrame with one row per class. Must provide the
            columns 'class_name', 'category', 'download_success_rate',
            'downloaded_images', 'urls_found' and 'urls_requested'.
        image_df: Currently unused; kept so existing callers that pass the
            per-image frame keep working.

    Returns:
        DataFrame with the columns 'class_name', 'category', 'issues'
        (comma-separated description), 'downloaded_images', 'success_rate',
        'urls_found' and 'urls_requested'. The columns are present even when
        no class is flagged, so callers can sort or select on them safely.
    """
    result_columns = [
        'class_name', 'category', 'issues', 'downloaded_images',
        'success_rate', 'urls_found', 'urls_requested',
    ]
    problems = []

    # to_dict('records') yields plain per-row dicts and avoids the per-row
    # Series construction (and dtype coercion) that iterrows() performs.
    for row in summary_df.to_dict('records'):
        success_rate = row['download_success_rate']
        downloaded = row['downloaded_images']
        urls_found = row['urls_found']
        urls_requested = row['urls_requested']

        issues = []

        # Low success rate
        if success_rate < 80:
            issues.append(f"Low success rate ({success_rate:.1f}%)")

        # Zero downloads and "few images" are mutually exclusive, matching
        # how generate_recommendations separates the two cases; previously a
        # class with 0 images was reported under both labels.
        if downloaded == 0:
            issues.append("No successful downloads")
        elif downloaded < 10:
            issues.append(f"Few images ({downloaded})")

        # Large gap between requested and found
        if urls_requested > 0 and urls_found < urls_requested * 0.5:
            issues.append(f"Poor URL discovery ({urls_found}/{urls_requested})")

        if issues:
            problems.append({
                'class_name': row['class_name'],
                'category': row['category'],
                'issues': ', '.join(issues),
                'downloaded_images': downloaded,
                'success_rate': success_rate,
                'urls_found': urls_found,
                'urls_requested': urls_requested,
            })

    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(problems, columns=result_columns)

# Scan the loaded summary for problem classes and keep the result as a
# notebook-level frame.
problem_classes = analyze_problem_classes(summary_df, image_df)

# Always print the count first, then either the all-clear message or the
# flagged classes ordered from lowest to highest success rate.
print(f"⚠️ Found {len(problem_classes)} classes with potential issues:")
if len(problem_classes) == 0:
    print("✅ No significant issues found!")
else:
    display(problem_classes.sort_values('success_rate'))

⚠️ Found 0 classes with potential issues:
✅ No significant issues found!


In [6]:
# Performance comparison by category
def plot_category_performance(df=None):
    """
    Show a 2x2 Plotly figure comparing download performance by category.

    Panels:
      - row 1, col 1: mean downloaded images per class, with the standard
        deviation as error bars
      - row 1, col 2: box plot of per-class download success rates
      - row 2, col 1: total downloaded images
      - row 2, col 2: combined score, the average of the mean success rate
        and the mean image count rescaled so the best category equals 100

    Args:
        df: Summary frame providing the 'category', 'downloaded_images' and
            'download_success_rate' columns. When omitted, the notebook-level
            ``summary_df`` is used, which is what the original zero-argument
            call relied on.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    if df is None:
        # Preserve the original behaviour of reading the notebook global.
        df = summary_df

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Average Images per Class by Category',
            'Success Rate Distribution by Category',
            'Total Images by Category',
            'Performance Score by Category'
        )
    )

    # Average images per class
    avg_by_category = df.groupby('category')['downloaded_images'].agg(['mean', 'std']).reset_index()

    fig.add_trace(
        go.Bar(
            x=avg_by_category['category'],
            y=avg_by_category['mean'],
            error_y=dict(type='data', array=avg_by_category['std']),
            text=[f"{v:.1f}" for v in avg_by_category['mean']],
            textposition='auto',
            name="Avg Images"
        ),
        row=1, col=1
    )

    # Success rate box plots by category (one trace per category, in order
    # of first appearance in the frame)
    for category in df['category'].unique():
        cat_data = df[df['category'] == category]
        fig.add_trace(
            go.Box(
                y=cat_data['download_success_rate'],
                name=category,
                boxpoints='outliers'
            ),
            row=1, col=2
        )

    # Total images by category
    total_by_category = df.groupby('category')['downloaded_images'].sum()

    fig.add_trace(
        go.Bar(
            x=total_by_category.index,
            y=total_by_category.values,
            text=[f"{v:,}" for v in total_by_category.values],
            textposition='auto',
            name="Total Images"
        ),
        row=2, col=1
    )

    # Performance score (combination of success rate and image count)
    performance_by_category = df.groupby('category').agg({
        'download_success_rate': 'mean',
        'downloaded_images': 'mean'
    })

    # Normalize the image count to a 0-100 scale relative to the best
    # category. Guard the division: when no category has any images (or the
    # frame is empty) the ratio would be NaN and poison the combined score.
    max_images = performance_by_category['downloaded_images'].max()
    if max_images > 0:
        performance_by_category['image_score'] = (
            performance_by_category['downloaded_images'] / max_images
        ) * 100
    else:
        performance_by_category['image_score'] = 0.0
    performance_by_category['combined_score'] = (
        performance_by_category['download_success_rate']
        + performance_by_category['image_score']
    ) / 2

    fig.add_trace(
        go.Bar(
            x=performance_by_category.index,
            y=performance_by_category['combined_score'],
            text=[f"{v:.1f}" for v in performance_by_category['combined_score']],
            textposition='auto',
            name="Performance Score"
        ),
        row=2, col=2
    )

    fig.update_layout(
        title_text="📊 Category Performance Analysis",
        title_x=0.5,
        height=800,
        showlegend=False
    )

    fig.show()

plot_category_performance()

In [7]:
# Top and bottom performers
def show_top_bottom_performers(df=None):
    """
    Display the 10 best and 10 worst classes by a weighted performance score.

    The score is 70% download success rate plus 30% image count, where the
    image count is rescaled to 0-100 relative to the class with the most
    images. Classes with zero downloaded images are left out of the bottom
    list.

    Args:
        df: Summary frame providing 'class_name', 'category',
            'downloaded_images' and 'download_success_rate'. When omitted,
            the notebook-level ``summary_df`` is used, which is what the
            original zero-argument call relied on.

    Returns:
        None. Both tables are printed/displayed. The input frame is not
        modified: the score is computed on a copy, so re-running this cell
        or earlier cells sees the same ``summary_df``.
    """
    if df is None:
        df = summary_df

    report_columns = [
        'class_name', 'category', 'downloaded_images',
        'download_success_rate', 'performance_score'
    ]

    # Rescale image counts to 0-100. Guard the division so a frame in which
    # nothing was downloaded yields a 0 image component instead of NaN.
    max_images = df['downloaded_images'].max()
    if max_images > 0:
        image_score = df['downloaded_images'] / max_images * 100
    else:
        image_score = 0.0

    # assign() returns a new frame, leaving the caller's frame untouched.
    scored = df.assign(
        performance_score=(
            df['download_success_rate'] * 0.7 +  # 70% weight to success rate
            image_score * 0.3  # 30% weight to image count
        )
    )

    # Top performers
    top_performers = scored.nlargest(10, 'performance_score')[report_columns]

    # Bottom performers (excluding those with 0 images)
    with_images = scored[scored['downloaded_images'] > 0]
    bottom_performers = with_images.nsmallest(10, 'performance_score')[report_columns]

    print("🏆 TOP 10 PERFORMING CLASSES:")
    print("=" * 50)
    display(top_performers)

    print("\n⚠️ BOTTOM 10 PERFORMING CLASSES:")
    print("=" * 50)
    display(bottom_performers)

show_top_bottom_performers()

🏆 TOP 10 PERFORMING CLASSES:


Unnamed: 0,class_name,category,downloaded_images,download_success_rate,performance_score
1,Adobong Sitaw,Glow,500,100.0,100.0
3,Ampalaya con Itlog,Glow,500,100.0,100.0
4,Bulanglang,Glow,500,100.0,100.0
5,Chopsuey,Glow,500,100.0,100.0
10,Ginataang Kalabasa at Sitaw,Glow,500,100.0,100.0
12,Ginataang Puso ng Saging,Glow,500,100.0,100.0
15,Ginisang Pechay,Glow,500,100.0,100.0
16,Ginisang Repolyo,Glow,500,100.0,100.0
19,Gising-Gising,Glow,500,100.0,100.0
24,Lumpiang Sariwa,Glow,500,100.0,100.0



⚠️ BOTTOM 10 PERFORMING CLASSES:


Unnamed: 0,class_name,category,downloaded_images,download_success_rate,performance_score
23,Laswa,Glow,258,99.230769,84.941538
82,Inasal na Pakpak,Grow,407,93.135011,89.614508
20,Inabraw,Glow,362,100.0,91.72
9,Ensaladang Talong,Glow,386,97.969543,91.73868
7,Ensaladang Lato,Glow,392,99.745547,93.341883
8,Ensaladang Mangga,Glow,449,95.127119,93.528983
13,Ginisang Ampalaya,Glow,457,99.132321,96.812625
31,Pritong Talong,Glow,453,99.56044,96.872308
17,Ginisang Sayote,Glow,451,100.0,97.06
11,Ginataang Langka,Glow,474,100.0,98.44


In [8]:
# Class-level recommendations
def generate_recommendations(summary_df):
    """
    Print prioritized improvement suggestions for the scraped classes.

    Four checks run in a fixed order; each one that matches at least one
    class contributes a numbered entry with its priority, the number of
    affected classes, a suggested action and up to five example class names.
    If nothing matches, a single all-clear line is printed instead.

    Args:
        summary_df: DataFrame with one row per class and the columns
            'class_name', 'downloaded_images', 'download_success_rate',
            'urls_found' and 'urls_requested'.

    Returns:
        None. The report is written to stdout.
    """
    downloaded = summary_df['downloaded_images']
    success_rate = summary_df['download_success_rate']
    requested = summary_df['urls_requested']
    found = summary_df['urls_found']
    has_images = downloaded > 0

    # (priority, issue, action, row mask) in report order.
    checks = [
        (
            'HIGH',
            'Classes with zero downloads',
            'Review search terms and URL fetching for these classes',
            downloaded == 0,
        ),
        (
            'MEDIUM',
            'Classes with low success rates (<70%)',
            'Check URL quality and download error patterns',
            (success_rate < 70) & has_images,
        ),
        (
            'LOW',
            'Classes with very few images (<10)',
            'Consider increasing search parameters or adding more search terms',
            (downloaded < 10) & has_images,
        ),
        (
            'MEDIUM',
            'Classes with poor URL discovery',
            'Review search terms and Google search parameters',
            (requested > 0) & (found < requested * 0.5),
        ),
    ]

    findings = []
    for priority, issue, action, mask in checks:
        affected = summary_df[mask]
        if len(affected) == 0:
            continue
        findings.append({
            'priority': priority,
            'issue': issue,
            'count': len(affected),
            'action': action,
            # Only the first five names are kept as examples.
            'classes': affected['class_name'].tolist()[:5],
        })

    print("💡 RECOMMENDATIONS FOR IMPROVEMENT:")
    print("=" * 50)

    if not findings:
        print("✅ No major issues found! Your scraping setup is performing well.")
        return

    for number, finding in enumerate(findings, start=1):
        examples = finding['classes']
        print(f"\n{number}. [{finding['priority']}] {finding['issue']}")
        print(f"   Count: {finding['count']} classes")
        print(f"   Action: {finding['action']}")
        print(f"   Examples: {', '.join(examples)}")
        # Examples are capped at five, so a positive remainder means the
        # list was truncated.
        remaining = finding['count'] - len(examples)
        if remaining > 0:
            print(f"   ... and {remaining} more")

# Print the recommendation report for the class summary loaded earlier.
generate_recommendations(summary_df)

💡 RECOMMENDATIONS FOR IMPROVEMENT:
✅ No major issues found! Your scraping setup is performing well.


## Summary

This analysis provides insights into:

### Class Performance Metrics
- **Success Rates**: How well each class performed in terms of successful downloads
- **Image Counts**: Distribution of images across different classes
- **Category Comparison**: Performance differences between Go, Grow, and Glow categories

### Identified Issues
- **Problem Classes**: Classes with low success rates, few or no downloaded images, or poor URL discovery
- **Recommendations**: Actionable suggestions for improvement
- **Priority Levels**: High, medium, and low priority issues to address

### Next Steps
1. Address high-priority issues first (classes with zero downloads)
2. Review search terms for classes with poor URL discovery
3. Investigate download errors for classes with low success rates
4. Consider adjusting search parameters for classes with few images