In [6]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List
import numpy as np
from datetime import datetime
import os

class DataAnalysisSystem:
    """Exploratory analysis pipeline for a single Hugging Face dataset.

    The pipeline loads one split into a pandas DataFrame, computes summary
    statistics, saves plots under ``analysis_workspace/visualizations`` and
    writes a Markdown report into ``analysis_workspace``.
    """

    def __init__(self):
        # Creating the nested directory also creates "analysis_workspace" itself.
        os.makedirs("analysis_workspace/visualizations", exist_ok=True)

    @staticmethod
    def _relative_link(target: str, report_path: str) -> str:
        """Return ``target`` as a forward-slash path relative to the report's folder."""
        report_dir = os.path.dirname(report_path) or "."
        return os.path.relpath(target, start=report_dir).replace(os.sep, "/")

    def load_and_process_dataset(self, dataset_name: str, subset: str = None):
        """Load a dataset from Hugging Face and return one split as a DataFrame.

        Args:
            dataset_name: Name passed to ``load_dataset``.
            subset: Optional configuration name passed as the second argument.

        Returns:
            The ``train`` split when present, otherwise the first available
            split, converted to a ``pd.DataFrame``. ``None`` if anything goes
            wrong while loading (the error is printed, not raised).
        """
        try:
            print(f"Attempting to load dataset: {dataset_name}")

            dataset = load_dataset(dataset_name, subset) if subset else load_dataset(dataset_name)

            # Prefer the training split; otherwise take whichever split comes first.
            if 'train' in dataset:
                split = 'train'
            else:
                split = next(iter(dataset.keys()), None)
                if split is None:
                    raise ValueError("No suitable data split found")

            return self._convert_to_pandas(dataset[split])

        except Exception as e:
            # Best-effort loader: callers check for None instead of catching.
            print(f"Error loading dataset: {e}")
            return None

    def _convert_to_pandas(self, dataset):
        """Convert Hugging Face dataset to pandas DataFrame"""
        return pd.DataFrame(dataset)

    def analyze_dataset(self, df: pd.DataFrame) -> Dict:
        """Compute summary statistics for ``df``.

        Returns:
            A dict with the keys

            * ``basic_stats``: ``rows``, ``columns``, ``dtypes``,
              ``numeric_summary`` (``df.describe()`` as a dict) and ``summary``
              (per-column non-null counts plus total memory usage in bytes);
            * ``class_distribution``: value counts per object/category column;
            * ``missing_values``: null count per column;
            * ``correlations``: correlation matrix as a nested dict, or ``None``
              when fewer than two numeric columns exist.
        """
        numeric_df = df.select_dtypes(include=[np.number])

        basic_stats = {
            'rows': len(df),
            'columns': len(df.columns),
            'dtypes': df.dtypes.to_dict(),
            # describe() raises ValueError on a frame without columns.
            'numeric_summary': df.describe().to_dict() if len(df.columns) > 0 else {},
            # DataFrame.info() only prints and returns None, so the same facts
            # are collected here in a structured form.
            'summary': {
                'non_null_counts': df.count().to_dict(),
                'memory_usage_bytes': int(df.memory_usage(deep=True).sum()),
            },
        }

        class_distribution = {}
        for col in df.select_dtypes(include=['object', 'category']).columns:
            try:
                class_distribution[col] = df[col].value_counts().to_dict()
            except TypeError:
                # Cells holding lists/dicts are unhashable and cannot be counted.
                print(f"Skipping class distribution for '{col}': values are not hashable")

        correlations = None
        if numeric_df.shape[1] > 1:
            correlations = numeric_df.corr().to_dict()

        return {
            'basic_stats': basic_stats,
            'class_distribution': class_distribution,
            'missing_values': df.isnull().sum().to_dict(),
            'correlations': correlations,
        }

    def generate_visualizations(self, df: pd.DataFrame, analysis_results: Dict) -> List[str]:
        """Save distribution plots and a correlation heatmap as PNG files.

        Args:
            df: Data to plot.
            analysis_results: Output of ``analyze_dataset``; only the
                ``correlations`` entry is read.

        Returns:
            Paths of the image files that were written (possibly empty).
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        viz_files = []

        # One histogram per numeric column, laid out on a two-column grid.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            grid_rows = (len(numeric_cols) + 1) // 2
            fig = plt.figure(figsize=(15, 5 * grid_rows))
            try:
                for i, col in enumerate(numeric_cols, 1):
                    ax = fig.add_subplot(grid_rows, 2, i)
                    try:
                        sns.histplot(df[col], kde=True, ax=ax)
                    except Exception:
                        # Fall back to a plain matplotlib histogram if seaborn
                        # raises; NaNs are dropped because hist() rejects them.
                        ax.hist(df[col].dropna())
                    ax.set_title(f'Distribution of {col}')
                fig.tight_layout()
                filename = f'analysis_workspace/visualizations/numeric_distributions_{timestamp}.png'
                fig.savefig(filename)
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)
            viz_files.append(filename)

        # Correlation heatmap (only when analyze_dataset produced a matrix).
        if analysis_results.get('correlations'):
            fig, ax = plt.subplots(figsize=(10, 8))
            try:
                correlation_df = pd.DataFrame(analysis_results['correlations'])
                sns.heatmap(correlation_df,
                            annot=True,
                            cmap='coolwarm',
                            center=0,
                            square=True,
                            ax=ax)
                ax.set_title('Correlation Heatmap')
                filename = f'analysis_workspace/visualizations/correlation_heatmap_{timestamp}.png'
                fig.savefig(filename)
            finally:
                plt.close(fig)
            viz_files.append(filename)

        return viz_files

    def generate_report(self, analysis_results: Dict, viz_files: List[str]) -> str:
        """Write a Markdown report and return its path.

        Args:
            analysis_results: Output of ``analyze_dataset``.
            viz_files: Image paths to embed; they are linked relative to the
                report file so the links resolve when the report is opened
                from its own directory.

        Returns:
            Path of the report, ``analysis_workspace/analysis_report_<timestamp>.md``.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_filename = f'analysis_workspace/analysis_report_{timestamp}.md'
        os.makedirs(os.path.dirname(report_filename), exist_ok=True)

        basic_stats = analysis_results['basic_stats']

        # Explicit UTF-8: category labels come from the dataset and may not be
        # representable in the platform's default encoding.
        with open(report_filename, 'w', encoding='utf-8') as f:
            f.write("# Data Analysis Report\n\n")

            # Dataset Overview
            f.write("## Dataset Overview\n")
            f.write(f"- Number of rows: {basic_stats['rows']}\n")
            f.write(f"- Number of columns: {basic_stats['columns']}\n\n")

            # Column Types
            f.write("## Column Types\n")
            for col, dtype in basic_stats['dtypes'].items():
                f.write(f"- {col}: {dtype}\n")
            f.write("\n")

            # Missing Values
            f.write("## Missing Values\n")
            for col, count in analysis_results['missing_values'].items():
                f.write(f"- {col}: {count}\n")
            f.write("\n")

            # Class Distribution
            if analysis_results['class_distribution']:
                f.write("## Class Distribution\n")
                for col, distribution in analysis_results['class_distribution'].items():
                    f.write(f"### {col}\n")
                    for category, count in distribution.items():
                        f.write(f"- {category}: {count}\n")
                    f.write("\n")

            # Visualizations
            f.write("## Visualizations\n")
            for viz_file in viz_files:
                link = self._relative_link(viz_file, report_filename)
                f.write(f"![{os.path.basename(viz_file)}]({link})\n\n")

        return report_filename

    def run_analysis(self, dataset_name: str, subset: str = None):
        """Run load -> analyze -> visualize -> report for one dataset.

        Returns:
            A human-readable status string. Failures are reported through the
            returned (and printed) message rather than raised.
        """
        try:
            df = self.load_and_process_dataset(dataset_name, subset)
            if df is None:
                return "Failed to load dataset"

            analysis_results = self.analyze_dataset(df)
            viz_files = self.generate_visualizations(df, analysis_results)
            report_file = self.generate_report(analysis_results, viz_files)

            message = f"Analysis complete. Report generated: {report_file}"
            print(message)
            return message

        except Exception as e:
            message = f"Error during analysis: {e}"
            print(message)
            return message

# Example usage: run the full single-dataset pipeline once.
# The names bound here (system, result) stay in the kernel namespace after the cell runs.
if __name__ == "__main__":
    # Initialize the system (the constructor creates the analysis_workspace directories)
    system = DataAnalysisSystem()
    
    # Example dataset: "emotion" from Hugging Face
    # run_analysis returns a status string instead of raising on failure.
    result = system.run_analysis("emotion")
    # run_analysis already prints its status line, so this shows the message a second time.
    print(result)

Attempting to load dataset: emotion


Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 9.05k/9.05k [00:00<00:00, 4.73MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1.03M/1.03M [00:00<00:00, 1.04MB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 127k/127k [00:00<00:00, 192kB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 129k/129k [00:00<00:00, 194kB/s]
Generating train split: 100%|█████████████████████████████████████████████████████████████████████████████| 16000/16000 [00:00<00:00, 144164.81 examples/s]
Generating validation split: 100%|██████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 285200.69 examples/s]
Generating test split: 100%|████████████████████████████████████

Analysis complete. Report generated: analysis_workspace/analysis_report_20241216_222655.md
Analysis complete. Report generated: analysis_workspace/analysis_report_20241216_222655.md


In [7]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Union
import numpy as np
from datetime import datetime
import os

class MultiDatasetAnalysisSystem:
    """Exploratory analysis pipeline for one or more Hugging Face datasets.

    Every dataset gets its own plots and Markdown report inside
    ``multi_dataset_analysis_workspace``; a comparative report summarising all
    successfully analysed datasets is written alongside them.
    """

    def __init__(self, datasets: Union[str, List[str]]):
        """
        Initialize the system with one or multiple datasets

        Args:
            datasets (str or List[str]): Single dataset name or list of dataset names
        """
        # Normalise to a list; copying keeps later changes to the caller's list
        # from affecting this instance.
        self.datasets = [datasets] if isinstance(datasets, str) else list(datasets)

        # Creating the nested directory also creates the workspace itself.
        self.workspace_dir = "multi_dataset_analysis_workspace"
        os.makedirs(f"{self.workspace_dir}/visualizations", exist_ok=True)

    @staticmethod
    def _relative_link(target: str, report_path: str) -> str:
        """Return ``target`` as a forward-slash path relative to the report's folder."""
        report_dir = os.path.dirname(report_path) or "."
        return os.path.relpath(target, start=report_dir).replace(os.sep, "/")

    def load_and_process_dataset(self, dataset_name: str, subset: str = None):
        """Load a dataset from Hugging Face and return one split as a DataFrame.

        Args:
            dataset_name: Name passed to ``load_dataset``.
            subset: Optional configuration name passed as the second argument.

        Returns:
            The ``train`` split when present, otherwise the first available
            split, converted to a ``pd.DataFrame``. ``None`` if anything goes
            wrong while loading (the error is printed, not raised).
        """
        try:
            print(f"Attempting to load dataset: {dataset_name}")

            dataset = load_dataset(dataset_name, subset) if subset else load_dataset(dataset_name)

            # Prefer the training split; otherwise take whichever split comes first.
            if 'train' in dataset:
                split = 'train'
            else:
                split = next(iter(dataset.keys()), None)
                if split is None:
                    raise ValueError("No suitable data split found")

            return self._convert_to_pandas(dataset[split])

        except Exception as e:
            # Best-effort loader: callers check for None instead of catching.
            print(f"Error loading dataset {dataset_name}: {e}")
            return None

    def _convert_to_pandas(self, dataset):
        """Convert Hugging Face dataset to pandas DataFrame"""
        return pd.DataFrame(dataset)

    def analyze_multiple_datasets(self):
        """
        Analyze multiple datasets and generate comprehensive reports

        Datasets that fail to load are skipped. After the loop a comparative
        report covering the analysed datasets is written.

        Returns:
            Dict: Maps each analysed dataset name to a dict with the keys
            ``analysis_results``, ``report_file`` and ``visualizations``.
        """
        overall_results = {}

        for dataset_name in self.datasets:
            print(f"\n--- Analyzing Dataset: {dataset_name} ---")

            df = self.load_and_process_dataset(dataset_name)
            if df is None:
                print(f"Skipping {dataset_name} due to loading error")
                continue

            analysis_results = self.analyze_dataset(df)
            viz_files = self.generate_visualizations(df, analysis_results, dataset_name)
            report_file = self.generate_report(analysis_results, viz_files, dataset_name)

            overall_results[dataset_name] = {
                'analysis_results': analysis_results,
                'report_file': report_file,
                'visualizations': viz_files
            }

        self.generate_comparative_report(overall_results)

        return overall_results

    def analyze_dataset(self, df: pd.DataFrame) -> Dict:
        """Compute summary statistics for ``df``.

        Returns:
            A dict with the keys

            * ``basic_stats``: ``rows``, ``columns``, ``dtypes`` and
              ``numeric_summary`` (``df.describe()`` as a dict);
            * ``class_distribution``: value counts per object/category column;
            * ``missing_values``: null count per column;
            * ``correlations``: correlation matrix as a nested dict, or ``None``
              when fewer than two numeric columns exist.
        """
        numeric_df = df.select_dtypes(include=[np.number])

        basic_stats = {
            'rows': len(df),
            'columns': len(df.columns),
            'dtypes': df.dtypes.to_dict(),
            # describe() raises ValueError on a frame without columns.
            'numeric_summary': df.describe().to_dict() if len(df.columns) > 0 else {},
        }

        class_distribution = {}
        for col in df.select_dtypes(include=['object', 'category']).columns:
            try:
                class_distribution[col] = df[col].value_counts().to_dict()
            except TypeError:
                # Cells holding lists/dicts are unhashable and cannot be counted.
                print(f"Skipping class distribution for '{col}': values are not hashable")

        correlations = None
        if numeric_df.shape[1] > 1:
            correlations = numeric_df.corr().to_dict()

        return {
            'basic_stats': basic_stats,
            'class_distribution': class_distribution,
            'missing_values': df.isnull().sum().to_dict(),
            'correlations': correlations,
        }

    def generate_visualizations(self, df: pd.DataFrame, analysis_results: Dict, dataset_name: str) -> List[str]:
        """Save distribution plots and a correlation heatmap for one dataset.

        Args:
            df: Data to plot.
            analysis_results: Output of ``analyze_dataset``; only the
                ``correlations`` entry is read.
            dataset_name: Used as the sub-directory name under
                ``<workspace>/visualizations`` and in the heatmap title.

        Returns:
            Paths of the image files that were written (possibly empty).
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        viz_files = []

        # Visualization directory for this dataset
        dataset_viz_dir = f"{self.workspace_dir}/visualizations/{dataset_name}"
        os.makedirs(dataset_viz_dir, exist_ok=True)

        # One histogram per numeric column, laid out on a two-column grid.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            grid_rows = (len(numeric_cols) + 1) // 2
            fig = plt.figure(figsize=(15, 5 * grid_rows))
            try:
                for i, col in enumerate(numeric_cols, 1):
                    ax = fig.add_subplot(grid_rows, 2, i)
                    try:
                        sns.histplot(df[col], kde=True, ax=ax)
                    except Exception:
                        # Fall back to a plain matplotlib histogram if seaborn
                        # raises; NaNs are dropped because hist() rejects them.
                        ax.hist(df[col].dropna())
                    ax.set_title(f'Distribution of {col}')
                fig.tight_layout()
                filename = f'{dataset_viz_dir}/numeric_distributions_{timestamp}.png'
                fig.savefig(filename)
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)
            viz_files.append(filename)

        # Correlation heatmap (only when analyze_dataset produced a matrix).
        if analysis_results.get('correlations'):
            fig, ax = plt.subplots(figsize=(10, 8))
            try:
                correlation_df = pd.DataFrame(analysis_results['correlations'])
                sns.heatmap(correlation_df,
                            annot=True,
                            cmap='coolwarm',
                            center=0,
                            square=True,
                            ax=ax)
                ax.set_title(f'Correlation Heatmap - {dataset_name}')
                filename = f'{dataset_viz_dir}/correlation_heatmap_{timestamp}.png'
                fig.savefig(filename)
            finally:
                plt.close(fig)
            viz_files.append(filename)

        return viz_files

    def generate_report(self, analysis_results: Dict, viz_files: List[str], dataset_name: str) -> str:
        """Write the Markdown report for a single dataset and return its path.

        Args:
            analysis_results: Output of ``analyze_dataset``.
            viz_files: Image paths to embed; they are linked relative to the
                report file so the links resolve when the report is opened
                from its own directory.
            dataset_name: Used in the title and as the file-name prefix. A
                name containing ``/`` places the report in a sub-directory,
                which is created on demand.

        Returns:
            Path of the written report.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_filename = f'{self.workspace_dir}/{dataset_name}_analysis_report_{timestamp}.md'

        # Namespaced ids such as "org/name" put the report in a nested folder
        # that does not exist yet; without this open() raises FileNotFoundError.
        os.makedirs(os.path.dirname(report_filename) or ".", exist_ok=True)

        basic_stats = analysis_results['basic_stats']

        with open(report_filename, 'w', encoding='utf-8') as f:
            f.write(f"# Data Analysis Report - {dataset_name}\n\n")

            # Dataset Overview
            f.write("## Dataset Overview\n")
            f.write(f"- Number of rows: {basic_stats['rows']}\n")
            f.write(f"- Number of columns: {basic_stats['columns']}\n\n")

            # Column Types
            f.write("## Column Types\n")
            for col, dtype in basic_stats['dtypes'].items():
                f.write(f"- {col}: {dtype}\n")
            f.write("\n")

            # Missing Values
            f.write("## Missing Values\n")
            for col, count in analysis_results['missing_values'].items():
                f.write(f"- {col}: {count}\n")
            f.write("\n")

            # Visualizations
            f.write("## Visualizations\n")
            for viz_file in viz_files:
                link = self._relative_link(viz_file, report_filename)
                f.write(f"![{os.path.basename(viz_file)}]({link})\n\n")

        return report_filename

    def generate_comparative_report(self, overall_results: Dict) -> str:
        """Write a report comparing all analysed datasets and return its path.

        Args:
            overall_results: Mapping of dataset name to a dict holding
                ``analysis_results`` and ``visualizations``, as built by
                ``analyze_multiple_datasets``.

        Returns:
            Path of ``comparative_analysis_report.md`` inside the workspace.
        """
        comparative_report_path = f'{self.workspace_dir}/comparative_analysis_report.md'

        with open(comparative_report_path, 'w', encoding='utf-8') as f:
            f.write("# Comparative Dataset Analysis Report\n\n")

            # Comparative Statistics
            f.write("## Comparative Dataset Statistics\n")
            for dataset_name, results in overall_results.items():
                analysis = results['analysis_results']
                f.write(f"### {dataset_name}\n")
                f.write(f"- Rows: {analysis['basic_stats']['rows']}\n")
                f.write(f"- Columns: {analysis['basic_stats']['columns']}\n")
                f.write(f"- Missing Values: {sum(analysis['missing_values'].values())}\n\n")

            # Visualization Comparison Section
            f.write("## Visualizations Comparison\n")
            for dataset_name, results in overall_results.items():
                f.write(f"### {dataset_name} Visualizations\n")
                for viz_file in results['visualizations']:
                    link = self._relative_link(viz_file, comparative_report_path)
                    f.write(f"![{os.path.basename(viz_file)}]({link})\n\n")

        return comparative_report_path


In [10]:
# Example usage: analyse several datasets in one pass and list the generated artefacts.
# The names bound here (datasets_to_analyze, multi_analysis_system, results, and the
# loop variables) stay in the kernel namespace after the cell runs.
if __name__ == "__main__":
    # List of datasets to analyze
    datasets_to_analyze = [
        "emotion",      # Text emotion dataset
        "imdb",         # Movie review sentiment dataset
    ]

    # Initialize multi-dataset analysis system
    # (the constructor creates the multi_dataset_analysis_workspace directories)
    multi_analysis_system = MultiDatasetAnalysisSystem(datasets_to_analyze)
    
    # Run analysis on multiple datasets
    # Datasets that fail to load are skipped, so results may hold fewer entries than the list above.
    results = multi_analysis_system.analyze_multiple_datasets()
    
    # Print summary of analysis
    # Each value is a dict with 'analysis_results', 'report_file' and 'visualizations'.
    for dataset, details in results.items():
        print(f"\nDataset: {dataset}")
        print(f"Report File: {details['report_file']}")
        print(f"Visualizations: {details['visualizations']}")


--- Analyzing Dataset: emotion ---
Attempting to load dataset: emotion


  with pd.option_context('mode.use_inf_as_na', True):



--- Analyzing Dataset: imdb ---
Attempting to load dataset: imdb


  with pd.option_context('mode.use_inf_as_na', True):



Dataset: emotion
Report File: multi_dataset_analysis_workspace/emotion_analysis_report_20241217_185445.md
Visualizations: ['multi_dataset_analysis_workspace/visualizations/emotion/numeric_distributions_20241217_185445.png']

Dataset: imdb
Report File: multi_dataset_analysis_workspace/imdb_analysis_report_20241217_185458.md
Visualizations: ['multi_dataset_analysis_workspace/visualizations/imdb/numeric_distributions_20241217_185457.png']
