In [30]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline


class ComprehensiveDatasetAnalyzer:
    def __init__(self, datasets, model_name="distilbert-base-uncased-distilled-squad"):
        
        self.datasets = datasets
        self.workspace_dir = "comprehensive_dataset_analysis_workspace"
        os.makedirs(self.workspace_dir, exist_ok=True)
        os.makedirs(f"{self.workspace_dir}/reports", exist_ok=True)
        os.makedirs(f"{self.workspace_dir}/visualizations", exist_ok=True)

        # Load QA model for text insights
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        self.qa_pipeline = pipeline("question-answering", model=self.model, tokenizer=self.tokenizer)

    def perform_comprehensive_analysis(self):

        comprehensive_results = {}

        for dataset_name in self.datasets:
            try:
                print(f"\nAnalyzing dataset: {dataset_name}")
                dataset = load_dataset(dataset_name)

                # split by keys
                split = list(dataset.keys())[0]
                data = dataset[split]

                # Converting HuggingFace dataset to a Pandas DataFrame
                try:
                    df = pd.DataFrame(data)
                except Exception as e:
                    df = pd.DataFrame(data.to_pandas())
                
                # separate key value by using applymap
                df = df.applymap(lambda x: str(x) if isinstance(x, dict) else x)

                # Perform analysis
                analysis_results = {
                    "exploratory_analysis": self._perform_exploratory_data_analysis(df),
                    "preprocessing_needs": self._analyze_preprocessing_needs(df),
                    "text_insights": self._generate_text_insights(df),
                }

                # Generate visualizations
                analysis_results["visualizations"] = self._generate_visualizations(df, dataset_name)

                # Generate reports
                self._generate_comprehensive_report(analysis_results, dataset_name)

                comprehensive_results[dataset_name] = analysis_results

            except Exception as e:
                print(f"Error analyzing {dataset_name}: {e}")
                import traceback
                traceback.print_exc()

        # Generate a comparative report
        self._generate_comparative_report(comprehensive_results)

        return comprehensive_results

    def _perform_exploratory_data_analysis(self, df):
        eda_results = {
            "basic_info": {
                "total_rows": len(df),
                "total_columns": len(df.columns),
                "column_types": dict(df.dtypes),
            },
            "summary_statistics": {},
            "column_details": {},
        }

        # Numeric column analysis
        numeric_cols = df.select_dtypes(include=["number"]).columns
        if len(numeric_cols) > 0:
            eda_results["summary_statistics"] = df[numeric_cols].describe().to_dict()

        #  column analysis for prepreocessing
        for column in df.columns:
            col_details = {
                "unique_values": df[column].nunique(),
                "null_count": df[column].isnull().sum(),
                "null_percentage": (df[column].isnull().sum() / len(df)) * 100,
            }
            eda_results["column_details"][column] = col_details

        return eda_results

    def _analyze_preprocessing_needs(self, df):
        """Analyze preprocessing requirements."""
        return {
            "missing_values": self._detect_missing_values(df),
            "duplicate_rows": self._detect_duplicate_rows(df),
        }

    def _detect_missing_values(self, df):
        """Detect and analyze missing values."""
        return df.isnull().sum().to_dict()

    def _detect_duplicate_rows(self, df):
        """Detect duplicate rows."""
        return {"total_duplicates": df.duplicated().sum()}

    def _generate_text_insights(self, df):
        """Generate text insights using a QA model."""
        text_insights = {}
        text_cols = df.select_dtypes(include=["object"]).columns

        for col in text_cols:
            try:
                text_sample = df[col].dropna().sample(n=1, random_state=1).iloc[0]
                if len(str(text_sample)) > 20:  # Ensure valid context
                    insights = self.qa_pipeline(
                        {"question": "What is the main idea?", "context": text_sample}
                    )
                    text_insights[col] = insights["answer"]
            except Exception:
                continue

        return text_insights

    def _generate_visualizations(self, df, dataset_name):
        """Generate visualizations for numeric columns."""
        viz_files = []
        viz_dir = f"{self.workspace_dir}/visualizations/{dataset_name}"
        os.makedirs(viz_dir, exist_ok=True)

        numeric_cols = df.select_dtypes(include=["number"]).columns
        for col in numeric_cols:
            plt.figure(figsize=(10, 6))
            sns.histplot(df[col], kde=True)
            plt.title(f"Distribution of {col}")
            plot_path = f"{viz_dir}/{col}_distribution.png"
            plt.savefig(plot_path)
            plt.close()
            viz_files.append(plot_path)

        return viz_files

    def _generate_comprehensive_report(self, analysis_results, dataset_name):
        """Generate a comprehensive report for a single dataset."""
        report_file = f"{self.workspace_dir}/reports/{dataset_name}_report.txt"
        with open(report_file, "w") as f:
            for key, value in analysis_results.items():
                f.write(f"{key.upper()}:\n")
                f.write(f"{value}\n\n")

    def _generate_comparative_report(self, comprehensive_results):
        """Generate a comparative report across datasets."""
        comparative_file = f"{self.workspace_dir}/reports/comparative_report.txt"
        with open(comparative_file, "w") as f:
            for dataset_name, results in comprehensive_results.items():
                f.write(f"Dataset: {dataset_name}\n")
                f.write(f"{results}\n\n")


# Main execution
def main():
    datasets_to_analyze = ["imdb", "emotion",'/phihung/titanic']

    analyzer = ComprehensiveDatasetAnalyzer(datasets_to_analyze)
    results = analyzer.perform_comprehensive_analysis()
    print("Analysis complete!")


if __name__ == "__main__":
    main()



Analyzing dataset: imdb


  df = df.applymap(lambda x: str(x) if isinstance(x, dict) else x)
  with pd.option_context('mode.use_inf_as_na', True):



Analyzing dataset: emotion


  df = df.applymap(lambda x: str(x) if isinstance(x, dict) else x)
  with pd.option_context('mode.use_inf_as_na', True):


Analysis complete!
