In [30]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline


class ComprehensiveDatasetAnalyzer:
    def __init__(self, datasets, model_name="distilbert-base-uncased-distilled-squad"):
        
        self.datasets = datasets
        self.workspace_dir = "comprehensive_dataset_analysis_workspace"
        os.makedirs(self.workspace_dir, exist_ok=True)
        os.makedirs(f"{self.workspace_dir}/reports", exist_ok=True)
        os.makedirs(f"{self.workspace_dir}/visualizations", exist_ok=True)

        # Load QA model for text insights
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        self.qa_pipeline = pipeline("question-answering", model=self.model, tokenizer=self.tokenizer)

    def perform_comprehensive_analysis(self):

        comprehensive_results = {}

        for dataset_name in self.datasets:
            try:
                print(f"\nAnalyzing dataset: {dataset_name}")
                dataset = load_dataset(dataset_name)

                # split by keys
                split = list(dataset.keys())[0]
                data = dataset[split]

                # Converting HuggingFace dataset to a Pandas DataFrame
                try:
                    df = pd.DataFrame(data)
                except Exception as e:
                    df = pd.DataFrame(data.to_pandas())
                
                # separate key value by using applymap
                df = df.applymap(lambda x: str(x) if isinstance(x, dict) else x)

                # Perform analysis
                analysis_results = {
                    "exploratory_analysis": self._perform_exploratory_data_analysis(df),
                    "preprocessing_needs": self._analyze_preprocessing_needs(df),
                    "text_insights": self._generate_text_insights(df),
                }

                # Generate visualizations
                analysis_results["visualizations"] = self._generate_visualizations(df, dataset_name)

                # Generate reports
                self._generate_comprehensive_report(analysis_results, dataset_name)

                comprehensive_results[dataset_name] = analysis_results

            except Exception as e:
                print(f"Error analyzing {dataset_name}: {e}")
                import traceback
                traceback.print_exc()

        # Generate a comparative report
        self._generate_comparative_report(comprehensive_results)

        return comprehensive_results

    def _perform_exploratory_data_analysis(self, df):
        eda_results = {
            "basic_info": {
                "total_rows": len(df),
                "total_columns": len(df.columns),
                "column_types": dict(df.dtypes),
            },
            "summary_statistics": {},
            "column_details": {},
        }

        # Numeric column analysis
        numeric_cols = df.select_dtypes(include=["number"]).columns
        if len(numeric_cols) > 0:
            eda_results["summary_statistics"] = df[numeric_cols].describe().to_dict()

        #  column analysis for prepreocessing
        for column in df.columns:
            col_details = {
                "unique_values": df[column].nunique(),
                "null_count": df[column].isnull().sum(),
                "null_percentage": (df[column].isnull().sum() / len(df)) * 100,
            }
            eda_results["column_details"][column] = col_details

        return eda_results

    def _analyze_preprocessing_needs(self, df):
        """Analyze preprocessing requirements."""
        return {
            "missing_values": self._detect_missing_values(df),
            "duplicate_rows": self._detect_duplicate_rows(df),
        }

    def _detect_missing_values(self, df):
        """Detect and analyze missing values."""
        return df.isnull().sum().to_dict()

    def _detect_duplicate_rows(self, df):
        """Detect duplicate rows."""
        return {"total_duplicates": df.duplicated().sum()}

    def _generate_text_insights(self, df):
        """Generate text insights using a QA model."""
        text_insights = {}
        text_cols = df.select_dtypes(include=["object"]).columns

        for col in text_cols:
            try:
                text_sample = df[col].dropna().sample(n=1, random_state=1).iloc[0]
                if len(str(text_sample)) > 20:  # Ensure valid context
                    insights = self.qa_pipeline(
                        {"question": "What is the main idea?", "context": text_sample}
                    )
                    text_insights[col] = insights["answer"]
            except Exception:
                continue

        return text_insights

    def _generate_visualizations(self, df, dataset_name):
        """Generate visualizations for numeric columns."""
        viz_files = []
        viz_dir = f"{self.workspace_dir}/visualizations/{dataset_name}"
        os.makedirs(viz_dir, exist_ok=True)

        numeric_cols = df.select_dtypes(include=["number"]).columns
        for col in numeric_cols:
            plt.figure(figsize=(10, 6))
            sns.histplot(df[col], kde=True)
            plt.title(f"Distribution of {col}")
            plot_path = f"{viz_dir}/{col}_distribution.png"
            plt.savefig(plot_path)
            plt.close()
            viz_files.append(plot_path)

        return viz_files

    def _generate_comprehensive_report(self, analysis_results, dataset_name):
        """Generate a comprehensive report for a single dataset."""
        report_file = f"{self.workspace_dir}/reports/{dataset_name}_report.txt"
        with open(report_file, "w") as f:
            for key, value in analysis_results.items():
                f.write(f"{key.upper()}:\n")
                f.write(f"{value}\n\n")

    def _generate_comparative_report(self, comprehensive_results):
        """Generate a comparative report across datasets."""
        comparative_file = f"{self.workspace_dir}/reports/comparative_report.txt"
        with open(comparative_file, "w") as f:
            for dataset_name, results in comprehensive_results.items():
                f.write(f"Dataset: {dataset_name}\n")
                f.write(f"{results}\n\n")


# Main execution
def main():
    datasets_to_analyze = ["imdb", "emotion",'/phihung/titanic']

    analyzer = ComprehensiveDatasetAnalyzer(datasets_to_analyze)
    results = analyzer.perform_comprehensive_analysis()
    print("Analysis complete!")


if __name__ == "__main__":
    main()



Analyzing dataset: imdb


  df = df.applymap(lambda x: str(x) if isinstance(x, dict) else x)
  with pd.option_context('mode.use_inf_as_na', True):



Analyzing dataset: emotion


  df = df.applymap(lambda x: str(x) if isinstance(x, dict) else x)
  with pd.option_context('mode.use_inf_as_na', True):


Analysis complete!


In [29]:
import os
import traceback
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from transformers import pipeline  # For NLP insights
from fpdf import FPDF  # For generating PDF reports
from PIL import Image
import cv2

class ComprehensiveDatasetAnalyzer:
    def __init__(self, datasets):
        self.datasets = datasets

    def analyze_nlp_data(self, dataset_name, data):
        """Analyze text-based NLP datasets."""
        print(f"Analyzing NLP dataset: {dataset_name}")
        results = {}

        # Basic text statistics
        data['text_length'] = data['text'].apply(len)
        data['word_count'] = data['text'].apply(lambda x: len(x.split()))

        results['text_length_stats'] = data['text_length'].describe()
        results['word_count_stats'] = data['word_count'].describe()

        # NLP insights using a small LLM
        sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased")
        data['sentiment'] = data['text'].apply(lambda x: sentiment_analyzer(x)[0]['label'])
        results['sentiment_distribution'] = data['sentiment'].value_counts()

        self.generate_visualizations("nlp", dataset_name, data, results)
        self.export_pdf_report(dataset_name, results, data_type="nlp")
        return results

    def analyze_image_data(self, dataset_name, image_paths):
        """Analyze image datasets."""
        print(f"Analyzing Image dataset: {dataset_name}")
        results = {}

        brightness_values = []
        pixel_values = []

        for img_path in image_paths:
            try:
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Grayscale image
                pixel_values.append(img.flatten())
                brightness_values.append(np.mean(img))
            except Exception as e:
                print(f"Error processing image {img_path}: {e}")

        results['brightness_stats'] = pd.Series(brightness_values).describe()

        # Visualize pixel intensity distribution
        self.generate_visualizations("image", dataset_name, image_paths, results)
        self.export_pdf_report(dataset_name, results, data_type="image")
        return results

    def analyze_structured_data(self, dataset_name, data):
        """Analyze structured tabular data."""
        print(f"Analyzing Structured dataset: {dataset_name}")
        results = {}

        results['info'] = str(data.info())
        results['description'] = data.describe().to_string()
        results['null_values'] = data.isnull().sum()
        results['duplicates'] = data[data.duplicated()].index.tolist()
        results['outliers'] = {col: self.detect_outliers(data[col]) for col in data.select_dtypes(include=np.number)}

        self.generate_visualizations("structured", dataset_name, data, results)
        self.export_pdf_report(dataset_name, results, data_type="structured")
        return results

    def detect_outliers(self, series):
        """Detect outliers in a numerical series using the IQR method."""
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        outliers = series[(series < (Q1 - 1.5 * IQR)) | (series > (Q3 + 1.5 * IQR))]
        return outliers.index.tolist()

    def generate_visualizations(self, data_type, dataset_name, data, results):
        """Generate and save visualizations specific to the data type."""
        output_dir = f"visualizations/{dataset_name}"
        os.makedirs(output_dir, exist_ok=True)

        if data_type == "nlp":
            plt.figure()
            sns.histplot(data['text_length'], kde=True)
            plt.title("Text Length Distribution")
            plt.savefig(f"{output_dir}/text_length_distribution.png")

        elif data_type == "image":
            plt.figure()
            sns.histplot(results['brightness_stats'], kde=True)
            plt.title("Image Brightness Distribution")
            plt.savefig(f"{output_dir}/brightness_distribution.png")

        elif data_type == "structured":
            plt.figure()
            sns.heatmap(data.corr(), annot=True, cmap="coolwarm")
            plt.title("Correlation Matrix")
            plt.savefig(f"{output_dir}/correlation_matrix.png")

    def export_pdf_report(self, dataset_name, analysis_results, data_type="general"):
        """Export a consolidated analysis report as a PDF."""
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)

        pdf.cell(200, 10, txt=f"Analysis Report for {dataset_name}", ln=True, align="C")
        pdf.ln(10)

        for key, value in analysis_results.items():
            pdf.set_font("Arial", style="B", size=12)
            pdf.cell(200, 10, txt=str(key), ln=True)
            pdf.set_font("Arial", size=10)
            pdf.multi_cell(0, 10, txt=str(value))
            pdf.ln(5)

        output_path = f"reports/{dataset_name}_analysis_report.pdf"
        os.makedirs("reports", exist_ok=True)
        pdf.output(output_path)
        print(f"PDF report saved at {output_path}")

    def perform_comprehensive_analysis(self):
        """Analyze all datasets and direct them to their relevant type function."""
        comprehensive_results = {}

        for dataset_name in self.datasets:
            print(f"Analyzing dataset: {dataset_name}")

            try:
                # Identify dataset type and analyze accordingly
                if dataset_name == "imdb":  # NLP Example
                    data = pd.DataFrame({"text": ["This is a sample review.", "Another review."]})
                    comprehensive_results[dataset_name] = self.analyze_nlp_data(dataset_name, data)

                elif dataset_name == "image-dataset":  # Image Example
                    image_paths = ["path_to_image1.jpg", "path_to_image2.jpg"]
                    comprehensive_results[dataset_name] = self.analyze_image_data(dataset_name, image_paths)

                elif dataset_name == "titanic":  # Structured Example
                    data = sns.load_dataset("titanic")
                    comprehensive_results[dataset_name] = self.analyze_structured_data(dataset_name, data)

                else:
                    print(f"Unknown dataset type for {dataset_name}, skipping.")

            except Exception:
                print(f"Error analyzing {dataset_name}")
                traceback.print_exc()

        return comprehensive_results

if __name__ == "__main__":
    datasets_to_analyze = ["imdb", "titanic", "image-dataset"]
    analyzer = ComprehensiveDatasetAnalyzer(datasets_to_analyze)
    results = analyzer.perform_comprehensive_analysis()
    print("Analysis complete!")


ModuleNotFoundError: No module named 'fpdf'