In [26]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [27]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

class SentimentDatasetQualityAssessment:
    """Load a sentiment dataset and compute data-quality statistics for it.

    The constructor loads the dataset through ``load_dataset`` and exposes
    its train and test splits as pandas DataFrames in ``self.train_df`` and
    ``self.test_df``. The analysis methods only read those two frames and
    ``self.dataset_name``.
    """

    # Candidate column names, checked in this order against the train frame.
    _TEXT_COLUMN_CANDIDATES = ('text', 'Text', 'content', 'review')
    _LABEL_COLUMN_CANDIDATES = ('label', 'Label', 'sentiment', 'Sentiment')

    # Anything that is not a lowercase letter, digit or whitespace. Texts are
    # lowercased before matching, so uppercase letters never count as special.
    _SPECIAL_CHAR_PATTERN = re.compile(r'[^a-z0-9\s]')

    def __init__(self, dataset_name):
        """Load ``dataset_name`` and build the train/test DataFrames.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.

        Raises:
            Exception: Any error raised while loading or converting the
                dataset is printed and then re-raised unchanged.
        """
        self.dataset_name = dataset_name
        try:
            if dataset_name == 'imdb':
                # Requesting a list of splits returns a list of datasets in
                # the same order, hence the positional indexing below.
                self.dataset = load_dataset(dataset_name, split=['train', 'test', 'unsupervised'])
                self.train_df = pd.DataFrame(self.dataset[0])
                self.test_df = pd.DataFrame(self.dataset[1])
            else:
                self.dataset = load_dataset(dataset_name)
                self.train_df = self.dataset['train'].to_pandas()
                self.test_df = self.dataset['test'].to_pandas()

                if dataset_name == 'amazon_polarity':
                    # Rename by name rather than by position so a different
                    # column order or an extra column cannot mislabel data.
                    self.train_df = self._rename_content_to_text(self.train_df)
                    self.test_df = self._rename_content_to_text(self.test_df)
        except Exception as e:
            print(f"Error loading dataset {dataset_name}: {e}")
            raise

    @staticmethod
    def _rename_content_to_text(frame):
        """Return ``frame`` with its ``content`` column renamed to ``text``."""
        if 'content' in frame.columns and 'text' not in frame.columns:
            return frame.rename(columns={'content': 'text'})
        return frame

    def _detect_text_column(self):
        """Return the train-frame column that holds the review text.

        Falls back to the second column (or the only column) when none of the
        known names is present.
        """
        columns = self.train_df.columns
        fallback = columns[1] if len(columns) > 1 else columns[0]
        return next(
            (name for name in self._TEXT_COLUMN_CANDIDATES if name in columns),
            fallback,
        )

    def _detect_label_column(self):
        """Return the train-frame column that holds the sentiment label.

        Falls back to the first column when none of the known names is present.
        """
        columns = self.train_df.columns
        return next(
            (name for name in self._LABEL_COLUMN_CANDIDATES if name in columns),
            columns[0],
        )

    @staticmethod
    def _text_characteristics(frame, text_column):
        """Summarise character and word counts of ``frame[text_column]``."""
        char_lengths = frame[text_column].str.len()
        word_counts = frame[text_column].str.split().str.len()
        return {
            'Avg Text Length': char_lengths.mean(),
            'Median Text Length': char_lengths.median(),
            'Max Text Length': char_lengths.max(),
            'Min Text Length': char_lengths.min(),
            'Avg Word Count': word_counts.mean(),
        }

    def generate_comprehensive_analysis(self):
        """Generate a comprehensive analysis report.

        Returns:
            dict: Sections ``'Dataset Overview'``, ``'Label Distribution'``,
            ``'Text Characteristics'``, ``'Duplicate Analysis'``,
            ``'Missing Values'`` and ``'Advanced Text Analysis'``, each
            holding statistics for the train and test frames.
        """
        text_column = self._detect_text_column()
        label_column = self._detect_label_column()

        # Row-level duplicate masks, computed once per split and reused for
        # both the count and the percentage.
        train_duplicates = self.train_df.duplicated()
        test_duplicates = self.test_df.duplicated()

        analysis = {
            'Dataset Overview': {
                'Total Train Samples': len(self.train_df),
                'Total Test Samples': len(self.test_df),
                'Columns': list(self.train_df.columns)
            },

            'Label Distribution': {
                'Train': self.train_df[label_column].value_counts(normalize=True).to_dict(),
                'Test': self.test_df[label_column].value_counts(normalize=True).to_dict()
            },

            'Text Characteristics': {
                'Train': self._text_characteristics(self.train_df, text_column),
                'Test': self._text_characteristics(self.test_df, text_column)
            },

            'Duplicate Analysis': {
                'Train Duplicates': train_duplicates.sum(),
                'Test Duplicates': test_duplicates.sum(),
                'Train Duplicate Percentage': train_duplicates.mean() * 100,
                'Test Duplicate Percentage': test_duplicates.mean() * 100
            },

            'Missing Values': {
                'Train': self.train_df.isnull().sum().to_dict(),
                'Test': self.test_df.isnull().sum().to_dict()
            },

            'Advanced Text Analysis': self._advanced_text_analysis(text_column)
        }

        return analysis

    @classmethod
    def _text_complexity_metrics(cls, texts):
        """Compute special-character and unique-word statistics for ``texts``.

        Values are converted to ``str`` and lowercased first; words are
        whitespace-separated tokens.
        """
        texts = texts.astype(str).str.lower()
        find_special = cls._SPECIAL_CHAR_PATTERN.findall

        # Share of special characters per text; empty texts count as 0 to
        # avoid dividing by zero.
        special_char_ratio = texts.apply(
            lambda x: len(find_special(x)) / len(x) if len(x) > 0 else 0
        )
        unique_words = texts.apply(lambda x: len(set(x.split())))

        return {
            'Avg Special Character Ratio': special_char_ratio.mean(),
            'Avg Unique Words': unique_words.mean(),
            'Max Unique Words': unique_words.max(),
            'Min Unique Words': unique_words.min()
        }

    def _advanced_text_analysis(self, text_column):
        """Perform advanced text analysis on the train and test frames.

        Args:
            text_column: Name of the column holding the text in both frames.

        Returns:
            dict: ``'Train'`` and ``'Test'`` entries, each a dict of
            special-character and unique-word statistics.
        """
        return {
            'Train': self._text_complexity_metrics(self.train_df[text_column]),
            'Test': self._text_complexity_metrics(self.test_df[text_column])
        }

    def visualize_analysis(self, analysis):
        """Create and save visualizations for the analysis.

        Writes ``<dataset_name>_label_distribution.png`` and
        ``<dataset_name>_text_length_distribution.png`` to the current
        working directory.

        Args:
            analysis: Report returned by ``generate_comprehensive_analysis``;
                only its ``'Label Distribution'`` section is read.
        """
        splits = (('Train', self.train_df), ('Test', self.test_df))

        # Label Distribution
        plt.figure(figsize=(12, 5))
        for position, (split_name, _) in enumerate(splits, start=1):
            plt.subplot(1, 2, position)
            labels = pd.Series(analysis['Label Distribution'][split_name])
            labels.plot(kind='pie', autopct='%1.1f%%')
            plt.title(f'{split_name} Label Distribution')
        plt.tight_layout()
        plt.savefig(f'{self.dataset_name}_label_distribution.png')
        plt.close()

        # Text Length Distribution. Use the same column detection as the
        # report so datasets without a literal 'text' column still plot.
        text_column = self._detect_text_column()
        plt.figure(figsize=(12, 5))
        for position, (split_name, frame) in enumerate(splits, start=1):
            plt.subplot(1, 2, position)
            # Missing texts yield NaN lengths, which a histogram cannot bin.
            plt.hist(frame[text_column].str.len().dropna(), bins=50)
            plt.title(f'{split_name} Text Length Distribution')
            plt.xlabel('Text Length')
            plt.ylabel('Frequency')
        plt.tight_layout()
        plt.savefig(f'{self.dataset_name}_text_length_distribution.png')
        plt.close()

def main():
    """Assess each configured sentiment dataset and print its quality report.

    A failure on one dataset is reported on stdout and does not stop the
    remaining datasets from being processed.
    """
    dataset_names = ('imdb', 'yelp_polarity', 'amazon_polarity')

    for dataset_name in dataset_names:
        try:
            print(f"\nAssessing {dataset_name} dataset:")
            assessor = SentimentDatasetQualityAssessment(dataset_name)
            report = assessor.generate_comprehensive_analysis()

            # One heading per report section, followed by its entries.
            for section in report:
                details = report[section]
                print(f"\n{section}:")
                if not isinstance(details, dict):
                    print(details)
                    continue
                for key in details:
                    print(f"{key}: {details[key]}")

            # Plot generation via assessor.visualize_analysis(report) is
            # currently disabled.
        except Exception as e:
            print(f"Error processing {dataset_name}: {e}")

# Run the assessment only when this file is executed directly as a script;
# importing it as a module defines the names without calling main().
if __name__ == "__main__":
    main()


Assessing imdb dataset:

Dataset Overview:
Total Train Samples: 25000
Total Test Samples: 25000
Columns: ['text', 'label']

Label Distribution:
Train: {0: 0.5, 1: 0.5}
Test: {0: 0.5, 1: 0.5}

Text Characteristics:
Train: {'Avg Text Length': np.float64(1325.06964), 'Median Text Length': np.float64(979.0), 'Max Text Length': np.int64(13704), 'Min Text Length': np.int64(52), 'Avg Word Count': np.float64(233.7872)}
Test: {'Avg Text Length': np.float64(1293.7924), 'Median Text Length': np.float64(962.0), 'Max Text Length': np.int64(12988), 'Min Text Length': np.int64(32), 'Avg Word Count': np.float64(228.52668)}

Duplicate Analysis:
Train Duplicates: 96
Test Duplicates: 199
Train Duplicate Percentage: 0.384
Test Duplicate Percentage: 0.796

Missing Values:
Train: {'text': 0, 'label': 0}
Test: {'text': 0, 'label': 0}

Advanced Text Analysis:
Train: {'Avg Special Character Ratio': np.float64(0.0403472361543248), 'Avg Unique Words': np.float64(149.46488), 'Max Unique Words': np.int64(822), 'M