In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select seaborn's "whitegrid" style before any plots are drawn
sns.set_style("whitegrid")

def compute_text_length(text):
    """Return the character count of every entry in the Series *text*.

    Non-string values are measured on their ``str()`` form. Missing values
    (NaN/None) count as length 0: casting them with ``astype(str)`` alone
    would turn them into the literal string "nan" and report a misleading
    length of 3.
    """
    lengths = text.astype(str).str.len()
    # Keep computed lengths only where the original entry was present.
    return lengths.where(text.notna(), 0).astype(int)


try:
    # Load the dataset from the data folder. Only the load is guarded, so the
    # handler below cannot mask a FileNotFoundError raised by a later step.
    df = pd.read_csv("data/labeled_data.csv")
except FileNotFoundError:
    print("Error: 'data/labeled_data.csv' not found.")
    print("Please make sure your dataset is correctly named and placed in the 'data' directory.")
else:
    print("--- 1. Dataset Overview ---")
    print(f"Shape of the dataset: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\n--- 2. Checking for Missing Values ---")
    # Per-column count of null entries
    print(df.isnull().sum())

    print("\n--- 3. Class Distribution ---")
    # Using normalize=True gives fractions; scale by 100 for percentages
    class_distribution = df['label'].value_counts(normalize=True) * 100
    print(class_distribution)

    # Visualize the class distribution (absolute counts per label)
    # NOTE(review): newer seaborn releases (0.13+) reportedly warn when
    # `palette` is passed without `hue` -- confirm against the installed version.
    plt.figure(figsize=(7, 5))
    sns.countplot(x='label', data=df, palette="viridis")
    plt.title('Class Distribution')
    plt.ylabel('Number of Samples')
    plt.xlabel('Label')
    plt.show()

    print("\n--- 4. Analyzing Text Length ---")
    # Character count per row; missing text is counted as 0 rather than as
    # the 3-character string "nan"
    df['text_length'] = compute_text_length(df['text'])
    print(df['text_length'].describe())

    # Visualize text length distribution by label (stacked histogram)
    plt.figure(figsize=(12, 6))
    sns.histplot(data=df, x='text_length', hue='label', multiple='stack', bins=100, palette="magma")
    plt.title('Text Length Distribution by Label')
    plt.xlabel('Text Length (Number of Characters)')
    plt.ylabel('Frequency')
    plt.show()