# Data Exploration Notebook

This notebook is designed for basic data exploration tasks. It includes common operations such as loading data, examining its structure, and visualizing key characteristics.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style: apply the 'seaborn-v0_8' matplotlib style sheet, then make
# "husl" the seaborn colour palette for the plots drawn later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configure notebook settings
# The IPython magic selects the inline backend so figures appear in the cell output.
# NOTE(review): the default figure size is assigned *after* the magic — presumably so
# that activating the backend cannot override it; confirm before reordering these lines.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)  # default (width, height) for new figures

# Confirmation message so the reader can see that the setup cell ran to completion.
print("Libraries imported successfully!")

## Load Data

Load your dataset here. You can load from a file, database, API, or create a sample dataset for exploration.

In [None]:
# Load your dataset
# Example: df = pd.read_csv('path/to/your/data.csv', encoding='utf-8')

# Demonstration data: 1000 synthetic rows with three continuous features, a binary
# target (about 30% positives) and a three-level category column.
# The global seed is fixed first and the columns are drawn in a fixed order, so the
# generated values are the same on every run.
np.random.seed(42)
sample_data = dict(
    feature_1=np.random.normal(loc=0, scale=1, size=1000),
    feature_2=np.random.uniform(low=0, high=10, size=1000),
    feature_3=np.random.exponential(scale=2, size=1000),
    target=np.random.choice([0, 1], size=1000, p=[0.7, 0.3]),
    category=np.random.choice(['A', 'B', 'C'], size=1000, p=[0.4, 0.4, 0.2]),
)
df = pd.DataFrame(data=sample_data)
print(f"Dataset loaded successfully with shape: {df.shape}")

## Basic Dataset Information

Let's get an overview of the dataset structure and content.

In [None]:
# Basic information about the dataset
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called directly: wrapping it in print() would append a stray "None" line.
print("--- Dataset Info ---")
df.info()

In [None]:
print("--- First 5 rows ---")
# Keep the frame as the cell's last expression so Jupyter renders it as a rich
# table rather than the plain-text dump that print() produces.
df.head()

In [None]:
print("--- Last 5 rows ---")
# Keep the frame as the cell's last expression so Jupyter renders it as a rich
# table rather than the plain-text dump that print() produces.
df.tail()

In [None]:
# Report the dataset dimensions; df.shape is indexed as (rows, columns).
print(
    "--- Dataset shape ---",
    f"Rows: {df.shape[0]}, Columns: {df.shape[1]}",
    sep="\n",
)

In [None]:
# Show every column label as a plain Python list, under a section banner.
print(
    "--- Column names ---",
    df.columns.tolist(),
    sep="\n",
)

In [None]:
# Show the dtype of each column, under a section banner.
print(
    "--- Data types ---",
    df.dtypes,
    sep="\n",
)

## Summary Statistics

Let's examine the statistical properties of the columns: first the numerical columns only, then all columns including the categorical ones.

In [None]:
print("--- Descriptive Statistics ---")
# describe() with no arguments summarises the numeric columns. Leaving the result as
# the cell's last expression lets Jupyter render it as a rich table instead of the
# plain-text dump that print() produces.
df.describe()

In [None]:
print("--- Descriptive Statistics (including categorical) ---")
# include='all' adds the non-numeric columns to the summary. Leaving the result as
# the cell's last expression lets Jupyter render it as a rich table instead of the
# plain-text dump that print() produces.
df.describe(include='all')

## Missing Values Analysis

Check for any missing values in the dataset.

In [None]:
# Check for missing values
# Per-column count of null cells, and the same count as a percentage of all rows.
missing_values = df.isnull().sum()
missing_percent = 100 * missing_values / len(df)

# One row per column of `df`, so counts and percentages can be read side by side.
missing_data = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percent
})
print("--- Missing Values Analysis ---")
# Keep the summary frame as the cell's last expression so Jupyter renders it as a
# rich table rather than the plain-text dump that print() produces.
missing_data

## Unique Values Analysis

Check the number of unique values in each column.

In [None]:
# Cardinality report: one line per column with its number of distinct values.
print("--- Unique Values Count ---")
for col in df:  # iterating a DataFrame yields its column labels
    print(f"{col}: {df[col].nunique()} unique values")

## Distribution Plots

Visualize the distributions of numerical features.

In [None]:
# Histogram of every numeric column, stacked vertically in a single figure.
# `numerical_columns` is also read by later cells of the notebook.
numerical_columns = df.select_dtypes(include=[np.number]).columns

if len(numerical_columns) == 0:
    print("No numerical columns found to plot.")
else:
    fig, axes = plt.subplots(
        nrows=len(numerical_columns),
        ncols=1,
        figsize=(10, 4*len(numerical_columns)),
    )

    # With a single subplot plt.subplots returns a bare Axes rather than an array;
    # wrap it so the loop below can always iterate.
    axes = [axes] if len(numerical_columns) == 1 else axes

    for ax, col in zip(axes, numerical_columns):
        ax.hist(df[col], bins=30, edgecolor='black')
        ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Frequency')

    plt.tight_layout()
    plt.show()

## Correlation Analysis

Examine the correlation between numerical features.

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# A correlation matrix needs at least two columns, so bail out early otherwise.
if len(numerical_columns) <= 1:
    print("Not enough numerical columns for correlation analysis.")
else:
    correlation_matrix = df[numerical_columns].corr()

    plt.figure(figsize=(10, 8))
    heatmap_ax = sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,  # centre the colour map on zero correlation
        square=True,
        linewidths=0.5,
        cbar_kws={'shrink': 0.8},
    )
    heatmap_ax.set_title('Correlation Matrix of Numerical Features')
    plt.show()

## Categorical Data Analysis

Analyze categorical variables if they exist in the dataset.

In [None]:
# Frequency table and bar chart for every object/category column.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

if len(categorical_columns) == 0:
    print("No categorical columns found in the dataset.")
else:
    for col in categorical_columns:
        # Text summary first ...
        print(f"\n--- Value counts for {col} ---")
        print(df[col].value_counts())

        # ... then the same counts as a bar chart, one figure per column.
        plt.figure(figsize=(8, 5))
        count_ax = sns.countplot(data=df, x=col)
        count_ax.set_title(f'Distribution of {col}')
        plt.xticks(rotation=45)
        plt.show()

## Outlier Detection

Identify potential outliers using box plots.

In [None]:
# One box plot per numeric column, stacked vertically, to eyeball outliers.
if len(numerical_columns) == 0:
    print("No numerical columns for outlier detection.")
else:
    n_numeric = len(numerical_columns)
    df[numerical_columns].plot.box(
        subplots=True,
        layout=(n_numeric, 1),
        figsize=(10, 4*n_numeric),
    )
    plt.suptitle('Box Plots for Outlier Detection')
    plt.tight_layout()
    plt.show()

## Target Variable Analysis (if applicable)

Analyze the target variable if present in the dataset.

In [None]:
# Look for a likely target column. Keywords are tried in priority order; for each
# keyword an exact (case-insensitive) name match is preferred over a substring match.
potential_targets = ['target', 'label', 'class', 'y']
target_col = None

# str() keeps non-string labels (e.g. integer column names) from raising on .lower().
normalized_names = [(col, str(col).lower()) for col in df.columns]

for col_name in potential_targets:
    exact_matches = [col for col, name in normalized_names if name == col_name]
    # Substring matching is only meaningful for multi-character keywords: a bare 'y'
    # would otherwise "match" unrelated columns such as 'category' or 'city'.
    partial_matches = (
        [col for col, name in normalized_names if col_name in name]
        if len(col_name) > 1 else []
    )
    matches = exact_matches or partial_matches
    if matches:
        target_col = matches[0]
        break

# Compare against None rather than truthiness so a falsy label (e.g. 0) still counts.
if target_col is not None:
    print(f"Potential target variable detected: {target_col}")
    print(f"Value counts for {target_col}:")
    print(df[target_col].value_counts())

    # Plot distribution of target variable (one bar per distinct value)
    plt.figure(figsize=(8, 5))
    sns.countplot(data=df, x=target_col)
    plt.title(f'Distribution of {target_col}')
    plt.show()

    # If target is numerical, show distribution.
    # The dtype *family* is checked instead of comparing against 'int64'/'float64'
    # only, so int32, float32, etc. also get a histogram. Booleans stay excluded,
    # as they were under the previous check.
    target_values = df[target_col]
    is_numeric_target = (
        pd.api.types.is_numeric_dtype(target_values)
        and not pd.api.types.is_bool_dtype(target_values)
    )
    if is_numeric_target:
        plt.figure(figsize=(8, 5))
        plt.hist(target_values, bins=30, edgecolor='black')
        plt.title(f'Distribution of {target_col}')
        plt.xlabel(target_col)
        plt.ylabel('Frequency')
        plt.show()
else:
    print("No clear target variable detected (columns named 'target', 'label', 'class', or 'y' not found).")

## Feature Relationships

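Explore pairwise relationships between the numerical features with a pair plot. Datasets with more than 1,000 rows are randomly sampled down to 1,000 rows to keep plotting fast.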

In [None]:
# Scatter-matrix of the numeric columns. A pair plot needs at least two of them.
if len(numerical_columns) < 2:
    print("Need at least 2 numerical columns for pairplot.")
else:
    # Plot the full frame by default; datasets over 1000 rows are down-sampled
    # with a fixed random_state so the figure is reproducible.
    sample_df = df
    if len(df) > 1000:
        sample_df = df.sample(n=1000, random_state=42)
        print("Using a sample of 1000 rows for pairplot (dataset is large)")

    sns.pairplot(sample_df[numerical_columns])
    plt.show()

In [None]:
# Optional: Save the processed data for later use
# df.to_csv('processed_data.csv', index=False)
# print("Processed data saved to 'processed_data.csv'")