# Data Overview - Porto Seguro Safe Driver Prediction

This notebook provides a comprehensive overview of the dataset including:
- Basic information about the data
- Statistical summaries
- Missing value analysis
- Data type information
- Visual exploratory data analysis

## 1. Import Necessary Libraries

In [None]:
# Import required libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Configure visualization settings.
# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices raised by pandas/matplotlib/seaborn.
warnings.filterwarnings('ignore')
# The 'seaborn-v0_8-*' style names are the ones shipped with matplotlib 3.6+
# (older releases used 'seaborn-darkgrid') — TODO confirm the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic: render figures inline, below the cell that creates them.
%matplotlib inline

# Set display options for better readability:
# show every column of a frame, and at most 100 rows before truncating.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")

## 2. Read the Dataset

In [None]:
# Load the training data into `df`, the frame every later cell reads.
# The data contains information about drivers for insurance prediction.
# The path is relative to the notebook's working directory; it is kept in a
# named constant so it can be changed in one place.
TRAIN_CSV_PATH = 'data/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

# Report the size immediately so a wrong or truncated file is noticed early.
n_rows, n_cols = df.shape
print("Dataset loaded successfully!")
print(f"Dataset shape: {n_rows} rows and {n_cols} columns")

## 3. Display First Five Rows

In [None]:
# Preview the top of the frame to see the column names and typical values.
# The call is the cell's last expression, so the notebook renders it as a table.
print("First 5 rows of the dataset:")
df.head(n=5)

## 4. Basic Statistics of the Dataset

In [None]:
# Generate descriptive statistics for numerical columns:
# count, mean, std, min, quartiles, and max.
# describe() is the cell's last expression, so the notebook renders the
# resulting table as rich output rather than plain text.
print("Basic statistical summary of the dataset:")
df.describe()

In [None]:
# Structural summary of every column (numeric or not): dtype, non-null
# count (as shown by pandas' default settings) and overall memory usage.
# info() prints its report directly instead of returning a table.
print("\nDetailed information about the dataset:")
df.info()

## 5. Check for Missing Values

In [None]:
# Calculate the number and percentage of missing values for each column.
#
# In the Porto Seguro data a missing observation is stored as the sentinel
# value -1 rather than as NaN, so df.isnull() on its own reports a complete
# dataset. Both encodings are counted here.
MISSING_SENTINEL = -1
missing_mask = df.isnull() | df.eq(MISSING_SENTINEL)

missing_values = missing_mask.sum()
missing_percentage = (missing_values / len(df)) * 100

# Create a dataframe to display missing value information
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Missing_Percentage': missing_percentage.values
})

# Keep only the affected columns, worst first
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if not missing_df.empty:
    print("Columns with missing values:")
    print(missing_df.to_string(index=False))
else:
    print("No missing values found in the dataset!")

In [None]:
# Bar chart of the missing-value share per affected column.
# Nothing is drawn when the summary table has no rows.
if missing_df.empty:
    print("No visualization needed - dataset is complete!")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(missing_df['Column'], missing_df['Missing_Percentage'])
    ax.set_xlabel('Columns', fontsize=12)
    ax.set_ylabel('Missing Percentage (%)', fontsize=12)
    ax.set_title('Percentage of Missing Values by Column', fontsize=14, fontweight='bold')
    # Column names are long; rotate them so they do not overlap
    plt.setp(ax.get_xticklabels(), rotation=90)
    fig.tight_layout()
    plt.show()

## 6. Check Data Types

In [None]:
# Tabulate each column's dtype as a two-column frame (column name, dtype)
# and print it without the positional index.
print("Data types of columns:")
dtypes_df = df.dtypes.rename_axis('Column').reset_index(name='Data_Type')
print(dtypes_df.to_string(index=False))

In [None]:
# Count how many columns share each dtype
print("\nSummary of data types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

## 7. Data Visualizations

### 7.1 Target Variable Distribution

In [None]:
# Analyze the distribution of the target variable.
# This shows the balance between classes (claim vs no claim).
#
# The counts are ordered by class value (0, 1) rather than by frequency, so
# bar positions, value annotations and pie labels always refer to the same
# class regardless of which class is the majority.
if 'target' in df.columns:
    class_labels = ['No Claim (0)', 'Claim (1)']
    class_colors = ['skyblue', 'salmon']

    # reindex() fixes the order and yields a 0 count for an absent class
    target_counts = df['target'].value_counts().reindex([0, 1], fill_value=0)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ax_bar, ax_pie = axes

    # Count plot
    bars = ax_bar.bar(target_counts.index, target_counts.values, color=class_colors)
    ax_bar.set_xlabel('Target', fontsize=12)
    ax_bar.set_ylabel('Count', fontsize=12)
    ax_bar.set_title('Distribution of Target Variable', fontsize=14, fontweight='bold')
    ax_bar.set_xticks([0, 1])
    ax_bar.set_xticklabels(class_labels)

    # Annotate each bar with its count. Explicit labels keep large counts
    # from being shown in scientific notation; the padding is in points, so
    # it does not depend on the scale of the data.
    ax_bar.bar_label(bars, labels=[str(v) for v in target_counts.values],
                     padding=3, fontsize=10)

    # Pie chart
    ax_pie.pie(target_counts.values, labels=class_labels,
               autopct='%1.1f%%', colors=class_colors, startangle=90)
    ax_pie.set_title('Target Variable Proportion', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    print("\nTarget distribution:")
    print(df['target'].value_counts())
    print("\nTarget proportion:")
    print(df['target'].value_counts(normalize=True))

### 7.2 Distribution of Numerical Features

In [None]:
# Select numerical columns for visualization (excluding id and target).
# NOTE: the dtype filter also picks up integer-coded binary/categorical
# columns, since they are stored as numbers.
numerical_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in ('id', 'target')
]

# Display histograms for the first 12 numerical features.
# This helps understand the distribution and range of values.
num_features_to_plot = min(12, len(numerical_cols))

if num_features_to_plot > 0:
    fig, axes = plt.subplots(4, 3, figsize=(15, 12))
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_cols[:num_features_to_plot]):
        ax.hist(df[col].dropna(), bins=30, color='steelblue', edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontsize=10, fontweight='bold')
        ax.set_xlabel(col, fontsize=9)
        ax.set_ylabel('Frequency', fontsize=9)
        ax.grid(True, alpha=0.3)

    # The grid always has 12 panels; hide the ones that received no feature
    for unused_ax in axes[num_features_to_plot:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns to visualize.")

### 7.3 Box Plots for Outlier Detection

In [None]:
# Create box plots to identify outliers in numerical features.
# Box plots show the median, quartiles, and outliers.
# Uses `numerical_cols` and `num_features_to_plot` from the histogram cell.
if num_features_to_plot > 0:
    fig, axes = plt.subplots(4, 3, figsize=(15, 12))
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_cols[:num_features_to_plot]):
        # Vertical boxes are matplotlib's default, so the explicit
        # `vert=True` (superseded by `orientation` in recent releases)
        # is not passed.
        ax.boxplot(df[col].dropna())
        ax.set_title(f'Box Plot of {col}', fontsize=10, fontweight='bold')
        ax.set_ylabel(col, fontsize=9)
        ax.grid(True, alpha=0.3)

    # The grid always has 12 panels; hide the ones that received no feature
    for unused_ax in axes[num_features_to_plot:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns to visualize.")

### 7.4 Categorical Features Analysis

In [None]:
# Identify binary and categorical features based on column names.
# The dataset marks them with the postfixes `_bin` and `_cat`; matching the
# suffix (rather than any substring) avoids picking up unrelated columns
# whose names merely contain "bin" or "cat".
binary_cols = [col for col in df.columns if col.endswith('_bin')]
categorical_cols = [col for col in df.columns if col.endswith('_cat')]

print(f"Number of binary features: {len(binary_cols)}")
print(f"Number of categorical features: {len(categorical_cols)}")

# Visualize distribution of first 6 binary features
if len(binary_cols) > 0:
    num_binary_to_plot = min(6, len(binary_cols))
    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.ravel()

    for ax, col in zip(axes, binary_cols[:num_binary_to_plot]):
        value_counts = df[col].value_counts().sort_index()
        ax.bar(value_counts.index, value_counts.values, color='teal', alpha=0.7)
        # Tick only the values that actually occur instead of fractional positions
        ax.set_xticks(value_counts.index)
        ax.set_title(f'Distribution of {col}', fontsize=10, fontweight='bold')
        ax.set_xlabel(col, fontsize=9)
        ax.set_ylabel('Count', fontsize=9)
        ax.grid(True, alpha=0.3)

    # The grid always has 6 panels; hide the ones that received no feature
    for unused_ax in axes[num_binary_to_plot:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Visualize distribution of first 6 categorical features.
# Uses `categorical_cols` from the previous cell; categories are shown in
# ascending order of their code.
if len(categorical_cols) > 0:
    num_cat_to_plot = min(6, len(categorical_cols))
    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols[:num_cat_to_plot]):
        value_counts = df[col].value_counts().sort_index()
        ax.bar(value_counts.index, value_counts.values, color='coral', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontsize=10, fontweight='bold')
        ax.set_xlabel(col, fontsize=9)
        ax.set_ylabel('Count', fontsize=9)
        ax.grid(True, alpha=0.3)

    # The grid always has 6 panels; hide the ones that received no feature
    for unused_ax in axes[num_cat_to_plot:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

### 7.5 Correlation Analysis

In [None]:
# Calculate correlation matrix for a subset of features.
# Due to the large number of features, only the first 15 numerical columns
# (plus the target, when present) are analyzed. Slicing already copes with
# lists shorter than 15, so no length check is needed.
subset_cols = numerical_cols[:15]
if 'target' in df.columns:
    subset_cols = ['target'] + subset_cols

if len(subset_cols) > 1:
    correlation_matrix = df[subset_cols].corr()

    # Create a heatmap to visualize correlations
    plt.figure(figsize=(14, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title('Correlation Matrix (Subset of Features)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Display features most correlated with target.
    # The target's correlation with itself (always 1.0) is dropped, and the
    # ranking uses the absolute value so that strong negative relationships
    # are listed as well; the printed values keep their sign.
    if 'target' in subset_cols:
        target_corr = correlation_matrix['target'].drop('target')
        strongest_first = target_corr.abs().sort_values(ascending=False).index
        target_corr = target_corr.reindex(strongest_first)
        print("\nFeatures most correlated with target:")
        print(target_corr.head(10))

## Summary

This notebook has provided a comprehensive overview of the Porto Seguro Safe Driver dataset including:
- Dataset structure and size
- Statistical summaries of all features
- Missing value analysis
- Data type information
- Visual analysis through histograms, box plots, and count plots
- Correlation analysis

The dataset comes from the Kaggle competition "Porto Seguro's Safe Driver Prediction", whose goal is to predict whether a driver will file an insurance claim. Next steps could include:
- Feature engineering
- Handling class imbalance
- Building predictive models
- Model evaluation and optimization