# Python Data Analysis Toolkit - Basic Usage Example

This notebook demonstrates basic usage of the datoolkit package for data analysis, visualization, and machine learning tasks.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Import functions from datoolkit
from datoolkit.stats import descriptive_stats, correlation_analysis
from datoolkit.visualization import plot_histogram, plot_correlation_matrix, plot_scatter
from datoolkit.preprocessing import normalization, handle_missing_values
from datoolkit.ml import train_test_split_stratified, cross_validation_metrics, feature_importance_analysis

## Load and Prepare Sample Data

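For this example, we'll use the diabetes dataset from scikit-learn: ten baseline features for 442 patients, with a quantitative measure of disease progression one year after baseline as the regression target.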

In [None]:
# Fetch the bundled diabetes dataset and keep its column labels handy
diabetes = load_diabetes()
feature_names = diabetes.feature_names

# Wrap the raw arrays in labelled pandas objects: target as a Series, features as a DataFrame
y = pd.Series(diabetes.target, name='target')
X = pd.DataFrame(data=diabetes.data, columns=feature_names)

# Preview the feature table
X.head()

## 1. Statistical Analysis

First, let's calculate descriptive statistics for the target variable.

In [None]:
# Summarise the target with datoolkit's descriptive-statistics helper
target_stats = descriptive_stats(y)

print("Target Variable Statistics:")
for stat_name, stat_value in target_stats.items():
    # Floats are shown with two decimals; every other value is printed unchanged
    if isinstance(stat_value, float):
        print(f"{stat_name}: {stat_value:.2f}")
    else:
        print(f"{stat_name}: {stat_value}")

Now, let's analyze correlations between the features.

In [None]:
# Pearson correlations between the features; the helper's result unpacks
# into the correlation matrix and the matching p-values
correlation_result = correlation_analysis(X, method='pearson')
corr_matrix, p_values = correlation_result

# Show the matrix as the cell's rich output
print("Correlation Matrix:")
corr_matrix

## 2. Data Visualization

Let's create some visualizations to better understand our data.

In [None]:
# Histogram of the target values, 20 bins
fig, ax = plot_histogram(
    y,
    bins=20,
    title='Distribution of Diabetes Progression',
    xlabel='Disease Progression',
)
plt.show()

In [None]:
# Plot the feature correlation matrix, passing the p-values and a 0.05 threshold to the helper
fig, ax = plot_correlation_matrix(
    corr_matrix,
    p_values=p_values,
    p_threshold=0.05,
    title='Feature Correlations',
)
plt.show()

In [None]:
# Pick the feature whose correlation with the target is largest in absolute value
correlations_with_target = X.corrwith(y)
ranked_by_strength = correlations_with_target.abs().sort_values(ascending=False)
most_correlated = ranked_by_strength.index[0]

# Scatter that feature against the target
fig, ax = plot_scatter(
    X[most_correlated],
    y,
    title=f'Relationship between {most_correlated} and Disease Progression',
    xlabel=most_correlated,
    ylabel='Disease Progression',
)
plt.show()

## 3. Data Preprocessing

Now, let's normalize our features and demonstrate handling missing values.

In [None]:
# Z-score normalization of the features; the helper's result unpacks into
# the transformed data and the scaler it used
normalization_result = normalization(X, method='zscore')
X_normalized, scaler = normalization_result

# Preview the normalized feature table
X_normalized.head()

In [None]:
# Artificially introduce some missing values for demonstration.
# A cell-local Generator keeps the cell reproducible without reseeding
# NumPy's global random state as a side effect.
MISSING_FRACTION = 0.1  # approximate share of rows blanked out per affected column
rng = np.random.default_rng(42)

X_with_missing = X.copy()  # work on a copy so the original features stay intact
for col in X_with_missing.columns[:3]:  # only the first 3 columns receive gaps
    mask = rng.random(size=len(X_with_missing)) < MISSING_FRACTION
    X_with_missing.loc[mask, col] = np.nan

# Display the count of missing values per column
print("Missing values count:")
print(X_with_missing.isna().sum())

# Impute the gaps using datoolkit's 'mean' strategy
X_cleaned, imputation_info = handle_missing_values(X_with_missing, strategy='mean')

# Report what was filled in: imputation_info['imputation_values'] is read as a
# mapping of column name -> record with 'strategy' and 'value' entries
print("\nImputation information:")
for col, info in imputation_info['imputation_values'].items():
    print(f"{col}: {info['strategy']} = {info['value']:.4f}")

## 4. Machine Learning Utilities

Finally, let's demonstrate the machine learning utilities.

In [None]:
# Hold out 20% of the rows for testing; stratification is switched off
# because the target is continuous.
# NOTE(review): X_normalized was scaled using all rows before this split, so
# the held-out rows presumably influenced the scaling statistics - confirm
# whether that matters for your use case.
split_parts = train_test_split_stratified(
    X_normalized,
    y,
    test_size=0.2,
    random_state=42,
    stratify=False,  # Regression task
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

In [None]:
# Train a Random Forest model on the training split
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the fitted model on the held-out split, which was otherwise never used.
# Done right after fitting so the result does not depend on anything the
# cross-validation helper might do with the estimator.
test_r2 = model.score(X_test, y_test)  # R² for scikit-learn regressors
print(f"Held-out test R²: {test_r2:.4f}")

# Evaluate the model using 5-fold cross-validation over the full normalized dataset
cv_results = cross_validation_metrics(model, X_normalized, y, cv=5, task='regression')

# Display the cross-validation results for the regression metrics of interest;
# each reported entry is read as a record with test/train mean and std
REPORTED_METRICS = ('mean_squared_error', 'mean_absolute_error', 'r2')
print("Cross-validation results:")
for metric, values in cv_results.items():
    if metric not in REPORTED_METRICS:
        continue
    print(f"{metric}:")
    print(f"  Test mean: {values['test_mean']:.4f} ± {values['test_std']:.4f}")
    print(f"  Train mean: {values['train_mean']:.4f} ± {values['train_std']:.4f}")

In [None]:
# Ask datoolkit for the per-feature importances of the fitted model,
# labelled with the original feature column names
feature_labels = X.columns
importance_df = feature_importance_analysis(model, feature_labels)

# Show the resulting table as the cell's rich output
print("Feature importances:")
importance_df

In [None]:
# Horizontal bar chart of the feature importances, drawn on an explicit figure/axes pair
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(importance_df['feature'], importance_df['importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
importance_ax.set_title('Feature Importances')
importance_ax.invert_yaxis()  # first row of importance_df ends up at the top
importance_fig.tight_layout()
plt.show()

## Conclusion

This notebook demonstrated the basic functionality of the Python Data Analysis Toolkit. The package provides convenient utilities for:

1. Statistical analysis
2. Data visualization
3. Data preprocessing
4. Machine learning tasks

These tools can streamline your data science workflow and make common tasks more efficient.