# Data Science Project Framework
A comprehensive framework for data science projects that includes all essential steps from data loading to evaluation and visualization.

## Import Required Libraries

In [None]:
# Import essential libraries for data analysis
import pandas as pd
import numpy as np

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Import machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score, confusion_matrix, classification_report
)

# Common ML algorithms
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans, DBSCAN

# Configure visualization settings
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old aliases, so choose whichever name this install knows.
_whitegrid_style = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if style_name in plt.style.available
    ),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
# seaborn applies its own whitegrid theme regardless of the matplotlib style above
sns.set(style="whitegrid", palette="muted")

## Data Loading and Exploration

In [None]:
# Load your dataset
# Replace 'your_dataset.csv' with the actual file path
# df = pd.read_csv('your_dataset.csv')

# For demonstration purposes, let's use a sample dataset from sklearn
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
# Human-readable class label for every encoded target value
df['target_name'] = df['target'].map(dict(enumerate(iris.target_names)))

# Display the first few rows
print("First 5 rows of the dataset:")
display(df.head())

# Basic information about the dataset
print("\nBasic information about the dataset:")
df.info()

# Statistical summary
print("\nStatistical summary:")
display(df.describe())

# Check for missing values
print("\nMissing values:")
display(df.isnull().sum())

# Check the distribution of target variable (if applicable)
print("\nTarget variable distribution:")
# Guard on the column that is actually read below
if 'target_name' in df.columns:
    display(df['target_name'].value_counts())

# Correlation matrix
print("\nCorrelation matrix:")
# 'number' selects every numeric width (int32, float32, ...), not only the
# int64/float64 columns a fixed dtype list would keep
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

## Data Preprocessing

In [None]:
# Function to identify numerical and categorical columns
def identify_column_types(df):
    """Split the columns of ``df`` into numerical and categorical names.

    Parameters:
    df: pandas DataFrame - table whose columns should be classified

    Returns:
    (numerical_cols, categorical_cols): two lists of column names, each in
    the DataFrame's own column order. Numerical covers every numeric dtype
    (any integer or float width); categorical covers object, category and
    bool columns. Columns of other dtypes (e.g. datetimes) appear in neither.
    """
    # 'number' matches all numeric widths, so int32/float32/unsigned columns
    # are no longer dropped. timedelta64 sits inside NumPy's number hierarchy,
    # so it is excluded explicitly to keep durations out of the numeric list.
    numerical_cols = df.select_dtypes(
        include=['number'], exclude=['timedelta']
    ).columns.tolist()
    categorical_cols = df.select_dtypes(
        include=['object', 'category', 'bool']
    ).columns.tolist()
    return numerical_cols, categorical_cols

# Define the target variable
# For this example, we use 'target' from iris dataset
# The label columns are removed BEFORE column types are detected: the
# ColumnTransformer below selects columns by name, so listing 'target' or
# 'target_name' there would make fit_transform fail on X, which lacks them.
X = df.drop(['target', 'target_name'], axis=1)
y = df['target']

# Identify column types (features only)
numerical_cols, categorical_cols = identify_column_types(X)
print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

# Handle missing values if any
# For numerical columns: impute with mean
# For categorical columns: impute with most frequent value

# Create preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and testing sets
# Stratify only when the target looks like class labels (fewer than 10 distinct
# values, the same heuristic the model-creation cell uses); stratifying on a
# continuous regression target would make train_test_split raise.
stratify_labels = y if y.nunique() < 10 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=stratify_labels
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Apply preprocessing
# Fit on the training split only, then reuse the fitted transformer on the
# test split so no test-set statistics leak into imputation or scaling.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# If using OneHotEncoder, transform the sparse matrix to dense array if needed
# X_train_processed = X_train_processed.toarray()
# X_test_processed = X_test_processed.toarray()

## Model Creation

In [None]:
# Candidate estimators for the detected task (adapt to your own problem).
# The demo data is iris, so the classification branch is the one taken here.

# Heuristic: a target with fewer than 10 distinct values is treated as class
# labels; the threshold is arbitrary.
is_classification = np.unique(y).size < 10
if is_classification:
    problem_type = "Classification"
else:
    problem_type = "Regression"
print(f"Detected problem type: {problem_type}")

# Build the model set that matches the problem type; only the selected
# dictionary is constructed.
models = (
    {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'SVM': SVC(probability=True, random_state=42),
        'KNN': KNeighborsClassifier(),
    }
    if is_classification
    else {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'SVR': SVR(),
        'KNN': KNeighborsRegressor(),
    }
)

print(f"Models to be evaluated: {list(models)}")

## Model Training

In [None]:
# Fit, predict and cross-validate every candidate model
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test, cv=5):
    """
    Train each model, predict on the test split and cross-validate it

    Parameters:
    models: dict mapping a display name to an unfitted estimator
    X_train, y_train: training features and labels
    X_test: test features used for the stored predictions
    y_test: accepted for signature symmetry; not read by this function
    cv: value forwarded to cross_val_score (default 5)

    Returns:
    dict keyed by model name; each value holds 'model' (the fitted
    estimator), 'cv_scores', 'cv_mean', 'cv_std' and 'test_predictions'
    """
    summary = {}

    for label in models:
        estimator = models[label]
        print(f"Training {label}...")

        # Fit on the full training split and predict the held-out test split
        estimator.fit(X_train, y_train)
        held_out_predictions = estimator.predict(X_test)

        # Cross-validate on the training split only (no scoring argument is
        # passed, so cross_val_score falls back to its default)
        fold_scores = cross_val_score(estimator, X_train, y_train, cv=cv)
        mean_score = np.mean(fold_scores)
        score_spread = np.std(fold_scores)

        summary[label] = {
            'model': estimator,
            'cv_scores': fold_scores,
            'cv_mean': mean_score,
            'cv_std': score_spread,
            'test_predictions': held_out_predictions,
        }

        print(f"{label} - Cross-validation Mean: {mean_score:.4f}, Std: {score_spread:.4f}")

    return summary

# Fit and cross-validate every candidate on the preprocessed splits
cv_results = train_and_evaluate_models(
    models,
    X_train_processed,
    y_train,
    X_test_processed,
    y_test,
)

# Select the name whose entry has the highest mean CV score
# (max() keeps the first entry on ties)
best_model_name = max(
    cv_results.items(),
    key=lambda entry: entry[1]['cv_mean'],
)[0]
print(f"\nBest model based on cross-validation: {best_model_name} with score {cv_results[best_model_name]['cv_mean']:.4f}")

## Evaluation

In [None]:
# Function for model evaluation based on problem type
def evaluate_model(y_true, y_pred, problem_type="Classification"):
    """
    Compute evaluation metrics for one set of predictions

    Parameters:
    y_true: ground-truth target values
    y_pred: predicted target values, aligned with y_true
    problem_type: "Classification" selects the classification metrics;
        any other value is treated as regression

    Returns:
    dict of metrics. Classification: 'accuracy', 'precision', 'recall',
    'f1' (weighted averages), 'confusion_matrix' and 'classification_report'.
    Regression: 'mse', 'rmse' and 'r2'.
    """
    results = {}

    if problem_type == "Classification":
        # Classification metrics
        results['accuracy'] = accuracy_score(y_true, y_pred)

        # For multi-class, use 'weighted' average
        results['precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        results['recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        results['f1'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)

        # Confusion Matrix
        results['confusion_matrix'] = confusion_matrix(y_true, y_pred)

        # Classification report
        # zero_division=0 matches the metric calls above, so a class that is
        # never predicted does not trigger an undefined-metric warning here
        results['classification_report'] = classification_report(y_true, y_pred, zero_division=0)
    else:
        # Regression metrics
        results['mse'] = mean_squared_error(y_true, y_pred)
        # RMSE is derived from the MSE computed above
        results['rmse'] = np.sqrt(results['mse'])
        results['r2'] = r2_score(y_true, y_pred)

    return results

# Score every trained model on the held-out test predictions
for name, result in cv_results.items():
    print(f"\nEvaluating {name}:")
    eval_results = evaluate_model(y_test, result['test_predictions'], problem_type)

    if problem_type != "Classification":
        # Regression: report the three scalar error metrics
        for metric_label, metric_key in (("MSE", 'mse'), ("RMSE", 'rmse'), ("R²", 'r2')):
            print(f"{metric_label}: {eval_results[metric_key]:.4f}")
    else:
        # Classification: scalar metrics first, then the tabular outputs
        for metric_label, metric_key in (
            ("Accuracy", 'accuracy'),
            ("Precision", 'precision'),
            ("Recall", 'recall'),
            ("F1 Score", 'f1'),
        ):
            print(f"{metric_label}: {eval_results[metric_key]:.4f}")
        print("\nConfusion Matrix:")
        print(eval_results['confusion_matrix'])
        print("\nClassification Report:")
        print(eval_results['classification_report'])

    # Attach the metrics to this model's entry (result is cv_results[name])
    result['evaluation'] = eval_results

# Fitted estimator of the CV winner, reused by the cells below
best_model = cv_results[best_model_name]['model']

## Visualization

In [None]:
# 1. Bar chart comparing mean cross-validation scores (error bars = std)
model_names = list(cv_results)
cv_means = [cv_results[model_label]['cv_mean'] for model_label in model_names]
cv_stds = [cv_results[model_label]['cv_std'] for model_label in model_names]

plt.figure(figsize=(12, 6))
bars = plt.bar(
    model_names,
    cv_means,
    yerr=cv_stds,
    capsize=10,
)
plt.title('Model Comparison: Cross-validation Scores')
plt.ylabel('Mean CV Score')
# Rotate the tick labels so longer model names do not overlap
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Feature importance (if available)
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    # Feature positions sorted from most to least important
    indices = np.argsort(importances)[::-1]

    # The model was trained on the preprocessed matrix, whose columns can differ
    # from X.columns (one-hot encoding expands categorical columns), so take the
    # names from the fitted preprocessor and drop its 'num__' / 'cat__' prefix.
    try:
        feature_names = [
            str(processed_name).split('__', 1)[-1]
            for processed_name in preprocessor.get_feature_names_out()
        ]
    except (AttributeError, ValueError):
        # Older scikit-learn, or a transformer that cannot report its names
        feature_names = list(X.columns)
    if len(feature_names) != len(importances):
        # Last resort: positional labels rather than an IndexError or wrong labels
        feature_names = [f'feature_{position}' for position in range(len(importances))]

    plt.figure(figsize=(12, 6))
    plt.title(f'Feature Importances from {best_model_name}')
    plt.bar(range(len(importances)), importances[indices], align='center')
    plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.show()

# 3. Heatmap of the best model's confusion matrix (classification only)
if problem_type == "Classification":
    best_entry_metrics = cv_results[best_model_name]['evaluation']
    confusion_mat = best_entry_metrics['confusion_matrix']

    plt.figure(figsize=(10, 8))
    # Both axes are labelled with the iris class names
    sns.heatmap(
        confusion_mat,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=iris.target_names,
        yticklabels=iris.target_names,
    )
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(f'Confusion Matrix - {best_model_name}')
    plt.tight_layout()
    plt.show()

# 4. ROC curves, drawn only when the target has exactly two classes
if problem_type == "Classification" and np.unique(y).size == 2:
    from sklearn.metrics import roc_curve, auc

    plt.figure(figsize=(10, 8))

    for name, result in cv_results.items():
        estimator = result['model']
        # Models without probability estimates are skipped
        if not hasattr(estimator, 'predict_proba'):
            continue

        # Second predict_proba column is used as the positive-class score
        y_score = estimator.predict_proba(X_test_processed)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.4f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.show()

# 5. Actual-vs-predicted scatter for the best model (regression only)
if problem_type == "Regression":
    y_pred = best_model.predict(X_test_processed)
    # End points of the y = x reference line, spanning the observed targets
    actual_low, actual_high = y_test.min(), y_test.max()

    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([actual_low, actual_high], [actual_low, actual_high], 'k--', lw=2)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'Actual vs. Predicted Values - {best_model_name}')
    plt.tight_layout()
    plt.show()

# 6. Learning curve of the best model on the training split
from sklearn.model_selection import learning_curve

# Ten training-set fractions from 10% to 100%, 5-fold CV at each size
train_sizes, train_scores, test_scores = learning_curve(
    best_model,
    X_train_processed,
    y_train,
    cv=5,
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
)

# Aggregate across the CV folds (axis 1) for every training size
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

plt.figure(figsize=(10, 6))
# Each curve is a line plus a +/- one-std band in the same colour
curve_specs = (
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='Training score')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5, label='Validation score')),
)
for curve_mean, curve_std, line_kwargs in curve_specs:
    plt.plot(train_sizes, curve_mean, **line_kwargs)
    plt.fill_between(
        train_sizes,
        curve_mean + curve_std,
        curve_mean - curve_std,
        alpha=0.15,
        color=line_kwargs['color'],
    )
plt.title(f'Learning Curve - {best_model_name}')
plt.xlabel('Training Size')
plt.ylabel('Score')
plt.grid()
plt.legend(loc='lower right')
plt.show()

## Summary and Next Steps

### Summary of Findings:

- Best performing model: [Insert best model name with its performance metrics]
- Key features: [Insert top features if feature importance was available]
- Model evaluation: [Summarize key evaluation metrics]

### Potential Next Steps:

1. **Model Improvement**:
   - Hyperparameter tuning
   - Feature engineering
   - Try ensemble methods or more advanced algorithms

2. **Further Analysis**:
   - Investigate specific misclassifications or prediction errors
   - Perform deeper feature analysis
   - Consider additional data sources

3. **Deployment**:
   - Serialize the model for production
   - Create an API for model inference
   - Set up monitoring for model performance

In [None]:
# Save the best model for later use
import joblib

# Uncomment to save the model
# joblib.dump(best_model, 'best_model.pkl')
# print(f"Saved best model ({best_model_name}) to 'best_model.pkl'")

# Create a simple function to make new predictions
def make_prediction(data, model=None, preprocessor=None):
    """
    Make predictions on new data using the trained model

    Parameters:
    data: pandas DataFrame - New data to predict
    model: trained model object; defaults to the notebook's current
        ``best_model``
    preprocessor: fitted preprocessor; defaults to the notebook's current
        ``preprocessor``

    Returns:
    array of predictions
    """
    # Resolve the defaults at call time. Binding them in the signature would
    # freeze whichever objects existed when this cell ran, so re-running the
    # training cells would leave the helper using a stale model.
    if model is None:
        model = best_model
    if preprocessor is None:
        # The parameter shadows the notebook-level name, so read it from the
        # module globals explicitly
        preprocessor = globals()['preprocessor']

    # Preprocess the data
    processed_data = preprocessor.transform(data)

    # Make predictions
    predictions = model.predict(processed_data)

    return predictions

# Example usage:
# new_data = pd.DataFrame(...)
# predictions = make_prediction(new_data)