# Workspace Setup Guide

This notebook will guide you through setting up a complete data science development workspace.

**Prerequisites:**
- Python 3.8+ installed
- VS Code installed
- Git installed

Run each cell in order to complete the setup process.

## 1. Install Essential Python Packages

Let's start by installing the core Python packages for data science and development.

In [None]:
# Report the interpreter that backs this kernel
import sys

print(
    f"Python version: {sys.version}",
    f"Python executable: {sys.executable}",
    sep="\n",
)

In [None]:
# Install essential packages
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter running this kernel.

    Runs ``<sys.executable> -m pip install <package>`` through
    ``subprocess.check_call``, which raises ``subprocess.CalledProcessError``
    when the command exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

# Packages that make up the baseline data science toolkit
essential_packages = [
    "numpy",
    "pandas",
    "matplotlib",
    "seaborn",
    "scipy",
    "scikit-learn",
    "jupyter",
    "jupyterlab",
]

print("Installing essential packages...")
for package in essential_packages:
    # Best effort: a failed install is reported and the loop moves on
    try:
        install_package(package)
    except Exception as install_error:
        print(f"✗ Failed to install {package}: {install_error}")
    else:
        print(f"✓ {package} installed successfully")

## 2. Configure Development Environment

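Set environment variables that point to the workspace's key directories. They are set through `os.environ`, so they apply only to the current kernel session and must be set again after a kernel restart.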

In [None]:
import os
from pathlib import Path

# The kernel's current working directory is treated as the workspace root
workspace_path = Path.cwd()
print(f"Workspace path: {workspace_path}")

# Publish the project layout through environment variables (current process only)
os.environ.update({
    'PROJECT_ROOT': str(workspace_path),
    'DATA_PATH': str(workspace_path / 'data'),
    'NOTEBOOKS_PATH': str(workspace_path / 'notebooks'),
    'SRC_PATH': str(workspace_path / 'src'),
})

print("Environment variables set:")
for key in ('PROJECT_ROOT', 'DATA_PATH', 'NOTEBOOKS_PATH', 'SRC_PATH'):
    print(f"  {key}: {os.environ.get(key)}")

## 3. Verify Project Directory Structure

Check that the project has the expected directory structure for data science work. Any directory that is missing is created automatically.

In [None]:
import os
from pathlib import Path

# Folders the workspace is expected to contain, relative to its root
directories = [
    'notebooks',
    'data/raw',
    'data/processed',
    'data/external',
    'src',
]

workspace_root = Path.cwd()
print(f"Checking directory structure in: {workspace_root}")
print("\nDirectory structure:")

for directory in directories:
    dir_path = workspace_root / directory
    if not dir_path.exists():
        print(f"✗ {directory}/ (missing)")
        # Create the folder on the spot, together with any missing parents
        dir_path.mkdir(parents=True, exist_ok=True)
        print(f"  → Created {directory}/")
        continue
    print(f"✓ {directory}/")

# Show the top-level entries of the workspace, sorted by path
print("\nCurrent workspace contents:")
for item in sorted(workspace_root.iterdir()):
    print(f"📁 {item.name}/" if item.is_dir() else f"📄 {item.name}")

## 4. Initialize Version Control

Initialize a Git repository if one does not already exist, then check that the basic Git configuration (user name, user email, and a `.gitignore` file) is in place.

In [None]:
import subprocess
import os
from pathlib import Path

def run_git_command(command):
    """Run ``command`` through the shell in the current working directory.

    Returns a ``(success, text)`` tuple:
    ``(True, stripped stdout)`` when the process exits with status 0,
    ``(False, stripped stderr)`` for any other exit status, and
    ``(False, str(error))`` when running the command raises an exception.
    """
    try:
        completed = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            text=True,
            cwd=Path.cwd(),
        )
        succeeded = completed.returncode == 0
        # stdout carries the answer on success, stderr the reason on failure
        stream = completed.stdout if succeeded else completed.stderr
        return succeeded, stream.strip()
    except Exception as error:
        return False, str(error)

def _report_git_setting(setting, ok, value, example):
    """Print a one-line status for a single ``git config`` lookup.

    Args:
        setting: Name of the Git setting, e.g. ``user.name``.
        ok: Success flag returned by ``run_git_command``.
        value: Text returned by ``run_git_command`` (stdout on success,
            stderr or the exception text on failure).
        example: Placeholder value shown in the suggested fix command.
    """
    if ok and value:
        print(f"✓ Git {setting}: {value}")
    elif value:
        # The lookup failed and returned error text; that text must not be
        # presented as if it were the configured value.
        print(f"✗ Could not read Git {setting}: {value}")
    else:
        print(f"⚠ Git {setting} not set. Run: git config --global {setting} '{example}'")


# Check if git is already initialized
git_dir = Path.cwd() / '.git'
if git_dir.exists():
    print("✓ Git repository already initialized")
else:
    print("Initializing Git repository...")
    success, output = run_git_command('git init')
    if success:
        print("✓ Git repository initialized")
    else:
        print(f"✗ Failed to initialize Git: {output}")

# Check Git configuration
print("\nChecking Git configuration:")
success, user_name = run_git_command('git config user.name')
success2, user_email = run_git_command('git config user.email')

# Honor the success flags: on failure the second tuple element is error
# output, not a configured value.
_report_git_setting('user.name', success, user_name, 'Your Name')
_report_git_setting('user.email', success2, user_email, 'your.email@example.com')

# Check if .gitignore exists
gitignore_path = Path.cwd() / '.gitignore'
if gitignore_path.exists():
    print("✓ .gitignore file exists")
else:
    print("⚠ .gitignore file not found")

## 5. Configure IDE Settings

Verify VS Code is configured properly for Python development.

In [None]:
import json
from pathlib import Path

# Locate the workspace-level VS Code configuration
vscode_dir = Path.cwd() / '.vscode'
settings_file = vscode_dir / 'settings.json'

print("VS Code workspace configuration:")
if vscode_dir.exists():
    print("✓ .vscode directory exists")

    if settings_file.exists():
        print("✓ settings.json exists")
        try:
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # default text encoding, which differs between systems.
            with open(settings_file, 'r', encoding='utf-8') as f:
                settings = json.load(f)
            print("Current workspace settings:")
            for key, value in settings.items():
                print(f"  {key}: {value}")
        except Exception as e:
            # Best effort: an unreadable or unparsable file is reported, not fatal
            print(f"⚠ Error reading settings.json: {e}")
    else:
        print("⚠ settings.json not found")
else:
    print("⚠ .vscode directory not found")
    print("  This is normal - VS Code will create it when you open the workspace")

# Recommended VS Code extensions for data science
recommended_extensions = [
    "ms-python.python",
    "ms-python.vscode-python-envs",
    "ms-toolsai.jupyter",
    # Jupyter Slide Show is published under the ms-toolsai publisher ID
    "ms-toolsai.vscode-jupyter-slideshow",
    "donjayamanne.githistory",
    "eamodio.gitlens"
]

print("\nRecommended VS Code extensions:")
for ext in recommended_extensions:
    print(f"  • {ext}")

## 6. Create Virtual Environment

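Check whether this notebook is running inside a Python virtual environment and list the dependencies declared in `requirements.txt`. This cell only reports: it prints the commands for creating an environment and installing the requirements, but it does not run them.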

In [None]:
import subprocess
import sys
from pathlib import Path

# Report which interpreter and working directory this kernel is using
print(f"Current Python executable: {sys.executable}")
print(f"Current working directory: {Path.cwd()}")

# The kernel counts as venv-hosted when sys exposes real_prefix, or when
# sys.base_prefix exists and differs from sys.prefix
in_venv = hasattr(sys, 'real_prefix') or getattr(sys, 'base_prefix', sys.prefix) != sys.prefix
print(f"In virtual environment: {in_venv}")

if not in_venv:
    print("⚠ Not in a virtual environment")
    print("Consider creating and activating a virtual environment:")
    print("  python -m venv data_sandbox_env")
    print("  # On Windows: data_sandbox_env\\Scripts\\activate")
    print("  # On macOS/Linux: source data_sandbox_env/bin/activate")
else:
    print("✓ Already in a virtual environment")
    # The environment root is taken as two levels above the interpreter path
    venv_path = Path(sys.executable).parents[1]
    print(f"Virtual environment path: {venv_path}")

# Look for a requirements file next to the notebook's working directory
requirements_file = Path.cwd() / 'requirements.txt'
if not requirements_file.exists():
    print("\n⚠ requirements.txt not found")
else:
    print("\n✓ requirements.txt found")
    print("\nTo install requirements, run:")
    print("  pip install -r requirements.txt")

    # List every entry, skipping blank lines and comment lines
    print("\nCurrent requirements:")
    with open(requirements_file, 'r') as f:
        for line in f:
            requirement = line.strip()
            if not requirement or requirement.startswith('#'):
                continue
            print(f"  • {requirement}")

## 7. Install and Configure Jupyter Extensions

Verify that Jupyter and JupyterLab are installed, check which common data science libraries are available, and print a ready-to-use import block for new notebooks.

In [None]:
# Check Jupyter installation
import importlib
from importlib import metadata as importlib_metadata


def _installed_version(*distributions):
    """Return the version of the first installed distribution in ``distributions``.

    Versions come from installed package metadata. Returns ``"unknown"`` when
    none of the named distributions has metadata available.
    """
    for distribution in distributions:
        try:
            return importlib_metadata.version(distribution)
        except importlib_metadata.PackageNotFoundError:
            continue
    return "unknown"


try:
    import jupyter
    import jupyterlab
    print("✓ Jupyter and JupyterLab are installed")
    # Read versions from package metadata rather than `module.__version__`:
    # a module without that attribute raises AttributeError, which the
    # `except ImportError` below would not catch.
    print(f"Jupyter version: {_installed_version('jupyter', 'jupyter_core')}")
    print(f"JupyterLab version: {_installed_version('jupyterlab')}")
except ImportError as e:
    print(f"⚠ Jupyter not fully installed: {e}")
    print("Install with: pip install jupyter jupyterlab")

# Check for useful libraries (library name -> conventional alias)
libraries_to_check = {
    'numpy': 'np',
    'pandas': 'pd',
    'matplotlib': 'plt',
    'seaborn': 'sns',
    'plotly': 'plotly'
}

# The conventional `plt` alias refers to the pyplot submodule, not to the
# top-level matplotlib package, so the suggested import must target pyplot.
import_targets = {'matplotlib': 'matplotlib.pyplot'}

print("\nChecking data science libraries:")
available_imports = []
available_libraries = []

for lib, alias in libraries_to_check.items():
    try:
        # Bind the module under its own name, exactly as `import <lib>` would
        globals()[lib] = importlib.import_module(lib)
    except ImportError:
        print(f"✗ {lib} not available")
        continue
    print(f"✓ {lib} available")
    available_libraries.append(lib)
    target = import_targets.get(lib, lib)
    available_imports.append(f"import {target} as {alias}" if alias != target else f"import {target}")

# Generate common imports for notebooks
if available_imports:
    print("\nCommon imports for your notebooks:")
    print("```python")
    for imp in available_imports:
        print(imp)
    if 'matplotlib' in available_libraries:
        print("%matplotlib inline")
    print("```")

## Setup Complete! 🎉

Your data science workspace is now set up and ready to use.

### Next Steps:

1. **Start exploring**: Create new notebooks in the `notebooks/` folder
2. **Add data**: Place your datasets in the `data/raw/` folder
3. **Write reusable code**: Add Python modules to the `src/` folder
4. **Version control**: Commit your initial setup with Git

### Quick Commands:

- Start JupyterLab: `jupyter lab`
- Install packages: `pip install package_name`
- Update requirements: `pip freeze > requirements.txt`

Happy coding! 🚀