# 🚀 Machinelearning Workspace - DP-100 Practice

**Workspace:** Machinelearning  
**Resource Group:** AI-102  
**Location:** Canada East  
**Compute:** Standard_DS11_v2 (Running)

This notebook provides hands-on practice for Azure DP-100 certification using the Machinelearning workspace.

## 📚 Module 1: Environment Setup & Connection

In [None]:
# Import required libraries
# Azure ML client, credential, and asset entity classes used by the
# connection and registration cells
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data, Environment, Model
from azure.ai.ml.constants import AssetTypes
# Data handling
import pandas as pd
import numpy as np
# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Model serialization
import joblib

# Reaching this line means every import above succeeded
print("‚úÖ Libraries imported successfully")

In [None]:
# Connect to Machinelearning workspace
# The workspace coordinates are named once so the client construction and
# the verification lookup below cannot drift apart.
SUBSCRIPTION_ID = "29f1cd2f-d0e2-413e-b913-1976b6924fa6"
RESOURCE_GROUP = "AI-102"
WORKSPACE_NAME = "Machinelearning"

credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

print("‚úÖ Connected to Machinelearning workspace")

# Verify connection: fetch the workspace record and echo its details
workspace = ml_client.workspaces.get(WORKSPACE_NAME)
print(f"üìç Workspace: {workspace.display_name}")
print(f"üìç Location: {workspace.location}")
print(f"üìç Resource Group: {workspace.resource_group}")

In [None]:
# Check available compute resources
# Materialize the listing so it can be counted before it is iterated
computes = list(ml_client.compute.list())
print(f"üñ•Ô∏è Available compute resources: {len(computes)}")

for compute in computes:
    # Collect the lines for this compute target, then emit them together;
    # size and state are reported only when the object exposes them.
    report_lines = [f"  ‚Ä¢ {compute.name} ({compute.type})"]
    if hasattr(compute, 'size'):
        report_lines.append(f"    Size: {compute.size}")
    if hasattr(compute, 'state'):
        report_lines.append(f"    State: {compute.state}")
    print("\n".join(report_lines))
    print()

## 📊 Module 2: Data Management

In [None]:
# Create sample dataset
# Fixed seed so every run produces the same synthetic rows
np.random.seed(42)
n_samples = 1000

# (column, mean, std, lower clip, upper clip) for each normally distributed
# feature. The order matters: columns are drawn from the global RNG in this
# sequence, so reordering would change the generated values.
feature_specs = [
    ('age', 45, 15, 18, 80),
    ('income', 50000, 20000, 10000, 150000),
    ('credit_score', 650, 100, 300, 850),
    ('debt_ratio', 0.3, 0.2, 0, 1),
    ('employment_years', 10, 5, 0, 40),
]

data = {
    column: np.random.normal(mean, std, n_samples).clip(lower, upper)
    for column, mean, std, lower, upper in feature_specs
}
# The target is drawn last and independently of the features (30% zeros, 70% ones)
data['approved'] = np.random.choice([0, 1], n_samples, p=[0.3, 0.7])

df = pd.DataFrame(data)
target_counts = df['approved'].value_counts().to_dict()
print("‚úÖ Sample dataset created")
print(f"üìä Dataset shape: {df.shape}")
print(f"üéØ Target distribution: {target_counts}")

# Save locally
df.to_csv('credit_approval_data.csv', index=False)
print("üíæ Dataset saved as credit_approval_data.csv")

In [None]:
# Register dataset in Azure ML
# Describe the local CSV as a single-file data asset; no version is passed
data_asset = Data(
    name="credit-approval-data",
    description="Credit approval dataset for DP-100 practice",
    type=AssetTypes.URI_FILE,
    path="./credit_approval_data.csv",
)

# The returned object carries the registered name, version, and path
registered_data = ml_client.data.create_or_update(data_asset)

print("‚úÖ Dataset registered in Azure ML")
print(f"üìÅ Name: {registered_data.name}")
print(f"üè∑Ô∏è Version: {registered_data.version}")
print(f"üîó Path: {registered_data.path}")

In [None]:
# Explore the data
# Each section pairs a heading with a callable that builds the report, so a
# report is only computed when its turn to print arrives.
report_sections = [
    ("üìà Dataset Overview:", lambda: df.head()),
    ("üìä Statistical Summary:", lambda: df.describe()),
    ("üîç Data Types:", lambda: df.dtypes),
    ("‚ùì Missing Values:", lambda: df.isnull().sum()),
]

for position, (heading, build_report) in enumerate(report_sections):
    if position > 0:
        # Divider between consecutive sections (none before the first)
        print("\n" + "="*50)
    print(heading)
    print(build_report())

## 🤖 Module 3: Model Training

In [None]:
# Prepare data for training
# Every column except the target is a feature
y = df['approved']
X = df.drop(columns=['approved'])

# Split the data: 80/20, stratified on the target so both partitions keep
# the same class proportions; fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

train_distribution = y_train.value_counts().to_dict()
test_distribution = y_test.value_counts().to_dict()

print("üìä Data Split Summary:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")
print(f"\nTarget distribution:")
print(f"Train: {train_distribution}")
print(f"Test: {test_distribution}")

In [None]:
# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Make predictions: hard labels, plus the probability of class 1
lr_pred = lr_model.predict(X_test)
lr_prob = lr_model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out test set
lr_accuracy = accuracy_score(y_test, lr_pred)

print("üîç Logistic Regression Results:")
print(f"Accuracy: {lr_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lr_pred))
print("\nCoefficients:")
# coef_[0] holds one weight per feature column, in column order
lr_coefficients = pd.Series(lr_model.coef_[0], index=X.columns)
for feature, coef in lr_coefficients.items():
    print(f"{feature}: {coef:.4f}")

In [None]:
# Train Random Forest model
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

# Make predictions: hard labels, plus the probability of class 1
rf_pred = rf_model.predict(X_test)
rf_prob = rf_model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out test set
rf_accuracy = accuracy_score(y_test, rf_pred)

print("üå≤ Random Forest Results:")
print(f"Accuracy: {rf_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))
print("\nFeature Importances:")
# One importance score per feature column, in column order
rf_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
for feature, importance in rf_importances.items():
    print(f"{feature}: {importance:.4f}")

In [None]:
# Compare models
# One row per candidate: display name and test accuracy
models_comparison = pd.DataFrame(
    [
        ('Logistic Regression', lr_accuracy),
        ('Random Forest', rf_accuracy),
    ],
    columns=['Model', 'Accuracy'],
)

print("üèÜ Model Comparison:")
print(models_comparison)

# Determine best model (idxmax picks the first row on a tie)
accuracy_column = models_comparison['Accuracy']
best_model_name = models_comparison.loc[accuracy_column.idxmax(), 'Model']
best_accuracy = accuracy_column.max()

print(f"\nüéØ Best Model: {best_model_name} (Accuracy: {best_accuracy:.4f})")

# Select best model for registration; anything other than the forest
# falls back to logistic regression
forest_is_best = best_model_name == 'Random Forest'
best_model = rf_model if forest_is_best else lr_model
model_type = 'RandomForest' if forest_is_best else 'LogisticRegression'

print(f"üì¶ Selected model for registration: {model_type}")

## 📦 Module 4: Model Registration

In [None]:
# Save the best model locally
import os

# File name is derived from the model type, e.g. "<type>_model.pkl"
model_filename = f"{model_type.lower().replace(' ', '_')}_model.pkl"
joblib.dump(best_model, model_filename)

print(f"üíæ Model saved locally as: {model_filename}")

# Verify file exists and report its size on disk
if not os.path.exists(model_filename):
    print("‚ùå File not found")
else:
    file_size = os.path.getsize(model_filename)
    print(f"‚úÖ File exists (Size: {file_size} bytes)")

In [None]:
# Register model in Azure ML
# No explicit version is passed, matching how the data asset is registered
# in this notebook. A hard-coded version="1" makes the cell collide with the
# already-registered asset whenever the notebook is re-run after retraining;
# leaving it unset lets the service assign the next version.
# NOTE(review): auto-versioning is per the Azure ML docs - confirm against
# the installed azure-ai-ml release.
model_asset = Model(
    path=model_filename,
    name=f"dp100-{model_type.lower().replace(' ', '-')}-model",
    description=f"{model_type} model trained for DP-100 credit approval prediction",
    type=AssetTypes.CUSTOM_MODEL,
    # Property values are passed as strings, hence the str() conversions
    properties={
        "accuracy": str(best_accuracy),
        "algorithm": model_type,
        "dataset": "credit-approval-data",
        "training_samples": str(X_train.shape[0]),
        "features": str(X_train.shape[1])
    },
    tags={
        "project": "dp100-practice",
        "workspace": "machinelearning",
        "certification": "dp100"
    }
)

# The returned object reflects what the service stored, including the
# version that was actually assigned
registered_model = ml_client.models.create_or_update(model_asset)

print("‚úÖ Model registered successfully!")
print(f"üè∑Ô∏è Name: {registered_model.name}")
print(f"üìä Version: {registered_model.version}")
print(f"üÜî ID: {registered_model.id}")
print(f"üìù Description: {registered_model.description}")

# Tags are printed only when present and non-empty
if getattr(registered_model, 'tags', None):
    print(f"üè∑Ô∏è Tags: {registered_model.tags}")

## 🎯 Module 5: Practice Exercises

### Exercise 1: Data Visualization
Create visualizations to understand the credit approval dataset better.

In [None]:
# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Create comprehensive visualizations on a 2x3 grid: one histogram per
# feature, with the remaining panel used for the target distribution
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Credit Approval Dataset Analysis', fontsize=16, fontweight='bold')

# Distribution plots; zip stops at whichever runs out first, so extra
# features can never index past the grid
features = ['age', 'income', 'credit_score', 'debt_ratio', 'employment_years']
for ax, feature in zip(axes.flat, features):
    sns.histplot(data=df, x=feature, ax=ax, kde=True)
    ax.set_title(f'{feature.replace("_", " ").title()} Distribution')

# Target distribution in the last subplot (only if the features left it free)
if len(features) < axes.size:
    target_ax = axes[1, 2]
    # value_counts() orders rows by frequency, not by class label, so the
    # counts are reindexed to [0, 1] to line up with the fixed
    # ['Rejected', 'Approved'] labels. Without this the majority class is
    # drawn under 'Rejected' whenever approvals outnumber rejections.
    approval_counts = df['approved'].value_counts().reindex([0, 1], fill_value=0)
    target_ax.bar(['Rejected', 'Approved'], approval_counts.values,
                  color=['#ff6b6b', '#4ecdc4'], alpha=0.7)
    target_ax.set_title('Loan Approval Distribution')
    target_ax.set_ylabel('Count')

    # Add value labels slightly above each bar
    for bar_position, count in enumerate(approval_counts.values):
        target_ax.text(bar_position, count + 5, str(count), ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('credit_analysis_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Visualization saved as 'credit_analysis_visualization.png'")

### Exercise 2: Feature Engineering
Create new features and analyze their impact on model performance.

In [None]:
# Create new features
# Work on a copy so df itself is not modified
df_engineered = df.copy()

# Income to debt ratio; the 0.001 offset avoids division by zero when
# debt_ratio is exactly 0
df_engineered['income_to_debt'] = df_engineered['income'] / (df_engineered['debt_ratio'] + 0.001)

# Age groups
df_engineered['age_group'] = pd.cut(df_engineered['age'],
                                   bins=[0, 25, 35, 45, 55, 100],
                                   labels=['18-25', '26-35', '36-45', '46-55', '56+'])

# Credit score categories
df_engineered['credit_category'] = pd.cut(df_engineered['credit_score'],
                                         bins=[0, 580, 670, 740, 850],
                                         labels=['Poor', 'Fair', 'Good', 'Excellent'])

# Employment stability
# include_lowest=True keeps employment_years == 0 in the 'New' bucket.
# pd.cut intervals are right-closed by default, so without it a value of 0
# lies outside (0, 2], becomes NaN, and is then encoded as -1 by .cat.codes
# below.
df_engineered['employment_stability'] = pd.cut(df_engineered['employment_years'],
                                              bins=[0, 2, 5, 10, 40],
                                              labels=['New', 'Developing', 'Stable', 'Experienced'],
                                              include_lowest=True)

# Convert categorical to numeric (ordinal codes follow the label order above)
df_engineered['age_group_encoded'] = df_engineered['age_group'].cat.codes
df_engineered['credit_category_encoded'] = df_engineered['credit_category'].cat.codes
df_engineered['employment_stability_encoded'] = df_engineered['employment_stability'].cat.codes

print("‚úÖ New features created:")
print("‚Ä¢ income_to_debt: Income relative to debt")
print("‚Ä¢ age_group: Categorized age groups")
print("‚Ä¢ credit_category: Credit score categories")
print("‚Ä¢ employment_stability: Employment experience levels")

print(f"\nüìä Dataset shape after engineering: {df_engineered.shape}")
print("\nSample of engineered features:")
print(df_engineered[['age', 'age_group', 'credit_score', 'credit_category', 'income_to_debt']].head())

### Exercise 3: Model Improvement
Train a model with the engineered features and compare performance.

In [None]:
# Prepare engineered features
# Original numeric columns first, then the derived ratio and the three
# ordinal encodings
numerical_features = [
    'age',
    'income',
    'credit_score',
    'debt_ratio',
    'employment_years',
    'income_to_debt',
    'age_group_encoded',
    'credit_category_encoded',
    'employment_stability_encoded',
]

y_engineered = df_engineered['approved']
X_engineered = df_engineered[numerical_features]

# Split data: 80/20, stratified on the target, fixed seed
X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(
    X_engineered,
    y_engineered,
    test_size=0.2,
    stratify=y_engineered,
    random_state=42,
)

print("üîß Engineered Features Model Training:")
print(f"Features used: {len(numerical_features)}")
print(f"Training samples: {X_train_eng.shape[0]}")
print(f"Test samples: {X_test_eng.shape[0]}")

In [None]:
# Train model with engineered features
rf_engineered_settings = {
    'n_estimators': 200,
    'max_depth': 12,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
}
rf_engineered = RandomForestClassifier(**rf_engineered_settings)
rf_engineered.fit(X_train_eng, y_train_eng)

# Score the held-out engineered test set
rf_eng_pred = rf_engineered.predict(X_test_eng)
rf_eng_accuracy = accuracy_score(y_test_eng, rf_eng_pred)

print("üå≤ Random Forest with Engineered Features:")
print(f"Accuracy: {rf_eng_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_eng, rf_eng_pred))

# Feature importance for engineered features, highest first
feature_importance_df = pd.DataFrame({
    'feature': numerical_features,
    'importance': rf_engineered.feature_importances_
})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

print("\nüîç Top 5 Most Important Features:")
print(feature_importance_df.head())

# Compare with original model: a negative value means the engineered
# features scored below the baseline random forest
improvement = rf_eng_accuracy - rf_accuracy
print(f"\nüìà Accuracy Improvement: {improvement:.4f} ({improvement*100:.2f}%)")

## 🏆 Certification Practice Summary

In [None]:
# Practice completion summary
# "best_accuracy" is the highest test accuracy across all three models
# trained in this notebook. The engineered-feature forest is not guaranteed
# to beat the baselines (its improvement over the baseline forest can be
# negative), so its score is not assumed to be the best.
practice_summary = {
    "workspace": "Machinelearning",
    "location": "Canada East",
    "compute": "Standard_DS11_v2",
    "modules_completed": [
        "Environment Setup",
        "Data Management",
        "Model Training",
        "Model Registration",
        "Data Visualization",
        "Feature Engineering",
        "Model Improvement"
    ],
    "models_registered": 1,
    "datasets_created": 1,
    "best_accuracy": max(lr_accuracy, rf_accuracy, rf_eng_accuracy),
    "skills_practiced": [
        "Azure ML SDK",
        "Data Preparation",
        "Model Training",
        "Model Evaluation",
        "Feature Engineering",
        "Model Registration",
        "Data Visualization"
    ]
}

print("üéì DP-100 Practice Summary for Machinelearning Workspace:")
print("=" * 60)
# Keys are rendered as titles, e.g. "modules_completed" -> "Modules Completed"
for key, value in practice_summary.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

print("\nüéØ Ready for DP-100 Certification!")
print("Next: Practice with AIintern workspace or deploy models to production.")