# Azure Demand Forecasting – Milestone 1
## Notebook 01: Data Loading & Initial EDA

**Team:** Backend-B  
**Milestone:** Data Collection & Preparation  
**Date:** 20-08-25


In [10]:
import os

# Show the kernel's working directory; the relative '../data/raw/...' paths
# used when loading the CSVs are resolved against it.
notebook_cwd = os.getcwd()
print(notebook_cwd)


d:\infosysspringboard projects\project1-1stmilestine\AZURE_BACKEND_TEAM-B\notebooks


In [11]:
# Import libraries
import pandas as pd

# Raw CSV locations, given relative to the kernel's working directory
azure_usage_path = '../data/raw/azure_usage.csv'
external_factors_path = '../data/raw/external_factors.csv'

# Read both raw datasets into DataFrames
azure_df = pd.read_csv(azure_usage_path)
external_df = pd.read_csv(external_factors_path)

# Preview the head of each frame: Azure usage first, then external factors
display(azure_df.head(), external_df.head())


Unnamed: 0,date,region,resource_type,usage_cpu,usage_storage,users_active
0,2023-01-01,East US,VM,88,1959,470
1,2023-01-01,East US,Storage,92,1595,388
2,2023-01-01,East US,Container,70,621,414
3,2023-01-01,West US,VM,60,1982,287
4,2023-01-01,West US,Storage,85,1371,351


Unnamed: 0,date,economic_index,cloud_market_demand,holiday
0,2023-01-01,104.97,0.99,1
1,2023-01-02,106.48,1.15,0
2,2023-01-03,97.66,0.98,0
3,2023-01-04,115.79,1.08,0
4,2023-01-05,95.31,1.05,0


In [12]:
# Parse the 'date' column of both frames with pd.to_datetime
for frame in (azure_df, external_df):
    frame['date'] = pd.to_datetime(frame['date'])

# Report the per-column count of missing values, Azure usage first
for frame in (azure_df, external_df):
    print(frame.isnull().sum())


date             0
region           0
resource_type    0
usage_cpu        0
usage_storage    0
users_active     0
dtype: int64
date                   0
economic_index         0
cloud_market_demand    0
holiday                0
dtype: int64


In [13]:
# Print summary statistics for azure_df
# (header wording matches the "External Factors Dataset Summary" header below)
print("Azure Usage Dataset Summary")
print("=" * 30)
print(f"Total entries: {len(azure_df)}")
# .date() drops the time-of-day part so only the calendar date is printed
print(f"Date range: {azure_df['date'].min().date()} to {azure_df['date'].max().date()}")
# Each categorical column is reported as "<distinct count> - <list of distinct values>"
print(f"Unique regions: {azure_df['region'].nunique()} - {azure_df['region'].unique().tolist()}")
print(f"Unique resource_types: {azure_df['resource_type'].nunique()} - {azure_df['resource_type'].unique().tolist()}")
# Column means, formatted to two decimal places
print(f"Average CPU usage: {azure_df['usage_cpu'].mean():.2f}%")
print(f"Average Storage usage: {azure_df['usage_storage'].mean():.2f} units")
print(f"Average active users: {azure_df['users_active'].mean():.2f}")

# Print summary statistics for external_df
print("\nExternal Factors Dataset Summary")
print("=" * 30)
print(f"Total entries: {len(external_df)}")
# Each average is printed only when its column is present in external_df
if 'economic_index' in external_df.columns:
    print(f"Average Economic Index: {external_df['economic_index'].mean():.2f}")
if 'cloud_market_demand' in external_df.columns:
    print(f"Average Cloud Market Demand: {external_df['cloud_market_demand'].mean():.2f}")


Azure Usage Dataset infomation
Total entries: 1080
Date range: 2023-01-01 to 2023-03-31
Unique regions: 4 - ['East US', 'West US', 'North Europe', 'Southeast Asia']
Unique resource_types: 3 - ['VM', 'Storage', 'Container']
Average CPU usage: 74.65%
Average Storage usage: 1242.48 units
Average active users: 352.69

External Factors Dataset Summary
Total entries: 90
Average Economic Index: 98.93
Average Cloud Market Demand: 1.01


In [14]:
# Basic stats as evidence of loading: describe() tables for both frames
usage_stats = azure_df.describe()
external_stats = external_df.describe()
display(usage_stats, external_stats)


Unnamed: 0,date,usage_cpu,usage_storage,users_active
count,1080,1080.0,1080.0,1080.0
mean,2023-02-14 12:00:00,74.651852,1242.477778,352.694444
min,2023-01-01 00:00:00,50.0,500.0,200.0
25%,2023-01-23 00:00:00,62.0,881.75,283.0
50%,2023-02-14 12:00:00,75.0,1262.0,353.0
75%,2023-03-09 00:00:00,87.0,1609.0,427.0
max,2023-03-31 00:00:00,99.0,1995.0,499.0
std,,14.549621,432.444284,86.28015


Unnamed: 0,date,economic_index,cloud_market_demand,holiday
count,90,90.0,90.0,90.0
mean,2023-02-14 12:00:00,98.930444,1.006667,0.277778
min,2023-01-01 00:00:00,73.8,0.8,0.0
25%,2023-01-23 06:00:00,93.035,0.94,0.0
50%,2023-02-14 12:00:00,99.645,1.015,0.0
75%,2023-03-08 18:00:00,103.5675,1.0775,1.0
max,2023-03-31 00:00:00,118.86,1.27,1.0
std,,8.825072,0.101517,0.450412


# Final Observations / Summary


## Azure Usage


In [15]:
# Azure Usage: headline figures for the final observations
users_active_col = azure_df['users_active']

num_records = azure_df.shape[0]
regions_covered = azure_df['region'].unique().tolist()
avg_cpu_usage = azure_df['usage_cpu'].mean()
avg_storage_usage = azure_df['usage_storage'].mean()
active_users_min, active_users_max = users_active_col.min(), users_active_col.max()

# One print call; sep="\n" puts each figure on its own line
print(
    f"Number of records: {num_records}",
    f"Regions covered: {regions_covered}",
    f"Average CPU usage: {avg_cpu_usage:.2f}",
    f"Average Storage usage: {avg_storage_usage:.2f}",
    f"Active users range: {active_users_min} - {active_users_max}",
    sep="\n",
)

Number of records: 1080
Regions covered: ['East US', 'West US', 'North Europe', 'Southeast Asia']
Average CPU usage: 74.65
Average Storage usage: 1242.48
Active users range: 200 - 499


## External Data


In [16]:
# External Data: headline figures for the final observations
econ_index_col = external_df['economic_index']
market_demand_col = external_df['cloud_market_demand']

economic_index_min, economic_index_max = econ_index_col.min(), econ_index_col.max()
market_demand_mean, market_demand_std = market_demand_col.mean(), market_demand_col.std()
holiday_unique_values = external_df['holiday'].unique()

# One print call; sep="\n" puts each figure on its own line
print(
    f"Economic Index range: {economic_index_min} - {economic_index_max}",
    f"Market Demand: Mean = {market_demand_mean:.3f}, Std = {market_demand_std:.3f}",
    f"Holidays/weekend indicator included, unique values: {holiday_unique_values}",
    sep="\n",
)

Economic Index range: 73.8 - 118.86
Market Demand: Mean = 1.007, Std = 0.102
Holidays/weekend indicator included, unique values: [1 0]
