# Azure Demand Forecasting — Data Cleaning & Preprocessing
Infosys Springboard Internship Notebook

This notebook performs:
- Data loading
- Data inspection
- Missing value check (per-column counts)
- Type conversion
- Outlier removal (IQR method)
- Feature engineering for time series
- Encoding categorical variables
- Train/Test split for forecasting

In [None]:
import pandas as pd
import numpy as np

# Read the raw CSV from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='azure_demand_forecasting_dataset.csv')
df.head(n=5)

## Basic Info & Structure

In [None]:
# df.info() prints its report and returns None, so it runs as its own statement
# instead of inside the displayed tuple (where it showed up as a stray None).
df.info()
df.shape, df.columns

## Convert Timestamp to Datetime

In [None]:
# Parse the timestamp column into datetimes, then list the resulting dtypes.
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))
df.dtypes

## Check Missing Values

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

## Remove Duplicate Rows

In [None]:
# Discard rows that repeat an earlier row in every column, keeping the first
# occurrence, then show the remaining (rows, columns) count.
df = df.drop_duplicates(subset=None, keep='first')
df.shape

## Sort by Time (Important for Forecasting)

In [None]:
# Order rows from earliest to latest timestamp; the later split relies on time order.
df = df.sort_values(by='timestamp', ascending=True)
df.head()

## Handle Invalid Values

In [None]:
# Floor usage, capacity and cost at zero so none of them is negative.
for non_negative_col in ('usage_units', 'provisioned_capacity', 'cost_usd'):
    df[non_negative_col] = df[non_negative_col].clip(lower=0)

## Outlier Detection (IQR Method)

In [None]:
def remove_outliers(col, data=None, k=1.5):
    """Return the rows whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``col``. Both fences are inclusive. Rows
    where ``col`` is NaN fail the range test and are therefore dropped too.

    Args:
        col: Name of the numeric column to filter on.
        data: DataFrame to filter. When omitted, the notebook-level ``df`` is
            looked up at call time, so ``remove_outliers(col)`` keeps working
            exactly as before.
        k: Multiplier applied to the IQR to place the fences (default 1.5).

    Returns:
        A new DataFrame holding only the rows inside the fences. It is an
        explicit copy, so adding columns to it later does not raise
        SettingWithCopyWarning; the input frame is left untouched.
    """
    frame = df if data is None else data
    q1, q3 = frame[col].quantile([0.25, 0.75])
    spread = q3 - q1
    lower = q1 - k * spread
    upper = q3 + k * spread
    # between() is inclusive on both ends and evaluates to False for NaN.
    return frame[frame[col].between(lower, upper)].copy()

# Filter on each column in turn; df is rebound after every pass, so the second
# filter is computed on the rows that survived the first.
outlier_columns = ('usage_units', 'cost_usd')
for column_name in outlier_columns:
    df = remove_outliers(column_name)

df.shape

## Feature Engineering — Time Features

In [None]:
# Derive calendar features from the timestamp column, added in this order:
# year, month, day, day_of_week, weekofyear (ISO week number as a plain int).
stamp = df['timestamp'].dt
calendar_features = {
    'year': stamp.year,
    'month': stamp.month,
    'day': stamp.day,
    'day_of_week': stamp.dayofweek,
    'weekofyear': stamp.isocalendar().week.astype(int),
}
for feature_name, feature_values in calendar_features.items():
    df[feature_name] = feature_values

df.head()

## Seasonal Flags

In [None]:
# 1 when day_of_week is 5 or 6 (Saturday/Sunday with pandas' Monday=0 numbering), else 0.
weekend_days = (5, 6)
df['is_weekend'] = df['day_of_week'].isin(weekend_days).astype(int)

## Encode Categorical Columns

In [None]:
# One-hot encode the categorical columns; drop_first omits the first level of
# each so the remaining indicator columns are not redundant.
categorical_cols = ['region', 'service_type']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head()

## Standardize Numeric Features (z-score scaling)

In [None]:
from sklearn.preprocessing import StandardScaler

num_cols = ['usage_units','provisioned_capacity','cost_usd','availability_pct']

# Fit the scaler only on the training period (rows up to the 80th-percentile
# timestamp, the same cut-off rule the train/test split applies) so that the
# mean and variance of the held-out period do not leak into the features.
# The fitted transform is then applied to every row.
fit_cutoff = df['timestamp'].quantile(0.8)
scaler = StandardScaler()
scaler.fit(df.loc[df['timestamp'] <= fit_cutoff, num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# NOTE(review): usage_units is scaled here as well; if it is the forecast
# target, keep `scaler` around to inverse-transform predictions — confirm.

## Train-Test Split (Time Series Safe)

In [None]:
# Chronological hold-out: the 80th-percentile timestamp is the cut-off.
# Rows at or before it form the training set; strictly later rows form the test set.
split_date = df['timestamp'].quantile(0.8)

in_train_period = df['timestamp'].le(split_date)
in_test_period = df['timestamp'].gt(split_date)
train = df[in_train_period]
test = df[in_test_period]

train.shape, test.shape

## Save Cleaned Dataset

In [None]:
# Write the whole preprocessed frame (train and test rows together) to CSV,
# leaving out the index column, then confirm on stdout.
output_file = 'azure_cleaned_preprocessed.csv'
df.to_csv(output_file, index=False)
print('Saved cleaned dataset')