# **AI TECH INSTITUTE** · *Intermediate AI & Data Science*
### Week 01 · Notebook 03 — Data Wrangling & Transformation
**Instructor:** Amir Charkhi  |  **Goal:** Master real-world data manipulation techniques.

> Format: practical scenarios → powerful pandas methods → data ready for analysis.


---
## Real Data is Messy!
Let's load and clean actual messy data - the skills you'll use every day.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Display settings for the notebook: never hide columns, but keep
# printed frames short by limiting how many rows are shown.
display_settings = (
    ('display.max_columns', None),
    ('display.max_rows', 10),
)
for option_name, option_value in display_settings:
    pd.set_option(option_name, option_value)

print("Ready to wrangle! 🛠️")

## 1. Reading Data from Files

In [None]:
# Create sample CSV data (simulating a file).
# The gaps are deliberate: order 1003 has no product, order 1004 has no
# price and order 1006 has no customer name.
csv_data = """order_id,customer_name,product,quantity,price,order_date
1001,Alice Smith,Laptop,1,1200.00,2025-08-15
1002,Bob Jones,Mouse,2,25.50,2025-08-15
1003,Charlie Brown,,1,80.00,2025-08-16
1004,Alice Smith,Monitor,1,,2025-08-16
1005,Diana Prince,Keyboard,3,75.00,2025-08-17
1006,,Webcam,1,120.00,2025-08-17
1007,Bob Jones,Laptop,1,1200,2025-08-18"""

# Save to file and read back.
# The encoding is spelled out on both sides so the bytes on disk do not
# depend on the platform's default locale encoding.
with open('orders.csv', 'w', encoding='utf-8') as f:
    f.write(csv_data)

# Read the CSV, parsing order_date into datetimes instead of strings
orders_df = pd.read_csv('orders.csv', parse_dates=['order_date'], encoding='utf-8')
print("Raw data from CSV:")
print(orders_df)
print("\nData types:")
print(orders_df.dtypes)
print("\nMissing values:")
print(orders_df.isnull().sum())

## 2. Cleaning Missing and Incorrect Data

In [None]:
# Make a copy for cleaning so the raw orders_df stays untouched
clean_df = orders_df.copy()

# Handle missing customer names.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on clean_df['customer_name']: that selection is a
# temporary object, so under pandas Copy-on-Write the in-place fill never
# reaches clean_df (pandas 2.x already emits a FutureWarning for it).
clean_df['customer_name'] = clean_df['customer_name'].fillna('Unknown Customer')

# Handle missing products: row 2 (order 1003) is the one without a product
clean_df.loc[2, 'product'] = 'Keyboard'  # Reasonable guess based on price

# Handle missing prices (use average for that product).
# Rows are selected by condition instead of a hard-coded row label, so
# the fix still targets the right rows if the data is reordered.
monitor_avg_price = 350.00  # Domain knowledge
monitor_without_price = clean_df['product'].eq('Monitor') & clean_df['price'].isna()
clean_df.loc[monitor_without_price, 'price'] = monitor_avg_price

# Ensure price is float; anything unparseable becomes NaN
clean_df['price'] = pd.to_numeric(clean_df['price'], errors='coerce')

# Calculate total
clean_df['total'] = clean_df['quantity'] * clean_df['price']

print("Cleaned data:")
print(clean_df)
print("\nRevenue by customer:")
print(clean_df.groupby('customer_name')['total'].sum().sort_values(ascending=False))

**Exercise 1 — Data Quality Check (medium)**  
Create a function that returns a data quality report: % complete, unique counts, and outliers.


In [None]:
# Your turn


<details>
<summary><b>Solution</b></summary>

```python
def data_quality_report(df, iqr_multiplier=1.5):
    """Build a small data quality report for a DataFrame.

    Args:
        df: DataFrame to inspect.
        iqr_multiplier: Width of the outlier fence in IQR units. A value
            is an outlier when it lies below Q1 - iqr_multiplier * IQR or
            above Q3 + iqr_multiplier * IQR. Defaults to 1.5.

    Returns:
        dict with three entries:
            'completeness'  - Series of percent non-null values per column
            'unique_counts' - Series of distinct value counts per column
                              (as reported by DataFrame.nunique)
            'outliers'      - dict mapping each numeric column name to its
                              outlier count as a plain int
    """
    report = {}

    # Completeness: share of non-null cells per column, as a percentage
    report['completeness'] = (1 - df.isnull().sum() / len(df)) * 100

    # Unique counts
    report['unique_counts'] = df.nunique()

    # Outliers for numeric columns (using IQR)
    outliers = {}
    for col in df.select_dtypes(include=[np.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        fence = iqr_multiplier * (q3 - q1)
        is_outlier = (df[col] < q1 - fence) | (df[col] > q3 + fence)
        # Cast to a built-in int so the printed dict shows plain numbers
        # instead of NumPy scalar reprs.
        outliers[col] = int(is_outlier.sum())
    report['outliers'] = outliers

    return report

# Run the report on the cleaned orders and print it section by section
quality_report = data_quality_report(clean_df)
print("Data Quality Report:")
for section_name in quality_report:
    print(f"\n{section_name}:")
    print(quality_report[section_name])
```
</details>

## 3. Merging and Joining DataFrames

In [None]:
# Two related tables that share the customer_id key
customers = pd.DataFrame(dict(
    customer_id=[1, 2, 3, 4],
    name=['Alice Smith', 'Bob Jones', 'Charlie Brown', 'Diana Prince'],
    city=['Perth', 'Sydney', 'Melbourne', 'Brisbane'],
    member_since=['2023-01-15', '2023-06-20', '2024-02-10', '2024-08-01'],
))

orders = pd.DataFrame(dict(
    order_id=[101, 102, 103, 104, 105],
    customer_id=[1, 2, 1, 3, 1],
    amount=[150, 250, 100, 300, 175],
    date=pd.date_range('2025-08-20', periods=5),
))

print("Customers:")
print(customers)
print("\nOrders:")
print(orders)

# Left join: keep every order and attach the matching customer details
merged = orders.merge(customers, on='customer_id', how='left')
print("\nMerged data:")
print(merged)

# Inner join: only rows whose customer_id appears in both tables
print("\nInner join (only matching):")
inner_join = orders.merge(customers, on='customer_id', how='inner')
print(f"Rows: {len(inner_join)}")

# Outer join: rows from both tables; the _merge column added by
# indicator=True records which side each row came from
print("\nOuter join (all records):")
outer_join = orders.merge(
    customers,
    on='customer_id',
    how='outer',
    indicator=True,
)
print(outer_join)

## 4. Grouping and Aggregation

In [None]:
# Build a reproducible sales table. The random columns are drawn in the
# order store -> product -> quantity -> revenue after seeding, so the
# generated values stay the same from run to run.
np.random.seed(42)
n_sales = 20
sales = pd.DataFrame({
    'date': pd.date_range('2025-08-01', periods=n_sales),
    'store': np.random.choice(['Store_A', 'Store_B', 'Store_C'], size=n_sales),
    'product': np.random.choice(['Laptop', 'Phone', 'Tablet'], size=n_sales),
    'quantity': np.random.randint(1, 10, size=n_sales),
    'revenue': np.random.randint(100, 2000, size=n_sales),
})

print("Sales data:")
print(sales.head(10))

# Per-store totals: units sold plus revenue sum / mean / number of sales
store_aggregations = {
    'quantity': 'sum',
    'revenue': ['sum', 'mean', 'count'],
}
store_summary = sales.groupby('store').agg(store_aggregations)
print("\nStore summary:")
print(store_summary)

# Two grouping keys: products as rows, stores as columns, 0 where a
# product/store combination has no sales
revenue_by_pair = sales.groupby(['product', 'store'])['revenue'].sum()
product_store = revenue_by_pair.unstack(fill_value=0)
print("\nRevenue by product and store:")
print(product_store)

# Derived columns
sales['revenue_per_unit'] = sales['revenue'].div(sales['quantity'])
sales['day_of_week'] = sales['date'].dt.day_name()

# Mean revenue for each weekday name
revenue_by_day = sales.groupby('day_of_week')['revenue']
daily_pattern = revenue_by_day.mean().round(2)
print("\nAverage revenue by day:")
print(daily_pattern)

**Exercise 2 — Customer Segmentation (hard)**  
Group customers by total spend into segments: VIP (>500), Regular (200-500), New (<200).


In [None]:
# Your turn


<details>
<summary><b>Solution</b></summary>

```python
# Total order amount per customer name, with descriptive column names
customer_totals = (
    merged.groupby('name', as_index=False)['amount']
    .sum()
    .rename(columns={'name': 'customer', 'amount': 'total_spend'})
)

# Create segments
def segment_customer(spend, vip_threshold=500, regular_threshold=200):
    """Map a customer's total spend to a segment label.

    Args:
        spend: Total amount spent by the customer.
        vip_threshold: Spend strictly above this value is 'VIP'.
            Defaults to 500.
        regular_threshold: Spend from this value up to and including
            vip_threshold is 'Regular'. Defaults to 200.

    Returns:
        'VIP', 'Regular' or 'New'. Any value that passes neither
        comparison (below regular_threshold, or NaN) is labelled 'New'.
    """
    if spend > vip_threshold:
        return 'VIP'
    if spend >= regular_threshold:
        return 'Regular'
    return 'New'

# Label every customer by passing their total spend through segment_customer
customer_totals['segment'] = customer_totals['total_spend'].map(segment_customer)

print("Customer segments:")
ranked_customers = customer_totals.sort_values('total_spend', ascending=False)
print(ranked_customers)

# Segment summary: number of customers per segment
print("\nSegment distribution:")
segment_counts = customer_totals['segment'].value_counts()
print(segment_counts)
```
</details>

## 5. Pivoting and Reshaping

In [None]:
# Create long format data: 4 dates x 3 metrics, one row per (date, metric).
# Each date is repeated once per metric so that every date has a Sales,
# a Costs and a Profit row; with 12 distinct dates the pivot below would
# contain a single value per row and NaN everywhere else.
metrics = ['Sales', 'Costs', 'Profit']
long_data = pd.DataFrame({
    'date': pd.date_range('2025-08-01', periods=4).repeat(len(metrics)),
    'metric': metrics * 4,
    'value': np.random.randint(1000, 5000, 12)
})

print("Long format:")
print(long_data)

# Pivot to wide format: one row per date, one column per metric
wide_data = long_data.pivot(index='date', columns='metric', values='value')
print("\nWide format (pivoted):")
print(wide_data.head())

# Melt back to long format: one row per (date, metric) pair again
melted = wide_data.reset_index().melt(id_vars='date', var_name='metric', value_name='amount')
print("\nMelted back to long:")
print(melted.head())

# Revenue summed per store (rows) and product (columns); empty cells are
# shown as 0 and margins=True adds the "All" totals row and column.
pivot_table = pd.pivot_table(
    sales,
    index='store',
    columns='product',
    values='revenue',
    aggfunc='sum',
    fill_value=0,
    margins=True,
)
print("\nPivot table with totals:")
print(pivot_table)

## 6. String Operations and Data Types

In [None]:
# Contact records with inconsistent casing, padding and phone formats
contacts = pd.DataFrame({
    'name': ['  john smith  ', 'JANE DOE', 'Bob Johnson Jr.', 'alice wong'],
    'email': ['John.Smith@GMAIL.com', 'jane@company.COM', 'bob@email.co', 'Alice@Email.net'],
    'phone': ['0412-345-678', '(04) 9876 5432', '0401234567', '04 1111 2222']
})

print("Messy contact data:")
print(contacts)

# Add the cleaned columns in one pass. Later entries may read columns
# created by earlier ones (domain is derived from email_clean).
contacts = contacts.assign(
    # Trim surrounding whitespace, then capitalise each word
    name_clean=lambda frame: frame['name'].str.strip().str.title(),
    # Lower-case the whole address
    email_clean=lambda frame: frame['email'].str.lower(),
    # The part after the '@'
    domain=lambda frame: frame['email_clean'].str.split('@').str[1],
    # Keep digits only
    phone_clean=lambda frame: frame['phone'].str.replace(r'[^0-9]', '', regex=True),
)

print("\nCleaned contacts:")
cleaned_view = ['name_clean', 'email_clean', 'phone_clean', 'domain']
print(contacts[cleaned_view])

## 7. Date and Time Operations

In [None]:
# Daily series: random base sales plus a sine wave with a 30-day period
n_days = 100
dates = pd.date_range('2025-01-01', periods=n_days, freq='D')
base_sales = np.random.randint(1000, 5000, n_days)
seasonal_swing = np.sin(np.arange(n_days) * 2 * np.pi / 30) * 500
ts_data = pd.DataFrame({
    'date': dates,
    'sales': base_sales + seasonal_swing,
})

# Calendar parts pulled out of the date column
date_parts = ts_data['date'].dt
ts_data['year'] = date_parts.year
ts_data['month'] = date_parts.month
ts_data['day_of_week'] = date_parts.day_name()
ts_data['week'] = date_parts.isocalendar().week

print("Time series with date components:")
print(ts_data.head())

# Weekly buckets: mean, total and standard deviation of daily sales
daily_sales = ts_data.set_index('date')['sales']
weekly = daily_sales.resample('W').agg(['mean', 'sum', 'std'])
print("\nWeekly aggregation:")
print(weekly.head())

# 7-row rolling statistics; the first six rows have no full window yet
sales_window = ts_data['sales'].rolling(window=7)
ts_data['rolling_mean_7d'] = sales_window.mean()
ts_data['rolling_std_7d'] = sales_window.std()

print("\nWith rolling statistics:")
rolling_view = ['date', 'sales', 'rolling_mean_7d', 'rolling_std_7d']
print(ts_data[rolling_view].tail())

**Exercise 3 — Time Series Analysis (hard)**  
Find the best and worst performing days of the week, and calculate week-over-week growth.


In [None]:
# Your turn


<details>
<summary><b>Solution</b></summary>

```python
# Best/worst days: rank weekday names by their average sales
day_performance = ts_data.groupby('day_of_week')['sales'].agg(['mean', 'std'])
day_performance = day_performance.sort_values('mean', ascending=False)
print("Day of week performance:")
print(day_performance)
print(f"\nBest day: {day_performance.index[0]}")
print(f"Worst day: {day_performance.index[-1]}")

# Week-over-week growth.
# 'days' counts how many daily rows each week contains. A week that is cut
# off at the start or end of the data has fewer than 7, and comparing its
# total against a full week would show a large swing that is only an
# artefact of the missing days.
# NOTE: grouping by week number alone is fine while the data stays within
# one year; for multi-year data group by ISO year and week together.
weekly_sales = (
    ts_data.groupby('week')['sales']
    .agg(sales='sum', days='count')
    .reset_index()
)

# Growth is reported only where this week and the previous one are both
# complete; every other row is left as NaN.
is_full_week = weekly_sales['days'].eq(7)
comparable = is_full_week & is_full_week.shift(fill_value=False)
weekly_sales['wow_growth'] = (weekly_sales['sales'].pct_change() * 100).where(comparable)
print("\nWeek-over-week growth:")
print(weekly_sales.head(10))
```
</details>

## 8. Mini-Challenges
- **M1 (easy):** Create a function to detect duplicate rows based on a subset of columns
- **M2 (medium):** Implement a data validation function that checks data types and ranges
- **M3 (hard):** Create a pipeline that cleans, transforms, and aggregates raw sales data

In [None]:
# Your turn - try the challenges!


<details>
<summary><b>Solutions</b></summary>

```python
# M1 - Duplicate detection
def find_duplicates(df, subset=None):
    """Return every row that has at least one duplicate, sorted so that
    duplicates sit next to each other.

    Args:
        df: DataFrame to search.
        subset: Column label or collection of labels (list, tuple, Index,
            array) that define a duplicate. None or an empty collection
            means the sort uses all columns.

    Returns:
        DataFrame holding all members of each duplicate group
        (keep=False), sorted by the comparison columns. Rows with equal
        keys keep their original relative order.
    """
    # Work out the sort keys without relying on the truthiness of subset,
    # which raises for pandas Index objects and NumPy arrays.
    if subset is None:
        sort_keys = []
    elif not pd.api.types.is_list_like(subset) or (
        isinstance(subset, tuple) and subset in df.columns
    ):
        # A single label, including a tuple that names one column
        sort_keys = [subset]
    else:
        sort_keys = list(subset)
    if not sort_keys:
        sort_keys = df.columns.tolist()

    duplicates = df[df.duplicated(subset=subset, keep=False)]
    # A stable sort keeps tied rows in their original order
    return duplicates.sort_values(sort_keys, kind='stable')

# M2 - Data validation
def validate_data(df, rules):
    """
    Check DataFrame columns against per-column rules and list the problems.

    rules = {
        'column_name': {'type': str, 'min': 0, 'max': 100, 'required': True}
    }

    Every key of a rule is optional:
        'type'     - each value must be an instance of this type
        'min'      - values below this bound are reported
        'max'      - values above this bound are reported
        'required' - if truthy, a missing column is reported

    A column that is absent from df is reported only when it is required;
    its other checks are skipped either way.

    Returns:
        list of human-readable issue strings, in rule order; empty when
        nothing was found.
    """
    issues = []
    for col, rule in rules.items():
        if col not in df.columns:
            # Nothing to inspect; indexing df[col] here would raise KeyError
            if rule.get('required'):
                issues.append(f"Missing required column: {col}")
            continue

        values = df[col]

        if 'type' in rule:
            expected_type = rule['type']
            wrong_type = values.apply(lambda x: not isinstance(x, expected_type))
            if wrong_type.any():
                issues.append(f"{col}: {wrong_type.sum()} type mismatches")

        if 'min' in rule:
            below_min = values < rule['min']
            if below_min.any():
                issues.append(f"{col}: {below_min.sum()} values below {rule['min']}")

        if 'max' in rule:
            above_max = values > rule['max']
            if above_max.any():
                issues.append(f"{col}: {above_max.sum()} values above {rule['max']}")

    return issues

# M3 - Data pipeline
def sales_pipeline(raw_df):
    """Clean, enrich and aggregate raw sales rows.

    Steps:
        1. Drop rows that lack a product or a revenue value.
        2. Convert 'date' to datetime and derive a monthly period plus a
           Low / Medium / High revenue category.
        3. Group by month and product.

    Args:
        raw_df: DataFrame with 'date', 'product', 'revenue' and
            'quantity' columns. It is not modified.

    Returns:
        DataFrame indexed by (month, product) with revenue sum / mean /
        count and quantity sum, rounded to 2 decimals.
    """
    revenue_bins = [0, 500, 1000, float('inf')]
    revenue_labels = ['Low', 'Medium', 'High']

    # Clean + transform as one chain; each step returns a new frame
    prepared = (
        raw_df
        .dropna(subset=['product', 'revenue'])
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .assign(
            month=lambda frame: frame['date'].dt.to_period('M'),
            revenue_category=lambda frame: pd.cut(
                frame['revenue'],
                bins=revenue_bins,
                labels=revenue_labels,
            ),
        )
    )

    # Aggregate
    aggregations = {
        'revenue': ['sum', 'mean', 'count'],
        'quantity': 'sum',
    }
    return prepared.groupby(['month', 'product']).agg(aggregations).round(2)

# Test pipeline
# Run the pipeline on the sales table and show the first rows of the summary
result = sales_pipeline(sales)
print("Pipeline output:", result.head(), sep="\n")
```
</details>

## Wrap-Up
✅ You can read and clean messy real-world data  
✅ You mastered merging, grouping, and pivoting  
✅ You can handle dates, strings, and missing values  

**Next:** EDA - Exploring and understanding your cleaned data!
