# IBM Applied Data Science Capstone
## Part 2: Data Wrangling

**Objective:** Clean, transform, and prepare data for analysis

**Author:** Son Nguyen

---


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning so library notices do not clutter cell output.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# the wrangling steps below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set style
# Apply matplotlib's bundled seaborn style sheet first and seaborn's
# "whitegrid" second. In the opposite order the style sheet overwrites the
# axes/grid rcParams and the "whitegrid" choice is lost.
plt.style.use('seaborn-v0_8')
sns.set_style("whitegrid")

print("Libraries imported successfully!")


In [None]:
# Read the raw automotive sales table from the project's data directory.
df = pd.read_csv('../data/automotive_sales.csv')

# Quick sanity check: report the frame's dimensions and its column names.
print("Dataset loaded: {}".format(df.shape))
print("Columns: {}".format(df.columns.tolist()))


## 2. Data Cleaning

### 2.1 Check for Missing Values


In [None]:
# Check missing values
# Count the missing entries in each column and express every count as a
# percentage of the total number of rows.
missing_values = df.isnull().sum()
missing_pct = (missing_values / len(df)) * 100

# One row per column of `df`, holding both the count and the percentage.
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_pct
})

print("Missing Values Summary:")

# Show only the columns that actually contain gaps. When there are none,
# print the confirmation message instead of an empty DataFrame repr.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]
if columns_with_missing.empty:
    print("\n✓ No missing values found!")
else:
    print(columns_with_missing)


### 2.2 Feature Engineering


In [None]:
# Create Date column
# Every row is pinned to day 1 of its Year/Month; pd.to_datetime assembles the
# timestamp from the year/month/day columns of the frame it is given.
df['Date'] = pd.to_datetime(df[['Year', 'Month']].assign(Day=1))

# Create Quarter column
# Vectorized integer arithmetic: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
df['Quarter'] = (df['Month'] - 1) // 3 + 1

# Create Price Category
# The top bin is open-ended so a Price above 100 is still labelled 'Luxury'
# rather than silently turning into NaN. Bins are right-closed, i.e.
# (0, 20], (20, 30], (30, 40], (40, inf), so a Price of 0 or below still
# falls outside every bin.
df['Price_Category'] = pd.cut(
    df['Price'],
    bins=[0, 20, 30, 40, np.inf],
    labels=['Budget', 'Mid-range', 'Premium', 'Luxury']
)

# The missing-value check runs before this cell, so surface any rows that the
# binning left without a category (missing or non-positive Price).
uncategorized_rows = int(df['Price_Category'].isna().sum())
if uncategorized_rows:
    print(f"Warning: {uncategorized_rows} row(s) have no Price_Category "
          "(missing or non-positive Price)")

# Create Sales Category (for classification later)
# Binary target split at the median: 1 when Sales is strictly above the
# median, 0 otherwise (rows exactly at the median are labelled 'Low Sales').
sales_median = df['Sales'].median()
df['Sales_Category'] = (df['Sales'] > sales_median).astype(int)
df['Sales_Category_Label'] = df['Sales_Category'].map({0: 'Low Sales', 1: 'High Sales'})

# Create Economic Index (composite)
# Equal-weighted blend of GDP / 100 and (100 - Unemployment_Rate) / 100,
# rounded to three decimals.
# NOTE(review): assumes Unemployment_Rate is expressed in percent — confirm.
df['Economic_Index'] = (
    (df['GDP'] / 100) * 0.5 +
    ((100 - df['Unemployment_Rate']) / 100) * 0.5
).round(3)

print("New features created:")
print(df[['Date', 'Quarter', 'Price_Category', 'Sales_Category_Label', 'Economic_Index']].head())


In [None]:
# Write the engineered frame next to the raw file; the row index is not saved.
df.to_csv('../data/automotive_sales_cleaned.csv', index=False)

# Confirm the write and summarize what ended up in the output file.
print("Cleaned dataset saved!")
print("Final shape: {}".format(df.shape))
print("\nColumns in cleaned dataset: {}".format(df.columns.tolist()))
