# Electric Vehicle Feature Engineering

This notebook focuses on creating meaningful features for ML models.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

: 

In [None]:
# Read the processed EV dataset from disk into `df`, the frame every later
# cell adds feature columns to, and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='../data/processed/ev_data_processed.csv')
print(f"Dataset shape: {df.shape}")

## 1. Create Basic Features

In [None]:
# Vehicle-level indicator features, each written to `df` as a new column.

# Age in years, measured against a fixed reference year rather than the clock.
current_year = 2024
df['vehicle_age'] = df['Model Year'].rsub(current_year)

# 1 when the make is in the hand-picked luxury list, else 0.
# Membership is an exact, case-sensitive string match.
luxury_brands = ['TESLA', 'PORSCHE', 'BMW', 'AUDI', 'MERCEDES-BENZ', 'JAGUAR', 'LEXUS']
df['is_luxury'] = df['Make'].isin(luxury_brands).astype(int)

# 1 for battery-electric vehicles, 0 for every other vehicle type.
df['is_bev'] = df['Electric Vehicle Type'].eq('Battery Electric Vehicle (BEV)').astype(int)

# 1 when the electric range is strictly greater than 200 miles, else 0.
df['has_long_range'] = df['Electric Range'].gt(200).astype(int)

# Bucket MSRP into four bands. pd.cut's default bins are right-closed and do
# not include the lowest edge: (0, 30000], (30000, 50000], (50000, 75000],
# (75000, inf]. A Base MSRP of 0 (or below, or missing) therefore gets NaN.
df['price_category'] = pd.cut(
    df['Base MSRP'],
    bins=[0, 30000, 50000, 75000, np.inf],
    labels=['budget', 'mid', 'premium', 'luxury'],
)

print("Basic features created!")

## 2. Geographic Features

In [None]:
# Location-based features derived from how often each place appears in `df`.

# Build the lookups first: value -> number of rows with that value.
# value_counts() skips missing values, so rows with a missing City/County
# find no entry and map to NaN below.
city_counts = df['City'].value_counts().to_dict()
county_counts = df['County'].value_counts().to_dict()

# The ten most frequent cities (value_counts() is sorted by descending count).
top_cities = df['City'].value_counts().index[:10]

# Attach the per-row features.
df['city_ev_density'] = df['City'].map(city_counts)
df['county_ev_density'] = df['County'].map(county_counts)
df['is_urban'] = df['City'].isin(top_cities).astype(int)  # 1 = in top ten

print("Geographic features created!")

## 3. Manufacturer Features

In [None]:
# Make/model-level aggregates, broadcast back onto every row of `df`.
# All three lookups are computed over the full frame.

# Fraction of all rows belonging to each make (normalize=True -> shares).
make_share = df['Make'].value_counts(normalize=True).to_dict()

# Raw row count per model name.
model_counts = df['Model'].value_counts().to_dict()

# Mean electric range across all rows of each make.
avg_range_by_make = df['Electric Range'].groupby(df['Make']).mean().to_dict()

# Attach the per-row features via dictionary lookup.
df['manufacturer_market_share'] = df['Make'].map(make_share)
df['model_popularity'] = df['Model'].map(model_counts)
df['manufacturer_avg_range'] = df['Make'].map(avg_range_by_make)

print("Manufacturer features created!")

## 4. Encode Categorical Variables

In [None]:
# Integer-encode the listed categorical columns with LabelEncoder.
# Codes follow the sorted order of the distinct labels, not a domain-specific
# ranking. Missing values are replaced by the literal label 'Unknown' first.
# Fitted encoders are kept in `label_encoders`, keyed by source column name.
label_encoders = {}
ordinal_features = ['Clean Alternative Fuel Vehicle (CAFV) Eligibility']

for feature in ordinal_features:
    filled = df[feature].fillna('Unknown')
    le = LabelEncoder().fit(filled)
    label_encoders[feature] = le
    df[f'{feature}_encoded'] = le.transform(filled)

# Mean-target encoding for high-cardinality columns: each category is replaced
# by the mean 'Electric Range' of all rows in that category (full frame).
high_card_features = ['Make', 'Model']
for feature in high_card_features:
    mean_target = df['Electric Range'].groupby(df[feature]).mean()
    df[f'{feature}_target_encoded'] = df[feature].map(mean_target)

print("Categorical encoding completed!")

## 5. Feature Interactions

In [None]:
# Pairwise interaction features built from columns already present in `df`.

# Range per unit of price; the +1 in the denominator avoids dividing by zero
# when Base MSRP is 0.
df['range_price_ratio'] = df['Electric Range'].div(df['Base MSRP'].add(1))

# Product of vehicle age and electric range.
df['age_range_interaction'] = df['vehicle_age'].mul(df['Electric Range'])

# 1 only when a row is both a luxury make and a BEV (product of two 0/1 flags).
df['luxury_bev'] = df['is_luxury'].mul(df['is_bev'])

print("Feature interactions created!")

## 6. Feature Selection

In [None]:
# Columns carried forward into scaling / PCA / correlation analysis.
# NOTE(review): 'Electric Range' is in this list and is also used as the
# correlation target further down — confirm it is meant to be part of the
# scaled and PCA inputs.
feature_cols = [
    # raw columns
    'Model Year', 'Electric Range', 'Base MSRP',
    # basic indicators
    'vehicle_age', 'is_luxury', 'is_bev', 'has_long_range',
    # geographic
    'city_ev_density', 'county_ev_density', 'is_urban',
    # manufacturer / model aggregates
    'manufacturer_market_share', 'model_popularity', 'manufacturer_avg_range',
    # interactions
    'range_price_ratio', 'age_range_interaction', 'luxury_bev',
]

# Keep only rows where every selected column is present; the surviving rows
# retain their original index labels from `df`.
df_model = df.loc[:, feature_cols].dropna(axis=0, how='any')

print(f"Final feature set shape: {df_model.shape}")
print(f"Features selected: {len(feature_cols)}")

## 7. Feature Scaling

In [None]:
# Standardise every modelling column, then wrap the resulting array back into
# a DataFrame that keeps df_model's column names and row index.
# The fitted `scaler` stays available for later cells.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_model)
df_scaled = pd.DataFrame(
    scaled_values,
    index=df_model.index,
    columns=df_model.columns,
)

print("Features scaled successfully!")

## 8. Dimensionality Reduction

In [None]:
# PCA analysis: project the scaled features onto their leading principal
# components and plot per-component and cumulative explained variance.
#
# Up to 10 components are requested, capped at what the data can support:
# PCA raises a ValueError when n_components exceeds
# min(n_samples, n_features). Plot axes and the summary message are derived
# from the fitted model, so they always agree with the number of components
# actually computed.
n_components = min(10, *df_scaled.shape)
pca = PCA(n_components=n_components)
pca_features = pca.fit_transform(df_scaled)

component_ids = range(1, pca.n_components_ + 1)
variance_ratio = pca.explained_variance_ratio_

fig, (ax_each, ax_cumulative) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: share of variance captured by each component.
ax_each.bar(component_ids, variance_ratio)
ax_each.set_xlabel('Component')
ax_each.set_ylabel('Variance Explained')
ax_each.set_title('PCA Explained Variance')

# Right panel: running total, with a dashed reference line at 95%.
ax_cumulative.plot(component_ids, np.cumsum(variance_ratio), 'bo-')
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Variance Explained')
ax_cumulative.set_title('Cumulative Explained Variance')
ax_cumulative.axhline(y=0.95, color='r', linestyle='--')

fig.tight_layout()
plt.show()

print(f"Variance explained by first {pca.n_components_} components: {sum(variance_ratio):.2%}")

## 9. Feature Importance Analysis

In [None]:
# Correlation of every modelling column with the target, strongest first.
# `correlations` keeps the target's own entry (its self-correlation); the
# plot excludes it.
target = 'Electric Range'
if target in df_model.columns:
    correlations = df_model.corr()[target].sort_values(ascending=False)

    # Remove the target by label rather than by position: slicing off the
    # first entry assumes the self-correlation sorts first, which is not
    # guaranteed when another column ties with it at 1.0.
    feature_correlations = correlations.drop(labels=target)

    fig, ax = plt.subplots(figsize=(10, 6))
    feature_correlations.plot(kind='barh', ax=ax)
    ax.set_xlabel('Correlation with Electric Range')
    ax.set_title('Feature Correlation with Target')
    fig.tight_layout()
    plt.show()

## 10. Save Engineered Features

In [None]:
# Persist the results of this notebook.
# Both CSVs are written with index=False, so the row index is not stored and
# rows in each file are identified by position only.
df.to_csv('../data/processed/ev_data_features.csv', index=False)
df_scaled.to_csv('../data/processed/ev_data_scaled.csv', index=False)

# Write the selected modelling column names, one per line.
with open('../data/processed/feature_list.txt', 'w') as f:
    f.writelines(f"{feature}\n" for feature in feature_cols)

print("Feature engineering completed and saved!")
# len(df.columns) counts every column in df, original and engineered alike.
print(f"Total features created: {len(df.columns)}")
print(f"Selected features for modeling: {len(feature_cols)}")