# ðŸ“Š Feature Importance Analysis

Analyze feature importance using multiple methods: correlation, Random Forest, XGBoost, and permutation importance.

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
print("âœ… Libraries loaded!")

âœ… Libraries loaded!


In [2]:
# Load CLEANED data
df = pd.read_csv("../data/weather_cleaned.csv", parse_dates=['last_updated'])
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
target = 'temperature_celsius'
features = [c for c in numeric_cols if c != target]
print(f"ðŸ“Š Target: {target}, Features: {len(features)}")

ðŸ“Š Target: temperature_celsius, Features: 28


In [3]:
# Prepare data
df_model = df[features + [target]].dropna()
X, y = df_model[features], df_model[target]
if len(X) > 50000:
    idx = np.random.choice(len(X), 50000, replace=False)
    X, y = X.iloc[idx], y.iloc[idx]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"ðŸ“Š Train: {len(X_train)}, Test: {len(X_test)}")

ðŸ“Š Train: 40000, Test: 10000


In [4]:
# Correlation importance
correlations = X_train.corrwith(y_train).abs().sort_values(ascending=False)
fig = go.Figure(go.Bar(x=correlations.values[:15], y=correlations.index[:15], orientation='h', marker_color='#4ECDC4'))
fig.update_layout(title='ðŸ”— Correlation with Temperature', template='plotly_dark', height=500)
fig.show()

In [5]:
# Random Forest importance
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_imp = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
fig = go.Figure(go.Bar(x=rf_imp.values[:15], y=rf_imp.index[:15], orientation='h', marker_color='#FF6B6B'))
fig.update_layout(title='ðŸŒ² Random Forest Importance', template='plotly_dark', height=500)
fig.show()

In [6]:
# XGBoost importance
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_imp = pd.Series(xgb_model.feature_importances_, index=features).sort_values(ascending=False)
fig = go.Figure(go.Bar(x=xgb_imp.values[:15], y=xgb_imp.index[:15], orientation='h', marker_color='#45B7D1'))
fig.update_layout(title='ðŸš€ XGBoost Importance', template='plotly_dark', height=500)
fig.show()

In [7]:
# Compare rankings
comparison = pd.DataFrame({'Correlation': correlations.rank(ascending=False), 'RF': rf_imp.rank(ascending=False), 'XGB': xgb_imp.rank(ascending=False)})
comparison['Avg Rank'] = comparison.mean(axis=1)
comparison.sort_values('Avg Rank').head(15)

Unnamed: 0,Correlation,RF,XGB,Avg Rank
temperature_fahrenheit,1.0,1.0,2.0,1.333333
feels_like_celsius,3.0,2.0,1.0,2.0
feels_like_fahrenheit,2.0,3.0,3.0,2.666667
humidity,5.0,5.0,9.0,6.333333
pressure_mb,9.0,8.0,5.0,7.333333
latitude,6.0,17.0,7.0,10.0
pressure_in,8.0,6.0,17.0,10.333333
air_quality_Ozone,7.0,11.0,14.0,10.666667
air_quality_PM10,13.0,15.0,8.0,12.0
uv_index,4.0,21.0,12.0,12.333333
