In [None]:
# Cell 1: Import Libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import shap

In [None]:
# Cell 2: Load Data
# The CSV path is relative to the notebook's working directory.
file_path = "./Cairo-Weather.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Report the raw dimensions so they can be compared with the post-cleaning shape.
print("Shape before cleaning:", df.shape)
df.head()

In [None]:
# Cell 3: Data Cleaning
# Drop the three visibility columns. errors="ignore" keeps the cell re-runnable:
# on a second execution the columns are already gone and a plain drop would
# raise KeyError.
df = df.drop(
    columns=[
        "visibility_mean (undefined)",
        "visibility_max (undefined)",
        "visibility_min (undefined)",
    ],
    errors="ignore",
)

# Impute missing values in every numeric column with that column's mean.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split in Cell 6 -- confirm that is acceptable for this analysis.
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].mean())

# Winsorise the mean temperature to the Tukey fences (1.5 * IQR beyond the
# quartiles). Series.clip is the vectorised equivalent of the former row-wise
# apply: values below `lower` become `lower`, values above `upper` become `upper`.
temp_col = 'temperature_2m_mean (°C)'
Q1 = df[temp_col].quantile(0.25)
Q3 = df[temp_col].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR

df[temp_col] = df[temp_col].clip(lower=lower, upper=upper)

print("Shape after cleaning:", df.shape)
df.head()

In [None]:
# Cell 4: Feature Engineering
# The lag features below treat consecutive rows as consecutive observations.
# NOTE(review): assumes the frame is in chronological order -- confirm.
target_col = 'temperature_2m_mean (°C)'

# Spread between the row's maximum and minimum temperature.
df['temp_range'] = df['temperature_2m_max (°C)'] - df['temperature_2m_min (°C)']

# Previous row's mean temperature; the first row has no predecessor and falls
# back to its own value.
df['prev_temp'] = df[target_col].shift(1).fillna(df[target_col])

# Mean of up to three PRECEDING rows. The series is shifted by one before
# rolling so the window never contains the current row: target_col is the
# prediction target in Cell 6, and a window that includes the current value
# would leak the answer into the features. min_periods=1 lets the second and
# third rows use the shorter history available; only the first row (no history
# at all) falls back to its own value, mirroring prev_temp.
df['rolling_temp'] = (
    df[target_col]
    .shift(1)
    .rolling(window=3, min_periods=1)
    .mean()
    .fillna(df[target_col])
)

print("New Features added:", ['temp_range', 'prev_temp', 'rolling_temp'])
df.head()

In [None]:
# Cell 5: Visualization
# Pairwise correlations between all numeric columns.
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Histogram (30 bins) with a KDE overlay of the mean temperature.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['temperature_2m_mean (°C)'], bins=30, kde=True, ax=ax)
ax.set_title("Temperature Distribution")
plt.show()

# Mean temperature plotted against the frame's index.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=df, y="temperature_2m_mean (°C)", x=df.index, ax=ax)
ax.set_title("Temperature Trend Over Time")
plt.show()

In [None]:
# Cell 6: Train-Test Split
# Model inputs: six raw weather columns plus the three engineered in Cell 4.
features = [
    'apparent_temperature_mean (°C)',
    'et0_fao_evapotranspiration (mm)',
    'daylight_duration (s)',
    'shortwave_radiation_sum (MJ/m²)',
    'dew_point_2m_mean (°C)',
    'sunshine_duration (s)',
    'temp_range',
    'prev_temp',
    'rolling_temp',
]

X = df[features]
y = df['temperature_2m_mean (°C)']

# 80/20 hold-out with a fixed seed for reproducibility.
# NOTE(review): train_test_split shuffles rows by default while prev_temp and
# rolling_temp are lag features -- confirm a random (non-chronological) split
# is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply them to
# both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Cell 7: Model Comparison
# Candidate regressors, keyed by the label used in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
}

# Fit each candidate on the scaled training split and score it on the scaled
# hold-out split; one [name, MSE, R²] row per model.
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results.append([name, mse, r2])

results_df = pd.DataFrame(data=results, columns=["Model", "MSE", "R²"])
results_df

In [None]:
# Cell 8: Cross Validation
# 5-fold cross-validated R² on the scaled training split, one line per model.
for name, model in models.items():
    scores = cross_val_score(
        estimator=model,
        X=X_train_scaled,
        y=y_train,
        scoring="r2",
        cv=5,
    )
    print(f"{name}: Mean CV R² = {scores.mean():.3f}")

In [None]:
# Cell 9: Feature Importance & Explainability
# The random forest is selected by name, not from the Cell 7 scores.
# NOTE(review): confirm it is in fact the best performer before calling it so.
best_model = models["Random Forest"]
best_model.fit(X_train_scaled, y_train)

# Feature Importance: the forest's built-in importances, largest first.
importances = best_model.feature_importances_
importance_df = pd.DataFrame({
    "Feature": features,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(10,6))
# seaborn >= 0.13 deprecates `palette` without `hue` (FutureWarning, slated for
# removal). Mapping hue to the categorical axis yields the same per-bar colours;
# dodge=False keeps the bars full width on older seaborn releases.
ax = sns.barplot(
    data=importance_df,
    x="Importance",
    y="Feature",
    hue="Feature",
    dodge=False,
    palette="viridis",
)
# The hue legend would only repeat the y-axis labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("Feature Importance")
plt.show()

# SHAP (Explainability)
# SHAP values are computed on the scaled inputs the model was trained on; the
# unscaled X_test is passed to the plot so feature values are shown in their
# original units.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test_scaled)
shap.summary_plot(shap_values, X_test, feature_names=features)

In [None]:
# Cell 10: Save Model
# `os` is imported in the notebook's import cell together with the other
# libraries, so this cell no longer carries its own import.
# exist_ok=True makes the cell safe to re-run once the directory exists.
os.makedirs("models", exist_ok=True)

# Persist the fitted model together with the scaler it expects its inputs to
# have been transformed by.
joblib.dump(best_model, "models/BestModel.pkl")
joblib.dump(scaler, "models/Scaler.pkl")
print("Model & Scaler saved successfully!")

In [None]:
# Cell 11: Model Comparison Table
# Refit every candidate on the scaled training split and score it on the scaled
# hold-out split; one [name, MSE, R²] row per model.
results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append([name, mse, r2])

results_df = pd.DataFrame(results, columns=["Model", "MSE", "R² Score"])
print(results_df)

plt.figure(figsize=(8,4))
# seaborn >= 0.13 deprecates `palette` without `hue` (FutureWarning, slated for
# removal). Mapping hue to the categorical axis yields the same per-bar colours;
# dodge=False keeps the bars full width on older seaborn releases.
ax = sns.barplot(
    data=results_df,
    x="Model",
    y="R² Score",
    hue="Model",
    dodge=False,
    palette="mako",
)
# The hue legend would only repeat the x-axis labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("Model Performance Comparison")
plt.show()

In [None]:
# Cell 12: Save & Load Best Model
# NOTE(review): Cell 10 already writes this model to "models/BestModel.pkl";
# this cell stores a second copy under a different path -- confirm both
# artifacts are needed.
# fit() returns the estimator itself, so the lookup and refit collapse into one step.
best_model = models["Random Forest"].fit(X_train_scaled, y_train)

model_path = "best_model.pkl"
joblib.dump(best_model, model_path)
print("Model saved successfully!")

# Round-trip check: reload the artifact and predict the first five hold-out rows.
loaded_model = joblib.load(model_path)
sample_pred = loaded_model.predict(X_test_scaled[:5])
print("Sample predictions:", sample_pred)

In [None]:
# Cell 13: Prediction Function
def predict_temperature(input_data, model=None, feature_scaler=None):
    """Predict the target value for a single row of raw feature values.

    Parameters
    ----------
    input_data : sequence of float
        One observation's unscaled feature values, in the column order the
        scaler was fitted with.
    model : object with ``predict``, optional
        Estimator used for the prediction. Defaults to the notebook-level
        ``loaded_model``.
    feature_scaler : object with ``transform``, optional
        Scaler applied to the row before prediction. Defaults to the
        notebook-level ``scaler``.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the scaled row.

    Raises
    ------
    ValueError
        If the number of values does not match the feature count recorded on
        the scaler.
    """
    # Resolve the notebook globals lazily so callers can supply their own objects.
    model = loaded_model if model is None else model
    feature_scaler = scaler if feature_scaler is None else feature_scaler

    # Shape the input as a single-row 2-D array.
    row = np.asarray(input_data, dtype=float).reshape(1, -1)

    # Fail early with a clear message when the row width is wrong.
    expected = getattr(feature_scaler, "n_features_in_", None)
    if expected is not None and row.shape[1] != expected:
        raise ValueError(
            f"Expected {expected} feature values, got {row.shape[1]}."
        )

    # A scaler fitted on a DataFrame records its column names and warns when it
    # is later given unnamed data; rebuild a named one-row frame to match.
    names = getattr(feature_scaler, "feature_names_in_", None)
    if names is not None:
        row = pd.DataFrame(row, columns=names)

    input_scaled = feature_scaler.transform(row)
    prediction = model.predict(input_scaled)
    return prediction[0]

# Example: run the first hold-out row (unscaled) through the prediction helper.
example = X_test.iloc[0].to_numpy()
print("Example input:", example)
print("Predicted temperature:", predict_temperature(example))