In [None]:
import pandas as pd

# Load the raw weather observations from the CSV sitting next to the notebook.
# The latin1 encoding and the python parser engine are the same options the
# call has always used.
df = pd.read_csv(
    "weatherHistory.csv",
    engine='python',
    encoding='latin1',
)
df.head()  # preview the first rows

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

In [None]:
# Replace missing precipitation types with an explicit 'Unknown' category
# so the column has no nulls before it is one-hot encoded.
precip_filled = df['Precip Type'].fillna(value='Unknown')
df['Precip Type'] = precip_filled

In [None]:
# Re-check the per-column missing-value counts after filling 'Precip Type'.
df.isna().sum()

In [None]:
# Rename the 'Loud Cover' column to 'Cloud Cover' (presumably a typo in the
# source data — confirm against the dataset description).
column_fixes = {'Loud Cover': 'Cloud Cover'}
df.rename(columns=column_fixes, inplace=True)

In [None]:
# Parse the timestamp column into UTC datetimes; entries that cannot be
# parsed are coerced to NaT instead of raising.
parsed_dates = pd.to_datetime(df['Formatted Date'], utc=True, errors='coerce')
df['Formatted Date'] = parsed_dates

In [None]:
# Derive calendar features (in UTC, matching the parsed timestamps) from the
# datetime column, one new column per component.
date_accessor = df['Formatted Date'].dt
for column_name, attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
):
    df[column_name] = getattr(date_accessor, attribute)

In [None]:
# One-hot encode the text columns, dropping the first level of each so the
# dummy columns are not perfectly collinear.
categorical_columns = ['Summary', 'Precip Type', 'Daily Summary']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df_encoded.head()

In [None]:
# Target: the measured temperature. Features: every other encoded column
# except the raw timestamp (its parts are already separate columns).
# NOTE(review): 'Apparent Temperature (C)' stays in X; it is presumably derived
# from the target and may leak it — confirm this is intended.
y = df_encoded['Temperature (C)']
X = df_encoded.drop(columns=['Temperature (C)', 'Formatted Date'])

In [None]:
# A notebook cell only renders its final expression, so a bare `X.head()`
# followed by `y.head()` silently discards the feature preview. Use the
# IPython-provided `display` to show both.
display(X.head())
display(y.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Split done!")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a deliberately small forest (10 trees) with a fixed seed.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, y_train)

print("Small model trained!")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
# RMSE is reported in the same units as the target.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
# The label previously printed as mojibake ("RÂ²"): UTF-8 text decoded as latin-1.
print("R²:", r2_score(y_test, y_pred))

In [None]:
import pandas as pd

# Pair every feature name with the forest's importance score, then rank
# from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

feature_importance.head(20)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the 20 highest-ranked features.
top_features = feature_importance.head(20)

plt.figure(figsize=(10, 6))
plt.barh(top_features['Feature'], top_features['Importance'])
plt.title("Top 20 Important Features for Temperature Prediction")
plt.xlabel("Importance")
plt.ylabel("Feature")
# barh draws the first row at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
import pickle

# Serialize the fitted model to disk in the working directory.
with open("temperature_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between the main continuous weather measurements.
key_columns = [
    'Temperature (C)',
    'Apparent Temperature (C)',
    'Humidity',
    'Wind Speed (km/h)',
    'Visibility (km)',
    'Pressure (millibars)',
]
numeric_df = df[key_columns]

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), cmap='coolwarm', annot=True)
plt.title("Correlation Heatmap of Key Weather Features")
plt.show()

In [None]:
# df_encoded still carries the datetime 'Formatted Date' column. On
# pandas >= 2.0, DataFrame.corr() no longer drops non-numeric columns silently
# and fails when it tries to convert timestamps to float, so restrict the
# correlation to numeric and boolean (dummy) columns first.
corr_input = df_encoded.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(20, 15))
sns.heatmap(corr_input.corr(), cmap='coolwarm')
plt.title("Full Dataset Correlation Heatmap")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Same key-feature correlation heatmap as drawn earlier in the notebook, this
# time also written to disk as a PNG.
key_columns = [
    'Temperature (C)',
    'Apparent Temperature (C)',
    'Humidity',
    'Wind Speed (km/h)',
    'Visibility (km)',
    'Pressure (millibars)',
]
numeric_df = df[key_columns]

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), cmap='coolwarm', annot=True)
plt.title("Correlation Heatmap of Key Weather Features")

# Save before plt.show(): the figure is still the current one at this point.
plt.savefig("heatmap.png", bbox_inches='tight', dpi=300)

plt.show()

In [None]:
import os
# List the working directory's contents (presumably to confirm the files
# written above exist).
os.listdir('.')

In [None]:
import shap

# Build a tree-specific SHAP explainer for the fitted forest and compute
# SHAP values for every row of the training features.
# NOTE(review): this runs over the full X_train, which may be slow on a large
# frame — consider explaining a sample instead; confirm runtime is acceptable.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

In [None]:
# Summary plot of the SHAP values computed for X_train; the feature frame is
# passed alongside so the plot can label and colour by feature value.
shap.summary_plot(shap_values, X_train)

In [None]:
# `plot_partial_dependence` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so importing it fails on current releases.
# `PartialDependenceDisplay.from_estimator` is the supported replacement and
# takes the same (estimator, X, features, ax=...) arguments.
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt

# Continuous weather measurements whose marginal effect on the prediction we plot.
features_to_plot = ['Humidity', 'Pressure (millibars)', 'Visibility (km)', 'Wind Speed (km/h)']

# The single axes is used as the bounding area; one sub-plot per feature is
# drawn inside it.
fig, ax = plt.subplots(figsize=(12, 8))
PartialDependenceDisplay.from_estimator(model, X_train, features_to_plot, ax=ax)
plt.show()

In [None]:
# Marker cell: prints "ok" once every cell above has run.
# NOTE(review): looks like leftover debugging — consider removing.
print("ok")