PREPROCESSING THE WEATHER DATA

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
weather= pd.read_csv('export-rabat0.csv', header=3, index_col="DATE")
weather.head()

In [None]:
# Calculate the correlation matrix
correlation_matrix = weather.corr()

correlation_threshold = 0.5

important_features = correlation_matrix[abs(correlation_matrix['MAX_TEMPERATURE_C']) > correlation_threshold]['MAX_TEMPERATURE_C'].index.tolist()

plt.figure(figsize=(10, 10))

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, annot_kws={"size": 8})



In [None]:
print("Important Features:", important_features)

Important Features: ['MAX_TEMPERATURE_C', 'MIN_TEMPERATURE_C', 'TEMPERATURE_MORNING_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_EVENING_C', 'HEATINDEX_MAX_C', 'DEWPOINT_MAX_C', 'WINDTEMP_MAX_C', 'UV_INDEX', 'SUNHOUR']


In [None]:
weather.apply(pd.isnull).sum()/weather.shape[0]

In [None]:
core_weather = weather[["MAX_TEMPERATURE_C","MIN_TEMPERATURE_C","TEMPERATURE_MORNING_C","TEMPERATURE_NOON_C","TEMPERATURE_EVENING_C","HEATINDEX_MAX_C","DEWPOINT_MAX_C","WINDTEMP_MAX_C","UV_INDEX","SUNHOUR"]].copy()
core_weather

In [None]:
core_weather.columns = ["temp_max","temp_min","morning_temp","noon_temp","evening_temp","heat_index_max","dew_point_max","wind_temp_max","uv_index","sunhour"]
core_weather.columns

In [None]:
core_weather.apply(pd.isnull).sum()/core_weather.shape[0] #to know if there are any missing values

In [None]:
core_weather.index = pd.to_datetime(core_weather.index)

In [None]:
core_weather.index

In [None]:
core_weather.apply(lambda x: (x==9999).sum())

In [None]:
# Assuming your original DataFrame is named core_weather
shuffled_core_weather = core_weather.sample(frac=1, random_state=42)

# Display the first few rows of the shuffled DataFrame
print(shuffled_core_weather.head())

ANALYSING THE WEATHER DATA

In [None]:
shuffled_core_weather[["temp_max","temp_min"]].plot()

TRAINING THE ML MODEL

In [None]:
shuffled_core_weather["target"] = shuffled_core_weather.shift(-1)["temp_max"]

In [None]:
shuffled_core_weather[["temp_max", "target"]]

In [None]:
shuffled_core_weather.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
temp_max,5416.0,22.945532,5.035808,10.0,19.0,23.0,27.0,43.0
temp_min,5416.0,16.57404,4.413952,3.0,13.0,17.0,20.0,30.0
morning_temp,5416.0,16.588442,4.409501,3.0,13.0,17.0,20.0,30.0
noon_temp,5416.0,22.646418,5.160899,10.0,18.0,22.0,26.0,43.0
evening_temp,5416.0,20.021787,4.740689,8.0,16.0,20.0,24.0,37.0
heat_index_max,5416.0,23.77788,5.495029,10.0,19.0,25.0,27.0,44.0
dew_point_max,5416.0,14.944793,3.920128,0.0,12.0,15.0,18.0,25.0
wind_temp_max,5416.0,16.261078,4.834373,1.0,12.0,17.0,20.0,30.0
uv_index,5416.0,5.348966,1.121473,2.0,5.0,5.0,6.0,8.0
sunhour,5416.0,10.681518,1.397848,5.0,10.1,11.6,11.6,13.7


In [None]:
shuffled_core_weather = shuffled_core_weather.iloc[:-1,:].copy() #removing the last line with the value NaN in target
shuffled_core_weather

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

reg = Ridge(alpha=.1)

In [None]:
num_test = 0.20
X = shuffled_core_weather.drop(columns=['temp_max'])  # Features (excluding the target variable)
y = shuffled_core_weather['temp_max']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= num_test, random_state = 23)

# Display the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
X_train

In [None]:
predictors = ["temp_min", "morning_temp", "noon_temp", "evening_temp", "heat_index_max", "dew_point_max", "wind_temp_max", "uv_index", "sunhour"]

# Assuming reg is your regression model
reg.fit(X_train[predictors], y_train)

In [None]:
predictions = reg.predict(X_test[predictors])

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(X_test["target"], predictions)


5.75735381323511

In [None]:
combined = pd.concat([X_test["target"], pd.Series(predictions, index =X_test.index)], axis=1)
combined.columns = ["actual","predictions"]
combined

In [None]:
combined.plot()

In [None]:
reg.coef_ #to see the impact of each feature on the target

In [None]:
def create_predictions(predictors, shuffled_core_weather, reg):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= num_test, random_state = 23)
  reg.fit(X_train[predictors], y_train)
  predictions = reg.predict(X_test[predictors])
  error = mean_absolute_error(X_test["target"], predictions)
  combined = pd.concat([X_test["target"], pd.Series(predictions, index =X_test.index)], axis=1)
  combined.columns = ["actual","predictions"]
  return error, combined

In [None]:
combined["diff"] = combined["actual"]-combined["predictions"]
combined.sort_values("diff", ascending=False).head()

In [None]:
from datetime import datetime, timedelta

In [None]:
user_input_date = input("Enter the date (YYYY-MM-DD): ")
user_input_date = datetime.strptime(user_input_date, '%Y-%m-%d')
user_input_features = []
for feature in predictors:
    value = float(input(f"Enter the value for {feature}: "))
    user_input_features.append(value)

user_input_df = pd.DataFrame([user_input_features], columns=predictors)

user_input_date_plus_one = user_input_date + timedelta(days=1)
predicted_temperature = reg.predict(user_input_df)

print(f"For the date {user_input_date.strftime('%Y-%m-%d')}, the predicted maximum temperature for the following day ({user_input_date_plus_one.strftime('%Y-%m-%d')}) is: {predicted_temperature[0]:.2f}°C")


In [None]:
from sklearn.metrics import r2_score

# Assuming 'reg' is your trained Ridge regression model
# Assuming 'X_test' and 'y_test' are your test set features and target values

# Make predictions on the test set
predictions = reg.predict(X_test[predictors])
# Calculate R-squared score
r2 = r2_score(y_test, predictions)

# Print the result
print(f"R-squared Score: {r2:.4f}")


R-squared Score: 0.9898
