# Weather Prediction

## Introduction
This is a weather prediction model using Ikorodu weather data.

## Dependencies
- Pandas
- Matplotlib
- Numpy
- Seaborn
- Sklearn


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, precision_score, accuracy_score


In [2]:
# Load the Ikorodu weather CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../../../data/ikorodu_weather_2016_2024.csv')

In [3]:
# (rows, columns) of the frame as loaded.
df.shape

(23360, 17)

In [4]:
# Preview the first rows; at this point the values still carry unit suffixes as text.
df.head()

Unnamed: 0.1,Unnamed: 0,Time (hr),Temperature (°c),Forecast (°c),Rain (mm),Rain (%),Cloud (%),Pressure (mb),Wind (km/h),Gust (km/h),Direction (deg),Moonrise (hr),Moonset (hr),Sunrise (hr),Sunset (hr),Weather,Date
0,0,0,25 °c,27 °c,0.1 mm,45%,87%,1013 mb,18 km/h,27 km/h,transform: rotate(225.3deg);,04:31 AM,05:17 PM,06:39 AM,06:53 PM,Light rain shower,31-08-2024
1,1,3,24 °c,26 °c,0.4 mm,100%,100%,1012 mb,19 km/h,28 km/h,transform: rotate(225.1deg);,04:31 AM,05:17 PM,06:39 AM,06:53 PM,Light rain shower,31-08-2024
2,2,6,24 °c,26 °c,0.3 mm,100%,100%,1013 mb,18 km/h,27 km/h,transform: rotate(230.1deg);,04:31 AM,05:17 PM,06:39 AM,06:53 PM,Light rain shower,31-08-2024
3,3,9,25 °c,28 °c,0.1 mm,45%,76%,1014 mb,21 km/h,28 km/h,transform: rotate(226.8deg);,04:31 AM,05:17 PM,06:39 AM,06:53 PM,Light rain shower,31-08-2024
4,4,12,27 °c,29 °c,0.0 mm,45%,61%,1014 mb,21 km/h,27 km/h,transform: rotate(229.2deg);,04:31 AM,05:17 PM,06:39 AM,06:53 PM,Light rain shower,31-08-2024


In [5]:
# List the column labels available for preprocessing.
df.columns

Index(['Unnamed: 0', 'Time (hr)', 'Temperature (°c)', 'Forecast (°c)',
       'Rain (mm)', 'Rain (%)', 'Cloud (%)', 'Pressure (mb)', 'Wind (km/h)',
       'Gust (km/h)', 'Direction (deg)', 'Moonrise (hr)', 'Moonset (hr)',
       'Sunrise (hr)', 'Sunset (hr)', 'Weather', 'Date'],
      dtype='object')

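# Data Preprocessing

> **Note:** the cells in this section expect numeric measurement columns, a datetime `Date` column, and the derived `Moonrise (min)`, `Moonset (min)`, `Sunrise (min)`, `Sunset (min)` and `Season` columns. The cells that strip the unit suffixes (e.g. `25 °c`, `45%`) and create those columns are not part of this notebook as shown (execution counts jump from 5 to 208), so that cleaning must be run first; otherwise this section fails on a fresh kernel.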

In [208]:
# Forecast (°c): standardise to zero mean and unit variance.
forcast_scaler = StandardScaler()
df['forecast_normalized'] = forcast_scaler.fit_transform(
    df['Forecast (°c)'].to_numpy().reshape(-1, 1)
)

In [209]:
# Weather: replace each text label with an integer class code.
weather_encoder = LabelEncoder().fit(df['Weather'])
df['weather_condition'] = weather_encoder.transform(df['Weather'])

In [210]:
# Temperature (°c): standardise to zero mean and unit variance.
temperature_scaler = StandardScaler()
df['temp_normalized'] = temperature_scaler.fit_transform(
    df['Temperature (°c)'].to_numpy().reshape(-1, 1)
)

In [211]:
# Wind (km/h): standardise to zero mean and unit variance.
wind_scaler = StandardScaler()
df['wind_speed_normalized'] = wind_scaler.fit_transform(
    df['Wind (km/h)'].to_numpy().reshape(-1, 1)
)

In [212]:
# Cloud (%): standardise to zero mean and unit variance.
cloud_scaler = StandardScaler()
df['cloud_cover_scaled'] = cloud_scaler.fit_transform(
    df['Cloud (%)'].to_numpy().reshape(-1, 1)
)


In [213]:
# Pressure (mb): standardise to zero mean and unit variance.
pressure_scaler = StandardScaler()
df['pressure_normalized'] = pressure_scaler.fit_transform(
    df['Pressure (mb)'].to_numpy().reshape(-1, 1)
)

In [214]:
# Rain (mm): standardise to zero mean and unit variance.
rain_scaler = StandardScaler()
df['rainfall_scaled'] = rain_scaler.fit_transform(
    df['Rain (mm)'].to_numpy().reshape(-1, 1)
)

In [215]:
# Rain (%): standardise the rain probability to zero mean and unit variance.
rain_percent_scaler = StandardScaler()
df['rain_probability_scaled'] = rain_percent_scaler.fit_transform(
    df['Rain (%)'].to_numpy().reshape(-1, 1)
)

In [216]:
# Time (hr): standardise the hour of the reading to zero mean and unit variance.
time_scaler = StandardScaler()
df['time_of_day_normalized'] = time_scaler.fit_transform(
    df['Time (hr)'].to_numpy().reshape(-1, 1)
)

In [217]:
# Gust (km/h): standardise to zero mean and unit variance.
gust_scaler = StandardScaler()
df['gust_speed_scaled'] = gust_scaler.fit_transform(
    df['Gust (km/h)'].to_numpy().reshape(-1, 1)
)

In [218]:
# Direction (deg): standardise the wind direction to zero mean and unit variance.
direction_scaler = StandardScaler()
df['direction_scaled'] = direction_scaler.fit_transform(
    df['Direction (deg)'].to_numpy().reshape(-1, 1)
)

In [219]:
# Date: expand each date into numeric day / month / year columns.
# Assumes df['Date'] already holds datetime-like values (the lambda reads .year/.month/.day).
date_encoder = DictVectorizer()
date_features = date_encoder.fit_transform(
    df['Date'].apply(lambda x: {'year': x.year, 'month': x.month, 'day': x.day})
)
# Take the column order from the fitted vectorizer instead of hard-coding it.
date_columns = list(date_encoder.feature_names_)
df = pd.concat(
    [
        # Drop columns left by a previous run so re-executing this cell does not duplicate them.
        df.drop(columns=date_columns, errors='ignore'),
        # Reuse df's index so the new columns line up row-for-row even if the index is not 0..n-1.
        pd.DataFrame(date_features.toarray(), columns=date_columns, index=df.index),
    ],
    axis=1,
)

In [220]:
# Moonrise (min): standardise to zero mean and unit variance.
moonrise_scaler = StandardScaler()
df['moonrise_normalized'] = moonrise_scaler.fit_transform(
    df['Moonrise (min)'].to_numpy().reshape(-1, 1)
)

In [221]:
# Moonset (min): standardise to zero mean and unit variance.
moonset_scaler = StandardScaler()
df['moonset_normalized'] = moonset_scaler.fit_transform(
    df['Moonset (min)'].to_numpy().reshape(-1, 1)
)

In [222]:
# Sunrise (min): standardise to zero mean and unit variance.
sunrise_scaler = StandardScaler()
df['sunrise_normalized'] = sunrise_scaler.fit_transform(
    df['Sunrise (min)'].to_numpy().reshape(-1, 1)
)

In [223]:
# Sunset (min): standardise to zero mean and unit variance.
sunset_scaler = StandardScaler()
df['sunset_normalized'] = sunset_scaler.fit_transform(
    df['Sunset (min)'].to_numpy().reshape(-1, 1)
)

In [224]:
# Preview the frame again, now including the scaled/encoded columns added above.
df.head()

Unnamed: 0,Time (hr),Temperature (°c),Forecast (°c),Rain (mm),Rain (%),Cloud (%),Pressure (mb),Wind (km/h),Gust (km/h),Direction (deg),...,time_of_day_normalized,gust_speed_scaled,direction_scaled,day,month,year,moonrise_normalized,moonset_normalized,sunrise_normalized,sunset_normalized
0,0,25,27,0.1,45.0,87.0,1013.0,18.0,27.0,225.3,...,-1.527525,1.181843,0.151091,31.0,8.0,2024.0,-1.230083,0.617203,-0.280207,0.268626
1,3,24,26,0.4,100.0,100.0,1012.0,19.0,28.0,225.1,...,-1.091089,1.347149,0.145622,31.0,8.0,2024.0,-1.230083,0.617203,-0.280207,0.268626
2,6,24,26,0.3,100.0,100.0,1013.0,18.0,27.0,230.1,...,-0.654654,1.181843,0.28235,31.0,8.0,2024.0,-1.230083,0.617203,-0.280207,0.268626
3,9,25,28,0.1,45.0,76.0,1014.0,21.0,28.0,226.8,...,-0.218218,1.347149,0.19211,31.0,8.0,2024.0,-1.230083,0.617203,-0.280207,0.268626
4,12,27,29,0.0,45.0,61.0,1014.0,21.0,27.0,229.2,...,0.218218,1.181843,0.257739,31.0,8.0,2024.0,-1.230083,0.617203,-0.280207,0.268626


### Create Prediction Models

In [225]:
# Random-forest regressor; random_state is fixed so repeated fits are reproducible.
rfr = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=10,
    random_state=1,
)

In [226]:
# Random-forest classifier; random_state is fixed so repeated fits are reproducible.
rfc = RandomForestClassifier(
    n_estimators=300,
    min_samples_split=10,
    random_state=1,
)

In [227]:
def predict_category(data, target, predictors, features,
                     split_date='2022-09-01', model=None, average='micro'):
    """Fit a classifier on rows before ``split_date`` and score it on the remaining rows.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``Date`` column comparable with ``split_date`` plus the
        ``target``, ``predictors`` and ``features`` columns.
    target : str
        Name of the column to predict.
    predictors : list of str
        Columns fed to the model.
    features : list of str
        Extra columns of ``data`` merged (on the index) into the result for inspection.
    split_date : str or datetime-like, default '2022-09-01'
        Rows with ``Date < split_date`` train the model; rows with
        ``Date >= split_date`` are used for evaluation.
    model : estimator, optional
        Object exposing ``fit``/``predict``. Defaults to the notebook-level ``rfc``.
        The estimator is fitted in place.
    average : str, default 'micro'
        Passed to ``precision_score``. With 'micro' on a single-label problem the
        precision is numerically the same as the accuracy; pass e.g. 'binary' or
        'macro' for a class-sensitive score.

    Returns
    -------
    combined : pd.DataFrame
        ``actual`` and ``prediction`` columns for the test rows, plus ``features``.
    precision : float
    accuracy : float

    Raises
    ------
    ValueError
        If either side of the date split is empty.
    """
    estimator = rfc if model is None else model

    # chronological split; both masks are spelled out so rows with a missing
    # Date fall in neither set, as before
    train = data[data['Date'] < split_date]
    test = data[data['Date'] >= split_date]
    if train.empty or test.empty:
        raise ValueError(
            f'Date split at {split_date!r} produced an empty set: '
            f'{len(train)} training rows, {len(test)} test rows'
        )

    # fit the model and predict the held-out period
    estimator.fit(train[predictors], train[target])
    prediction = estimator.predict(test[predictors])

    # actual vs. predicted values, indexed like the test rows
    combined = pd.DataFrame(dict(actual=test[target], prediction=prediction))

    # attach the requested feature columns by index
    combined = combined.merge(data[features], left_index=True, right_index=True)

    accuracy = accuracy_score(test[target], prediction)
    precision = precision_score(test[target], prediction, average=average)

    return combined, precision, accuracy

In [255]:
def rolling_averges(group: pd.DataFrame, cols, new_cols, n) -> pd.DataFrame:
    """Add trailing rolling means of ``cols`` to ``group`` under the names ``new_cols``.

    Parameters
    ----------
    group : pd.DataFrame
        Rows to process; must contain a ``Date`` column and every column in ``cols``.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the averaged columns, in the same order as ``cols``.
    n : int
        Window length in rows. The window covers the ``n`` rows before the
        current row and excludes the current row itself (``closed='left'``).

    Returns
    -------
    pd.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added; rows without a
        full window of history are dropped. The input frame is not modified.
    """
    # Several rows can share one Date, so use a stable sort: rows with the same
    # date keep their incoming order and the window contents are deterministic.
    ordered = group.sort_values('Date', kind='mergesort')

    # mean of the previous n rows (current row excluded)
    trailing_means = ordered[cols].rolling(n, closed='left').mean()

    ordered[new_cols] = trailing_means

    # drop rows that have no complete window behind them
    return ordered.dropna(subset=new_cols)

In [256]:
# Gust (km/h): standardise to zero mean and unit variance.
# NOTE(review): same statements as the earlier gust-scaling cell; likely a leftover that can be removed.
gust_scaler = StandardScaler()
df['gust_speed_scaled'] = gust_scaler.fit_transform(
    df['Gust (km/h)'].to_numpy().reshape(-1, 1)
)

In [257]:
# Gust (km/h): standardise to zero mean and unit variance.
# NOTE(review): same statements as the earlier gust-scaling cell; likely a leftover that can be removed.
gust_scaler = StandardScaler()
df['gust_speed_scaled'] = gust_scaler.fit_transform(
    df['Gust (km/h)'].to_numpy().reshape(-1, 1)
)

In [258]:
def get_rolling_averages(df, rolling, n):
    """Compute per-year trailing rolling means for the ``rolling`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``year`` and ``Date`` columns and every column in ``rolling``.
    rolling : list of str
        Columns to average.
    n : int
        Window length in rows, forwarded to ``rolling_averges``.

    Returns
    -------
    rolling_df : pd.DataFrame
        The rows of ``df`` (grouped by year, date-sorted within each year) with
        one ``<name>_rolling`` column per entry of ``rolling``; the index is
        reset to 0..len-1. Rows without a full window inside their year are dropped.
    rolled : list of str
        Names of the added columns.
    """
    rolled = [f'{c.lower()}_rolling' for c in rolling]

    # Iterate over the groups explicitly instead of groupby.apply: apply on a
    # frame that still contains the grouping column is deprecated, and its
    # result index depends on whether group keys are prepended.
    per_year = [
        rolling_averges(year_rows, rolling, rolled, n)
        for _, year_rows in df.groupby('year')
    ]

    if not per_year:
        # no groups at all (empty input): return an empty frame with the expected columns
        return df.iloc[0:0].reindex(columns=[*df.columns, *rolled]), rolled

    # years come back in sorted order; renumber the rows 0..len-1
    rolling_df = pd.concat(per_year, ignore_index=True)

    return rolling_df, rolled

### Predict if it rained or not

In [259]:
# Work on an independent copy so the rain experiment does not modify df.
rain_df = df.copy(deep=True)

In [260]:
# Columns available for the rain model (raw, derived and scaled).
rain_df.columns

Index(['Time (hr)', 'Temperature (°c)', 'Forecast (°c)', 'Rain (mm)',
       'Rain (%)', 'Cloud (%)', 'Pressure (mb)', 'Wind (km/h)', 'Gust (km/h)',
       'Direction (deg)', 'Moonrise (hr)', 'Moonset (hr)', 'Sunrise (hr)',
       'Sunset (hr)', 'Weather', 'Date', 'Moonrise (min)', 'Moonset (min)',
       'Sunrise (min)', 'Sunset (min)', 'Season', 'forecast_normalized',
       'weather_condition', 'temp_normalized', 'wind_speed_normalized',
       'cloud_cover_scaled', 'pressure_normalized', 'rainfall_scaled',
       'rain_probability_scaled', 'time_of_day_normalized',
       'gust_speed_scaled', 'direction_scaled', 'day', 'month', 'year',
       'moonrise_normalized', 'moonset_normalized', 'sunrise_normalized',
       'sunset_normalized'],
      dtype='object')

In [261]:
# Binary target: 1 when any rainfall was recorded for the row, else 0.
rain_df['rained'] = rain_df['Rain (mm)'].gt(0).astype(int)

In [262]:
# Original (unscaled) columns carried into the results frame for inspection;
# they are not passed to the model as predictors.
rain_features = [
    # measurements
    'Time (hr)', 'Temperature (°c)', 'Forecast (°c)',
    'Rain (mm)', 'Rain (%)', 'Cloud (%)', 'Pressure (mb)',
    'Wind (km/h)', 'Gust (km/h)', 'Direction (deg)',
    # sun / moon times as given
    'Moonrise (hr)', 'Moonset (hr)', 'Sunrise (hr)', 'Sunset (hr)',
    # description and date
    'Weather', 'Date',
    # sun / moon times in minutes
    'Moonrise (min)', 'Moonset (min)', 'Sunrise (min)', 'Sunset (min)',
    'Season',
]

In [263]:
# Model inputs for the rain classifier (scaled / encoded columns only).
# NOTE(review): 'weather_condition' encodes the Weather text (e.g. "Light rain shower")
# and 'rain_probability_scaled' is the rain probability, so both may leak the target;
# confirm they are legitimately known at prediction time.
rain_predictors = [
    'forecast_normalized',
    'weather_condition',
    'temp_normalized',
    'wind_speed_normalized',
    'cloud_cover_scaled',
    'pressure_normalized',
    'gust_speed_scaled',
    'direction_scaled',
    'rain_probability_scaled',
]

In [264]:
# Baseline rain classifier on the scaled / encoded predictors.
sc_combined, sc_precision, sc_accuracy = predict_category(
    data=rain_df,
    target='rained',
    predictors=rain_predictors,
    features=rain_features,
)

In [265]:
# Confusion matrix: rows are the actual class, columns the predicted class.
pd.crosstab(sc_combined['actual'], sc_combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4021,0
1,140,1687


In [266]:
# Micro-averaged precision of the baseline model (predict_category uses average='micro').
sc_precision

0.9760601915184679

In [267]:
# Columns to summarise with trailing rolling means (same names as rain_predictors).
# NOTE(review): 'weather_condition' holds integer label codes, so its rolling mean
# is an average of category ids; confirm that is intended.
rain_rolling = [
    'forecast_normalized',
    'weather_condition',
    'temp_normalized',
    'wind_speed_normalized',
    'cloud_cover_scaled',
    'pressure_normalized',
    'gust_speed_scaled',
    'direction_scaled',
    'rain_probability_scaled',
]

In [268]:
# Rolling means over the previous 8 rows (presumably one day of 3-hourly readings; confirm).
rain_df_rolling, rain_rolled = get_rolling_averages(rain_df, rain_rolling, 8)

  rolling_df = df.groupby('year').apply(lambda x: rolling_averges(x, rolling, rolled, n))


In [269]:
# Same classifier, now with the rolling-mean columns added to the predictors.
sc_rolled_combined, sc_rolled_precision, sc_rolled_accuracy = predict_category(
    data=rain_df_rolling,
    target='rained',
    predictors=[*rain_predictors, *rain_rolled],
    features=rain_features,
)

In [270]:
# Micro-averaged precision of the model that also uses the rolling-mean predictors.
sc_rolled_precision

0.97599451303155

In [271]:
# Confusion matrix for the rolling-feature model: rows actual, columns predicted.
pd.crosstab(sc_rolled_combined['actual'], sc_rolled_combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4005,0
1,140,1687
