In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'darkgrid' theme for every figure drawn later in the notebook.
sns.set_style(style='darkgrid')

In [2]:
# Load the daily weather export. The first three lines of the file are skipped
# (presumably a metadata preamble ahead of the header row — TODO confirm).
DATA_FILE = 'Jan2017_to_Feb2023_data.csv'
df = pd.read_csv(DATA_FILE, skiprows=3)

# Print column names, dtypes and non-null counts as a first sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2243 entries, 0 to 2242
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   time                             2243 non-null   object 
 1   weathercode (wmo code)           2243 non-null   int64  
 2   temperature_2m_max (°F)          2243 non-null   float64
 3   temperature_2m_min (°F)          2243 non-null   float64
 4   temperature_2m_mean (°F)         2243 non-null   float64
 5   apparent_temperature_max (°F)    2243 non-null   float64
 6   apparent_temperature_min (°F)    2243 non-null   float64
 7   apparent_temperature_mean (°F)   2243 non-null   float64
 8   sunrise (iso8601)                2243 non-null   object 
 9   sunset (iso8601)                 2243 non-null   object 
 10  shortwave_radiation_sum (MJ/m²)  2243 non-null   float64
 11  rain_sum (inch)                  2243 non-null   float64
 12  snowfall_sum (inch) 

In [3]:
# Count missing values per column. Every count is zero for this export, so no
# imputation or row-dropping step is needed before modelling.
df.isna().sum()

time                               0
weathercode (wmo code)             0
temperature_2m_max (°F)            0
temperature_2m_min (°F)            0
temperature_2m_mean (°F)           0
apparent_temperature_max (°F)      0
apparent_temperature_min (°F)      0
apparent_temperature_mean (°F)     0
sunrise (iso8601)                  0
sunset (iso8601)                   0
shortwave_radiation_sum (MJ/m²)    0
rain_sum (inch)                    0
snowfall_sum (inch)                0
windspeed_10m_max (mp/h)           0
windgusts_10m_max (mp/h)           0
dtype: int64

In [4]:
# Map the verbose source headers (name + unit) to short snake_case names.
# The units are dropped from the names, so they are recorded here instead.
COLUMN_RENAMES = {
    'weathercode (wmo code)': 'weathercode',                  # WMO code
    'temperature_2m_max (°F)': 'max_temperature',             # °F
    'temperature_2m_min (°F)': 'min_temperature',             # °F
    'temperature_2m_mean (°F)': 'mean_temperature',           # °F
    'apparent_temperature_max (°F)': 'max_feels_like',        # °F
    'apparent_temperature_min (°F)': 'min_feels_like',        # °F
    'apparent_temperature_mean (°F)': 'mean_feels_like',      # °F
    'sunrise (iso8601)': 'sunrise_time',                      # ISO 8601 timestamp
    'sunset (iso8601)': 'sunset_time',                        # ISO 8601 timestamp
    'shortwave_radiation_sum (MJ/m²)': 'solar_radiation_sum', # MJ/m²
    'rain_sum (inch)': 'total_rainfall',                      # inch
    'snowfall_sum (inch)': 'total_snowfall',                  # inch
    'windspeed_10m_max (mp/h)': 'max_windspeed',              # mp/h
    'windgusts_10m_max (mp/h)': 'max_windgusts',              # mp/h
}

# Reassign the renamed frame instead of using inplace=True: the result is the
# same, but the call stays chainable and does not mutate the object in place.
# Labels missing from the frame are ignored by default, so re-running this
# cell after the rename is a harmless no-op.
df = df.rename(columns=COLUMN_RENAMES)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2243 entries, 0 to 2242
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   time                 2243 non-null   object 
 1   weathercode          2243 non-null   int64  
 2   max_temperature      2243 non-null   float64
 3   min_temperature      2243 non-null   float64
 4   mean_temperature     2243 non-null   float64
 5   max_feels_like       2243 non-null   float64
 6   min_feels_like       2243 non-null   float64
 7   mean_feels_like      2243 non-null   float64
 8   sunrise_time         2243 non-null   object 
 9   sunset_time          2243 non-null   object 
 10  solar_radiation_sum  2243 non-null   float64
 11  total_rainfall       2243 non-null   float64
 12  total_snowfall       2243 non-null   float64
 13  max_windspeed        2243 non-null   float64
 14  max_windgusts        2243 non-null   float64
dtypes: float64(11), int64(1), object(3)
me

In [5]:
# Parse the date and sunrise/sunset columns from strings into datetime64 values.
for datetime_col in ('time', 'sunrise_time', 'sunset_time'):
    df[datetime_col] = pd.to_datetime(df[datetime_col])

# Treat the WMO weather code as a categorical label rather than a plain integer.
df['weathercode'] = df['weathercode'].astype('category')

# Confirm the new dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2243 entries, 0 to 2242
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   time                 2243 non-null   datetime64[ns]
 1   weathercode          2243 non-null   category      
 2   max_temperature      2243 non-null   float64       
 3   min_temperature      2243 non-null   float64       
 4   mean_temperature     2243 non-null   float64       
 5   max_feels_like       2243 non-null   float64       
 6   min_feels_like       2243 non-null   float64       
 7   mean_feels_like      2243 non-null   float64       
 8   sunrise_time         2243 non-null   datetime64[ns]
 9   sunset_time          2243 non-null   datetime64[ns]
 10  solar_radiation_sum  2243 non-null   float64       
 11  total_rainfall       2243 non-null   float64       
 12  total_snowfall       2243 non-null   float64       
 13  max_windspeed        2243 non-nul

In [6]:
# Preview the first five rows of the cleaned frame.
# TODO: extend the exploratory data analysis (EDA) with more data.
df.head(n=5)

Unnamed: 0,time,weathercode,max_temperature,min_temperature,mean_temperature,max_feels_like,min_feels_like,mean_feels_like,sunrise_time,sunset_time,solar_radiation_sum,total_rainfall,total_snowfall,max_windspeed,max_windgusts
0,2017-01-01,3,46.6,30.0,39.7,27.2,39.2,34.1,2017-01-01 07:18:00,2017-01-01 16:41:00,9.01,0.0,0.0,11.3,23.7
1,2017-01-02,53,40.3,29.4,35.9,25.3,32.8,29.8,2017-01-02 07:18:00,2017-01-02 16:42:00,1.0,0.136,0.0,9.8,18.3
2,2017-01-03,61,43.0,39.4,41.0,31.6,38.1,34.8,2017-01-03 07:18:00,2017-01-03 16:43:00,0.83,0.379,0.0,13.6,24.8
3,2017-01-04,51,50.3,31.2,41.9,24.9,42.6,36.5,2017-01-04 07:18:00,2017-01-04 16:43:00,4.58,0.012,0.0,14.8,29.8
4,2017-01-05,71,32.7,27.5,29.7,17.0,24.0,21.1,2017-01-05 07:18:00,2017-01-05 16:44:00,6.15,0.0,0.056,12.5,24.8


In [11]:
# Model inputs: the weather code plus the temperature, radiation, snowfall and
# wind columns. The date/time columns and the target itself are left out.
feature_columns = [
    'weathercode',
    'max_temperature',
    'min_temperature',
    'mean_temperature',
    'max_feels_like',
    'min_feels_like',
    'mean_feels_like',
    'solar_radiation_sum',
    'total_snowfall',
    'max_windspeed',
    'max_windgusts',
]

# Target: the daily rainfall total.
# TODO: make the target precipitation, i.e. total_rainfall + total_snowfall.
# NOTE(review): once that happens, total_snowfall has to be removed from
# feature_columns, otherwise part of the target leaks into the inputs. Also
# confirm that the two columns are in comparable units before summing them.
target_column = 'total_rainfall'

X = df[feature_columns]
y = df[target_column]

# Hold out 30% of the rows as the test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [12]:
# Random forest with default hyper-parameters; the seed makes training repeatable.
rf = RandomForestRegressor(random_state=7)

# Fit on the training split only. Leaving the call as the last expression
# echoes the fitted estimator as the cell output.
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(random_state=7)

In [13]:
# Predict the target for every row of the held-out test set.
y_pred = rf.predict(X_test)

# The model is trained on total_rainfall, so the output is labelled as
# rainfall; calling it "precipitation" would misdescribe what is predicted.
print('The predicted rainfall is: {}'.format(y_pred))

The predicted precipitation is: [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 4.91510e-01
 2.69600e-02 1.48250e-01 0.00000e+00 0.00000e+00 2.98800e-02 0.00000e+00
 7.94100e-02 1.64740e-01 1.63600e-02 0.00000e+00 0.00000e+00 0.00000e+00
 0.00000e+00 1.81200e-02 4.32620e-01 5.15320e-01 0.00000e+00 1.31140e-01
 0.00000e+00 2.35830e-01 0.00000e+00 0.00000e+00 2.76400e-02 7.73300e-02
 1.99720e-01 2.16000e-02 1.08800e-02 2.71600e-02 0.00000e+00 0.00000e+00
 1.65870e-01 0.00000e+00 2.55420e-01 0.00000e+00 2.09490e-01 0.00000e+00
 1.81200e-02 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 2.62000e-02
 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.86400e-02
 7.70530e-01 0.00000e+00 1.96000e-02 1.20870e-01 0.00000e+00 2.00000e-03
 0.00000e+00 0.00000e+00 8.84500e-02 4.97530e-01 3.72100e-02 7.64500e-02
 0.00000e+00 1.40830e-01 1.78400e-02 0.00000e+00 0.00000e+00 0.00000e+00
 1.02100e-02 1.22800e-02 1.25540e-01 0.00000e+00 1.13200e-02 1.50400e-02
 0.00000e+00 9.6000

In [14]:
# Evaluate the predictions: mean squared error between the held-out targets
# and the model's predictions (lower is better).
mse_report = 'The mse of the model is: {}'
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(mse_report.format(mse))

The mse of the model is: 0.016936813582020795
