In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_val_score
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [5]:
# Load weather.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="weather.csv")

In [7]:
# Remove the columns that are not used by the models.
df = df.drop(columns=['station', 'year'])

# Fill in any null values.
# Step 1: a gap with a value on both sides becomes the average of the last
# value before it and the first value after it (non-null cells are unchanged,
# since (x + x) / 2 == x).
df = df.ffill().add(df.bfill()).div(2)
# Step 2: gaps at the very start or end only have one neighbour, so step 1
# leaves them null; take the nearest available value instead.
df = df.bfill().ffill()

In [10]:
# Feature engineering
# shift(1) looks one row back, shift(-1) / shift(-2) look one / two rows
# ahead, so the first row and the last two rows end up with nulls.
df = df.assign(
    # Rainfall of the current row plus the row before it.
    two_day_rainfall_total_mm=lambda d: (
        d['daily_rainfall_total_mm'] + d['daily_rainfall_total_mm'].shift(1)
    ),
    # Values taken from the following one and two rows.
    next_day_rainfall_total_mm=lambda d: d['daily_rainfall_total_mm'].shift(-1),
    next_next_day_rainfall_total_mm=lambda d: d['daily_rainfall_total_mm'].shift(-2),
    next_day_maximum_temperature_c=lambda d: d['maximum_temperature_c'].shift(-1),
    next_next_day_maximum_temperature_c=lambda d: d['maximum_temperature_c'].shift(-2),
    next_day_minimum_temperature_c=lambda d: d['minimum_temperature_c'].shift(-1),
    next_next_day_minimum_temperature_c=lambda d: d['minimum_temperature_c'].shift(-2),
)

In [12]:
# Discard every row that still has at least one null value.
df = df.dropna(axis=0, how='any')

In [17]:
def vif_feature_select(df, max_score=5.0, inplace=False, printable=False, _drops=None):
    '''
    Takes a DataFrame and returns it after repeatedly eliminating the column
    with the highest VIF (variance inflation factor) score until the
    remainder have VIF scores less than max_score.

    The VIF scores are the diagonal of the INVERSE of the correlation matrix
    returned by df.corr().

    params:
        df          DataFrame whose columns are the candidate features.
        max_score   a column is dropped while the largest VIF is greater
                    than or equal to this value.
        inplace     when False (the default) the work is done on a copy and
                    the caller's DataFrame is left untouched; otherwise the
                    columns are deleted from df itself.
        printable   when set to True, the function returns the list of
                    dropped feature names instead of the DataFrame.
        _drops      internal accumulator for the dropped feature names. When
                    a list is supplied it is appended to and the input
                    sanity checks are skipped.

    returns:
        The reduced DataFrame, or the list of dropped feature names when
        printable is True. If every column ends up being dropped (possible
        when max_score <= 1) an empty DataFrame / the full list is returned.

    raises:
        Exception                 if any feature contains all 0s.
        numpy.linalg.LinAlgError  propagated from np.linalg.inv when the
                                  correlation matrix is exactly singular.
    '''
    import numpy as np
    # Avoids overwriting the original DataFrame by default. The copy is made
    # once, up front, rather than once per eliminated column.
    if inplace is False:
        df = df.copy()
    # Sanity checks only run when no accumulator was handed in.
    if _drops is None:
        _drops = []
        # Check if features contain string values.
        dtypes = [dt for dt in df.dtypes]
        if 'object' in dtypes:
            print('Feature(s) contain string values. Result may be unreliable.')
        # Check if any feature contains all 0s.
        if (df == 0).all().any():
            raise Exception(
                'Feature(s) contain all 0s. Drop them before use.')
    # Stop as soon as no columns are left: np.argmax would raise ValueError
    # on the empty VIF vector.
    while len(df.columns) > 0:
        # VIF is the diagonal of the inverse of the correlation matrix.
        vifs = np.linalg.inv(df.corr().values).diagonal()
        max_vif_index = np.argmax(vifs)
        # Eliminate the feature with the highest VIF score and re-evaluate.
        # Written as ">=" (not a negated "<") so that a NaN score ends the
        # loop without dropping anything.
        if vifs[max_vif_index] >= max_score:
            worst_feature = df.columns[max_vif_index]
            _drops.append(worst_feature)
            del df[worst_feature]
        else:
            break
    # Returns the list of dropped features instead of a DataFrame on request.
    if printable:
        return _drops
    return df

# Forecast tomorrow's rainfall

In [19]:
# Candidate predictor columns for the model.
features = [
    'month',
    'day',
    'daily_rainfall_total_mm',
    'highest_30_min_rainfall_mm',
    'highest_60_min_rainfall_mm',
    'highest_120_min_rainfall_mm',
    'mean_temperature_c',
    'maximum_temperature_c',
    'minimum_temperature_c',
    'mean_wind_speed_kmh',
    'max_wind_speed_kmh',
    'two_day_rainfall_total_mm',
]
# Column holding the value to predict.
target = 'next_day_rainfall_total_mm'

# Feature matrix as a DataFrame, target as a plain NumPy array.
X = df[features]
y = df[target].to_numpy()

In [21]:
# Remove highly correlated features (VIF-based elimination) and keep the
# surviving column labels for the later steps.
X = vif_feature_select(df=X)
features = X.columns

In [27]:
# Train test split: 25% of the rows are held out (the library default,
# written out here), with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [30]:
# Standard scale
# The scaler is fitted on the training rows only and then applied to both
# splits. Wrapping the result in a DataFrame restores the column labels
# (the row index restarts from 0).
ss = StandardScaler().fit(X_train[features])
X_train = pd.DataFrame(ss.transform(X_train[features]), columns=features)
X_test = pd.DataFrame(ss.transform(X_test[features]), columns=features)

In [41]:
# Regularised linear models; each one tunes its own penalty strength with
# 5-fold cross-validation when fitted.
ridge_cv, lasso, elastic = (
    RidgeCV(cv=5),
    LassoCV(n_alphas=100, cv=5),
    ElasticNetCV(cv=5),
)

In [42]:
# Score each model with 5-fold cross-validation on the training split,
# evaluated in the order ridge, lasso, elastic net.
ridge_scores, lasso_scores, elastic_scores = (
    cross_val_score(estimator, X_train, y_train, cv=5)
    for estimator in (ridge_cv, lasso, elastic)
)

In [43]:
# Report the mean cross-validation score of each model.
for label, fold_scores in (
    ('ridge:', ridge_scores),
    ('lasso:', lasso_scores),
    ('elastic:', elastic_scores),
):
    print(label, fold_scores.mean())

ridge: -0.01939371599312183
lasso: -0.009370093214024466
elastic: -0.008978480009192102
