In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ShuffleSplit, learning_curve, train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn.metrics import r2_score, make_scorer, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectFromModel


%matplotlib inline
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv('export_dataframe.csv')

In [3]:
dataset.head()

Unnamed: 0,Station Id,Air Temperature Maximum (degF),Air Temperature Minimum (degF),Precipitation Increment (in),Relative Humidity (pct) Mean of Hourly Values,Wind Speed Maximum (mph) Max of Hourly Values,Wind Speed Average (mph) Mean of Hourly Values,Solar Radiation Average (watt/m2) Mean of Hourly Values,Solar Radiation/langley Total (langley),Vapor Pressure - Partial (inch_Hg) Mean of Hourly Values,...,Soil Moisture Percent -40in (pct) Mean of Hourly Values,Soil Temperature Observed -2in (degF) Mean of Hourly Values,Soil Temperature Observed -4in (degF) Mean of Hourly Values,Soil Temperature Observed -8in (degF) Mean of Hourly Values,Soil Temperature Observed -20in (degF) Mean of Hourly Values,Soil Temperature Observed -40in (degF) Mean of Hourly Values,MonthOfYear,DayOfYear,WeekOfYear,Soil Temperature
0,2174,61,51,0.0,94,10.5,4.6,40.01,82.6,0.43,...,62.2,57,58,57,55,57,1,1,0,56.75
1,2174,52,48,1.29,99,13.0,4.9,13.1,27.05,0.36,...,62.2,53,55,55,56,57,1,2,0,55.75
2,2174,59,49,0.98,99,19.2,5.7,17.74,36.63,0.38,...,62.1,52,54,54,56,57,1,3,0,55.25
3,2174,62,44,0.29,91,25.2,10.2,51.13,105.6,0.35,...,62.3,53,54,54,56,57,1,4,0,55.25
4,2174,62,40,0.0,89,11.6,3.5,130.0,268.4,0.29,...,62.3,50,52,52,55,57,1,5,0,54.0


In [4]:
dataset.columns

Index(['Station Id', 'Air Temperature Maximum (degF)',
       'Air Temperature Minimum (degF)', 'Precipitation Increment (in)',
       'Relative Humidity (pct) Mean of Hourly Values',
       'Wind Speed Maximum (mph) Max of Hourly Values',
       'Wind Speed Average (mph) Mean of Hourly Values',
       'Solar Radiation Average (watt/m2) Mean of Hourly Values',
       'Solar Radiation/langley Total (langley)',
       'Vapor Pressure - Partial (inch_Hg) Mean of Hourly Values',
       'Vapor Pressure - Saturated (inch_Hg) Mean of Hourly Values',
       'Soil Moisture Percent -2in (pct) Mean of Hourly Values',
       'Soil Moisture Percent -4in (pct) Mean of Hourly Values',
       'Soil Moisture Percent -8in (pct) Mean of Hourly Values',
       'Soil Moisture Percent -20in (pct) Mean of Hourly Values',
       'Soil Moisture Percent -40in (pct) Mean of Hourly Values',
       'Soil Temperature Observed -2in (degF) Mean of Hourly Values',
       'Soil Temperature Observed -4in (degF) Mean of 

In [66]:
dataset = dataset.dropna()

In [67]:
X_train = dataset[[ 'Air Temperature Maximum (degF)',
       'Air Temperature Minimum (degF)', 'Precipitation Increment (in)',
       'Relative Humidity (pct) Mean of Hourly Values',
       'Wind Speed Maximum (mph) Max of Hourly Values',
       'Wind Speed Average (mph) Mean of Hourly Values',
       'Vapor Pressure - Partial (inch_Hg) Mean of Hourly Values',
       'Vapor Pressure - Saturated (inch_Hg) Mean of Hourly Values',
            'Soil Temperature']]
y_train = dataset['Soil Moisture Percent -2in (pct) Mean of Hourly Values']

In [68]:
X_test=pd.read_csv('test.csv')

In [69]:
X_test

Unnamed: 0,Air Temperature Maximum (degF),Air Temperature Minimum (degF),Precipitation Increment (in),Relative Humidity (pct) Mean of Hourly Values,Wind Speed Maximum (mph) Max of Hourly Values,Wind Speed Average (mph) Mean of Hourly Values,Vapor Pressure - Partial (inch_Hg) Mean of Hourly Values,Vapor Pressure - Saturated (inch_Hg) Mean of Hourly Values,Soil Temperature
0,89.492,75.002,0.0,79.291667,4.41,3.03,0.3,0.3,54


In [70]:
def ModelLearning(X, y):

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)

    # Generate the training set sizes increasing by 50
    train_sizes = np.rint(np.linspace(1, X.shape[0]*0.8 - 1, 9)).astype(int)

    fig = plt.figure(figsize=(15,10))

    for k, depth in enumerate([1,3,6,10]):

        # Create a random forest regressor
        regressor = RandomForestRegressor(max_depth = depth)

        # Calculate the training and testing scores
        sizes, train_scores, test_scores = learning_curve(regressor, X, y, \
            cv = cv, train_sizes = train_sizes, scoring = 'r2')

        # Find the mean and standard deviation for smoothing
        train_std = np.std(train_scores, axis = 1)
        train_mean = np.mean(train_scores, axis = 1)
        test_std = np.std(test_scores, axis = 1)
        test_mean = np.mean(test_scores, axis = 1)

        # Subplot the learning curve
        ax = fig.add_subplot(2, 2, k+1)
        ax.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score')
        ax.plot(sizes, test_mean, 'o-', color = 'purple', label = 'Testing Score')
        ax.fill_between(sizes, train_mean - train_std, \
            train_mean + train_std, alpha = 0.15, color = 'r')
        ax.fill_between(sizes, test_mean - test_std, \
            test_mean + test_std, alpha = 0.15, color = 'purple')

        ax.set_title('max_depth = %s'%(depth))
        ax.set_xlabel('Training Set Size')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0]*0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad = 0.)
    fig.suptitle('Random Forest Regressor Learning Performances', fontsize = 16, y = 1.03)
    fig.tight_layout()
    fig.show()

In [71]:
regressor = RandomForestRegressor(n_estimators = 100)

In [72]:
from sklearn.model_selection import validation_curve

def ModelComplexity(X, y):

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)

    # Vary the max_depth parameter from 1 to 10
    max_depth = np.arange(1,11)

    # Calculate the training and testing scores
    train_scores, test_scores = validation_curve(RandomForestRegressor(), X, y, \
        param_name = "max_depth", param_range = max_depth, cv = cv, scoring = 'r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    plt.figure(figsize=(15,10))
    plt.title('Random Forest Regressor Complexity Performance')
    plt.plot(max_depth, train_mean, 'o-', color = 'r', label = 'Training Score')
    plt.plot(max_depth, test_mean, 'o-', color = 'g', label = 'Validation Score')
    plt.fill_between(max_depth, train_mean - train_std, \
        train_mean + train_std, alpha = 0.15, color = 'r')
    plt.fill_between(max_depth, test_mean - test_std, \
        test_mean + test_std, alpha = 0.15, color = 'g')

    # Visual aesthetics
    plt.legend(loc = 'lower right')
    plt.xlabel('Maximum Depth')
    plt.ylabel('Score')
    plt.ylim([-0.05,1.05])
    plt.show()

In [73]:
def fit_model(X, y):

    cv_sets = ShuffleSplit(test_size = 0.20, random_state = 0) 
    
    params = {'n_estimators':[100, 120, 140], 
              'min_samples_leaf':[1, 2, 3],
              'max_depth':list(range(1, 20)),
              'max_features':[0.05, 0.1, 0.15, 0.2]}

    n_iter_search = 20
    regressor = RandomForestRegressor()
    score = make_scorer(r2_score)
    grid = RandomizedSearchCV(regressor, params, n_iter = n_iter_search, scoring = score, cv = cv_sets)
    grid = grid.fit(X, y)
    # Return the optimal model
    return grid.best_estimator_

In [74]:
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

In [75]:
print("Parameter are {} for the optimal model.".format(reg.get_params()))

Parameter are {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 13, 'max_features': 0.1, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 120, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} for the optimal model.


In [76]:
y_opt_pred = reg.predict(X_test)

In [78]:
y_opt_pred


array([49.18166667])