In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
import os
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import math

# Functions for random forest classifiers

def train_predict_random_forest_regressor(n_estimators, n_jobs, train, target, test):
    """
    Function to train random forest algorithm
    """

    rf = RandomForestRegressor(n_estimators = n_estimators, n_jobs = n_jobs, oob_score = True)
    print("Training random forest regressor model ...")
    rf.fit(train, target)

    pred_prob_array = rf.predict(test)
    print("Predicting using random forest model (regression)...")
    #[x for x in pred_prob_array]
    #print([x[1] for x in pred_prob_array])

    # Statistics and important features of fit
    print("Statistics and important features of fit\n")
    print(rf.estimators_) # list of DecisionTreeRegressor, The collection of fitted sub-estimators.

    print("Important features\n")
    print(rf.feature_importances_) # : array of shape = [n_features] The feature importances (the higher, the more important the feature).

    print("Number of features\n")
    print(rf.n_features_) #: int The number of features when fit is performed.

    print("The number of outputs when fit is performed\n")
    print(rf.n_outputs_) # : int The number of outputs when fit is performed.

    print("OOB score\n")
    print(rf.oob_score_) # : float Score of the training dataset obtained using an out-of-bag estimate.

    #print(rf.oob_prediction)

    return rf, pred_prob_array


def train_predict_random_forest_classifier(n_estimators, n_jobs, train, target, test):
    """
    Function to train random forest algorithm and also predict
    """

    rf = RandomForestClassifier(n_estimators = n_estimators, n_jobs = n_jobs, oob_score = True)
    print("Training random forest model ...")
    rf.fit(train, target)

    pred_prob_array = rf.predict_proba(test)
    print("Predicting using random forest model ...")
    [x[1] for x in pred_prob_array]
    # print([x[1] for x in pred_prob_array])

    return rf, pred_prob_array



data = pd.read_csv('aggregated_timeseries.csv')



In [3]:
print(data.describe())

             count  DAY_OF_WEEK  ATMOSPH_COND
count  3759.000000  3759.000000   3759.000000
mean     38.150572     3.902634      3.391061
std       8.525746     2.012888      3.557673
min       1.000000     0.000000      1.000000
25%      32.000000     2.000000      1.000000
50%      38.000000     4.000000      1.000000
75%      44.000000     6.000000      9.000000
max      79.000000     7.000000      9.000000


In [4]:
# first column is targe
target = data.iloc[0:, 0]

In [5]:
# Convert target to numeric type
target = pd.to_numeric(target)

In [6]:
# all other columns are training set
train = data.iloc[0:, 1:]

# Convert target to numeric type
train = train.apply(lambda x: pd.to_numeric(x) )

In [8]:
# train random forest model
print("Training random forest model ...")

test = np.matrix([[2, 1], [3, 1], [4, 1], [5, 1], [6, 1], [7, 1]  ])

Training random forest model ...


In [9]:
fit_rf_model, pred_prob_array = train_predict_random_forest_regressor(n_estimators=100, n_jobs=2,
                                                                train=train, target=target, test = test)

Training random forest regressor model ...
Predicting using random forest model (regression)...
Statistics and important features of fit

[DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False,
           random_state=1038941789, splitter='best'), DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False,
           random_state=1937038525, splitter='best'), DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False,
           random_state=1665010902, splitter='best'), DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
        

In [10]:
print(pred_prob_array)

[ 35.28010564  37.50122204  37.98302406  40.7272273   40.83695031
  38.26492146]
