# MODEL OPTIMIZATION

In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, RANSACRegressor
from sklearn.cross_validation import train_test_split, cross_val_score
from xgboost import XGBRegressor #faster version of gradient boosting regressor
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
% matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## LOAD DATA

In [2]:
# Read the supervised wheat data set into the notebook-wide frame `df`.
df = pd.read_csv('data/wheat-2013-supervised-edited.csv')

# Columns to discard: the two coordinate columns plus whatever the first
# column of the file is.
drop_cols = ['Latitude', 'Longitude', df.columns[0]]
df = df.drop(drop_cols, axis=1)

# Preview the first rows of the remaining columns.
df.head()

Unnamed: 0,apparentTemperatureMax,apparentTemperatureMin,cloudCover,dewPoint,humidity,precipIntensity,precipIntensityMax,precipProbability,precipAccumulation,precipTypeIsRain,precipTypeIsSnow,pressure,temperatureMax,temperatureMin,visibility,windBearing,windSpeed,NDVI,DayInSeason,Yield
0,35.7,20.85,0.0,29.53,0.91,0.0,0.0,0.0,0.0,0,0,1027.13,35.7,27.48,2.46,214,1.18,134.110657,0,35.7
1,35.1,26.92,0.0,29.77,0.93,0.0001,0.0019,0.05,0.0,1,0,1026.87,35.1,26.92,2.83,166,1.01,131.506592,0,35.7
2,33.38,26.95,0.0,29.36,0.94,0.0001,0.0022,0.06,0.02,0,1,1026.88,33.38,26.95,2.95,158,1.03,131.472946,0,35.7
3,28.05,25.93,0.91,29.47,0.94,0.0002,0.0039,0.15,0.036,0,1,1026.37,33.19,27.17,2.89,153,1.84,131.2883,0,35.7
4,28.83,25.98,0.91,29.86,0.94,0.0003,0.0055,0.24,0.0,1,0,1026.19,33.85,27.07,2.97,156,1.85,131.2883,0,35.7


## OPTIMIZE MODELS

In [4]:
# Load the feature-set dictionary that `optimizer` indexes as
# sbs_dict[name][k].
# NOTE: pickle.load can execute arbitrary code; only open pickle files that
# come from a trusted source.
with open('sbs_feat_set.plk', mode='rb') as feat_file:
    sbs_dict = pickle.load(feat_file)

In [5]:
def optimizer(name,*params):
    """Fit one candidate regressor and report its CV and hold-out scores.

    Reads two notebook-level globals: ``df`` (every column but the last is a
    feature, the last column is the target) and ``sbs_dict`` (indexed as
    ``sbs_dict[name][k]`` to obtain the feature column positions to use;
    presumably produced by an earlier feature-selection step -- verify
    against the notebook that wrote the pickle).

    Parameters
    ----------
    name : str
        One of 'Linear', 'Random Forest' or 'Gradient Boost'.
    *params
        Hyper-parameters of the chosen model:
        'Linear' -> ``(min_samples,)`` for the RANSAC wrapper;
        'Random Forest' and 'Gradient Boost' -> ``(n_estimators, max_depth)``.

    Returns
    -------
    tuple
        ``(train_score, test_score)``: the mean of the 8-fold cross-validation
        scores on the training split, and the fitted model's ``score`` on the
        25% hold-out split.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    if name == 'Linear':
        # The estimator is passed positionally because its keyword name
        # differs between scikit-learn releases (base_estimator / estimator).
        k, model = 8, RANSACRegressor(LinearRegression(),max_trials=100,min_samples=params[0])
    elif name == 'Random Forest':
        k, model = 4, RandomForestRegressor(n_estimators=params[0],max_depth=params[1],random_state=42)
    elif name == 'Gradient Boost':
        k, model = 8, XGBRegressor(n_estimators=params[0],max_depth=params[1]) #XG Boost (same as gradient boost)
    else:
        # Previously an unknown name fell through and crashed later with an
        # UnboundLocalError on `k` / `model`.
        raise ValueError("Unknown model name: {!r}; expected 'Linear', "
                         "'Random Forest' or 'Gradient Boost'".format(name))

    # Positional indexing with .iloc (the removed .ix indexer is avoided) and
    # plain ndarrays instead of np.matrix.
    feature_idx = list(sbs_dict[name][k])
    X = df.iloc[:,:-1].values[:,feature_idx]
    y = df.iloc[:,-1].values.astype(float)

    # Split BEFORE scaling so that the hold-out rows do not contribute to the
    # scaler statistics (avoids leaking test information into training).
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

    x_scaler = StandardScaler().fit(X_train)
    X_train = x_scaler.transform(X_train)
    X_test = x_scaler.transform(X_test)

    # StandardScaler expects 2-D input: scale the target as a single column
    # and flatten it back to 1-D for the estimators.
    y_scaler = StandardScaler().fit(y_train.reshape(-1,1))
    y_train = y_scaler.transform(y_train.reshape(-1,1)).ravel()
    y_test = y_scaler.transform(y_test.reshape(-1,1)).ravel()

    results = model.fit(X_train,y_train)
    train_score = np.mean(cross_val_score(results,X_train,y_train,cv=8))
    test_score = results.score(X_test,y_test)
    return train_score, test_score

In [6]:
name = 'Linear'
min_samp_size = 70000
max_samp_size = 100000

# Candidate RANSAC sample sizes: min..max inclusive, in steps of 10000.
sample_sizes = np.arange(min_samp_size,max_samp_size+1,10000)

# One (train, test) score pair per candidate sample size.
param_list = []
train_scores = []
test_scores = []
for param in sample_sizes:
    train_score, test_score = optimizer(name,param)
    train_scores.append(train_score)
    test_scores.append(test_score)
    param_list.append(param)

# Plot both score curves against the sample size.
plt.figure(figsize=(10,7))
for curve, symbol in ((train_scores,'^'), (test_scores,'o')):
    plt.plot(param_list,curve,marker=symbol)
plt.legend(['Train','Test'],loc=0)
plt.title('Linear Regression')
plt.xlabel('RANSAC Sample Size')
plt.ylabel('Score')
# plt.axis([min_samp_size,max_samp_size,0,1])
plt.grid(True)
plt.show()

KeyboardInterrupt: 

In [None]:
name = 'Random Forest'
test_scores = []
train_scores= []
param_list = []
min_trees,max_trees = 20,100
# Only the two end points are unpacked; the sweep below covers every depth
# from min_depth to max_depth inclusive (4, 5 and 6).
min_depth,max_depth = 4,6
depth_values = np.arange(min_depth,max_depth+1,1)

# Outer loop: number of trees. Inner loop: tree depth.
# Each outer iteration appends one row holding the scores for every depth.
for param1 in np.arange(min_trees,max_trees+1,20):
    bin1 = []
    bin2 = []
    param_list.append(param1)
    for param2 in depth_values:
        train_score, test_score = optimizer(name,param1,param2)
        bin1.append(train_score)
        bin2.append(test_score)
    train_scores.append(np.array(bin1))
    test_scores.append(np.array(bin2))

# Rows are tree counts and columns are depths, so each plotted line is one
# depth; the legend labels must therefore start at min_depth, not at 1.
depth_labels = ['n_depth={}'.format(n) for n in depth_values]

fig,axs = plt.subplots(nrows=1,ncols=2)
fig.set_figheight(7)
fig.set_figwidth(15)

# Left panel: cross-validated training scores. Right panel: hold-out scores.
panels = [(axs[0],train_scores,'^','Random Forest (Train)'),
          (axs[1],test_scores,'o','Random Forest (Test)')]
for ax,scores,marker,title in panels:
    ax.plot(param_list,scores,marker=marker)
    ax.legend(depth_labels,loc=3)
    ax.set_title(title)
    ax.axis([min_trees,max_trees,0,1])
    ax.set_xlabel('n Trees')
    ax.set_ylabel('Score')
    ax.grid(True)

In [None]:
name = 'Gradient Boost'
test_scores = []
train_scores= []
param_list = []
min_trees,max_trees = 100,500
# Only the two end points are unpacked; the sweep below covers every depth
# from min_depth to max_depth inclusive (4, 5 and 6).
min_depth,max_depth = 4,6
depth_values = np.arange(min_depth,max_depth+1,1)

# Outer loop: number of boosting rounds. Inner loop: tree depth.
# Each outer iteration appends one row holding the scores for every depth.
for param1 in np.arange(min_trees,max_trees+1,100):
    bin1 = []
    bin2 = []
    param_list.append(param1)
    for param2 in depth_values:
        train_score, test_score = optimizer(name,param1,param2)
        bin1.append(train_score)
        bin2.append(test_score)
    train_scores.append(np.array(bin1))
    test_scores.append(np.array(bin2))

# Rows are tree counts and columns are depths, so each plotted line is one depth.
depth_labels = ['n_depth={}'.format(n) for n in depth_values]

fig,axs = plt.subplots(nrows=1,ncols=2)
fig.set_figheight(7)
fig.set_figwidth(15)

# Left panel: cross-validated training scores. Right panel: hold-out scores.
panels = [(axs[0],train_scores,'^','XG Boost (Train)'),
          (axs[1],test_scores,'o','XG Boost (Test)')]
for ax,scores,marker,title in panels:
    ax.plot(param_list,scores,marker=marker)
    ax.legend(depth_labels,loc=3)
    ax.set_title(title)
    ax.axis([min_trees,max_trees,0,1])
    ax.set_xlabel('n Trees')
    ax.set_ylabel('Score')
    ax.grid(True)

In [None]:
#

In [None]:
# model = ExtraTreesRegressor()
# params = {'n_estimators':[30,40,50,60,70,80,90,100]}
# grid_ = GridSearchCV(model, params)
# grid_.fit(X_train, y_train)
# best_model = grid_.best_estimator_
# best_model

## PICKLE MODEL

I like to pickle my models, along with anything else that may be needed in a production pipeline later on.

In [None]:
# filename = 'wheat_ET_model.plk'
# with open(filename, 'wb') as f:
#     pickle.dump(best_model, f)

## References

https://www.researchgate.net/post/How_to_determine_the_number_of_trees_to_be_generated_in_Random_Forest_algorithm