# Regression Modeling

## IMPORTS

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## LOAD DATA

In [None]:
# Location of the dataset -- placeholder text, replace it before running.
filename = "insert file name here"
# CSV is used here; pandas also offers read_json, read_sql, read_pickle,
# read_html, etc. for other formats.
df = pd.read_csv(filepath_or_buffer=filename)
# Preview the first five rows (a notebook shows a cell's last expression).
df.head(n=5)

## SPLIT INTO TRAIN/TEST
- Shuffle the data with a fixed seed (`random_state=42`) so the split is reproducible
- Train on 80% of the data
- Test on the remaining 20%

In [None]:
# Placeholders to fill in before running:
#   X -- features; should be a matrix (or a vector if there is only 1 feature)
#   y -- target; should be a vector
X, y = [], []
# Optional scaling, if necessary:
#   X_std = StandardScaler().fit_transform(X)
#   y_std = StandardScaler().fit_transform(y)
# NOTE(review): train_test_split is imported from sklearn.cross_validation in
# this file; newer scikit-learn releases expose it from sklearn.model_selection
# instead -- confirm against the installed version.
split_options = dict(test_size=0.20, random_state=42)  # 80/20 split, fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## MACHINE LEARNING ALGORITHMS

In [None]:
# Candidate regressors keyed by the label printed when scores are reported.
# Every estimator is constructed with its default arguments.
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Elastic_Net': ElasticNet(),
    'Linear_Regression': LinearRegression(),
    'Random_Forest': RandomForestRegressor(),
    'Gradient_Boost': GradientBoostingRegressor(),
}

## TRAIN & SCORE MODELS

In [None]:
# Fit every candidate model on the training split and report two numbers:
#   TRAIN_SCORE -- mean of the 4-fold cross-validation scores on the training data
#   TEST_SCORE  -- the fitted model's score() on the held-out test split
# NOTE: y_pred is overwritten on each iteration, so after the loop it holds the
# predictions of whichever model was processed last.
for name, model in models.items():
    results = model.fit(X_train, y_train)  # fit that model; the return value is used as the fitted estimator
    y_pred = results.predict(X_test)
    train_score = np.mean(cross_val_score(model, X_train, y_train, cv=4))  # 4 fold cross validation
    test_score = results.score(X_test, y_test)
    # print() with one parenthesised argument emits the same text on Python 2
    # and Python 3; the bare print statement is a SyntaxError on Python 3.
    print('MODEL:{} \t TRAIN_SCORE:{} \t TEST_SCORE:{}'.format(name, train_score, test_score))

## MODEL EVALUATION

In [None]:
def plot_analysis(name,y_true,y_pred):
    """Draw two side-by-side diagnostic scatter plots for a regression model.

    Left panel: residuals (``y_true - y_pred``) against the predictions, with
    a dashed horizontal line at zero.
    Right panel: predictions against the true values, with a dashed
    ``y = x`` reference line spanning the range of ``y_true``.

    Parameters
    ----------
    name : label inserted into both panel titles.
    y_true : array-like of observed target values.
    y_pred : array-like of predicted values, same number of elements as
        ``y_true``.

    Returns
    -------
    None. The plots are drawn on a newly created matplotlib figure.
    """
    # Work on flat NumPy arrays so the subtraction is strictly positional
    # (no pandas index alignment, no accidental (n, 1) vs (n,) broadcasting).
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    figs, axs = plt.subplots(ncols=2, nrows=1)
    figs.set_figwidth(15)
    figs.set_figheight(10)

    # --- Residual vs. predicted ---
    ax = axs[0]
    # Signed residual; subtracting absolute values gives wrong results
    # whenever the true and predicted values differ in sign.
    residual = y_true - y_pred
    ax.scatter(y_pred, residual)
    # Zero-residual reference across the whole x axis (the predicted values),
    # rather than across the residual range.
    ax.axhline(0, linestyle='--', color='r')
    ax.set_title('Residual VS. Predicted (MODEL:{})'.format(name))
    ax.set_xlabel('Predicted Yield')
    ax.set_ylabel('Residual')
    ax.grid(True)

    # --- Predicted vs. true ---
    ax = axs[1]
    ax.scatter(y_true, y_pred)
    if y_true.size:  # min()/max() are undefined for empty input
        bounds = [y_true.min(), y_true.max()]
        ax.plot(bounds, bounds, '--r')  # perfect-prediction line y = x
    ax.set_title('Predicted VS. True (MODEL:{})'.format(name))
    ax.set_xlabel('True Yield')
    ax.set_ylabel('Predicted Yield')
    ax.grid(True)