## Runs Variable and KFold Cross Validation with MLR

In [1]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split
from statsmodels.tools import add_constant
from statsmodels.regression.linear_model import RegressionResults
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import statsmodels.api as sm
import pickle
import os
from sklearn import preprocessing
from sklearn.metrics import r2_score
sns.set(style="ticks")
%matplotlib inline

## Read in data frame

In [66]:
'''Read in the data on which the variable selection will be performed.'''
#Round 1
#pm = pd.read_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/Round_1_Data/finished_pm.csv")
#Round 2 Data
#PM2.5
# pm = pd.read_csv("finished_pmweather.csv")
#SO 2
# pm = pd.read_csv("finished_so2weather.csv")
#NO 2
pm = pd.read_csv("merged_so2weather.csv")

In [67]:
pm.shape ##311,19 NO2

(1032, 19)

In [68]:
'''Method that splits the DataFrame into train and test sets.'''
train, test = train_test_split(pm, test_size=.30, random_state=0)

'''Scales y data (ppm values)by multiplying by 100. Otherwise, ppm values would be too small to properly model.'''
#PM 2.5 - Label is 'ppm'
y_train = train['ppm'].values * 100
y_test = test['ppm'].values * 100

all_predictors=['forest', 'open_land', 'water',
       'wetland', 'transitional', 'urban_public_institution', 'commercial',
       'transportation', 'crop_land', 'medium_density_residential',
       'industrial', 'outdoor_temperature', 'solar_radiation',
       'wind_speed_resultant']
#NO2 and SO2 - Label is "Arithmetic Mean"
#y_train = train['ppm'].values * 100
#y_test = test['ppm'].values * 100

In [69]:
X_train = add_constant(train[all_predictors].values, has_constant = "add")
X_test = add_constant(test[all_predictors].values, has_constant='add')

In [70]:
'''Method that takes in the x and y training sets, fits a linear model, and returns the training R^2'''
def get_Rsquared(x_train, y_train, x_test, t_test):
    model = LinearRegression(fit_intercept=True)
    model.fit(x_train, y_train)
    train_r_sq = model.score(x_train, y_train)
    test_r_sq = model.score(x_test, y_test)
    return train_r_sq, test_r_sq

In [71]:
'''Method that takes in the x and y, training AND testing sets. Fits a linear model on the training sets, then returns the
testing R^2 of the model.'''

#this isn't necessary. You can get the rsquared all together in the R_squared function above
def get_test_Rsquared(X_train, y_train, X_test, y_test):
    model = LinearRegression(fit_intercept=True)
    model.fit(X_train, y_train)
    print (X_test.shape, y_test.reshape(94,1).shape)
    test_r_sq = model.score(X_test, y_test.reshape(94,1))
    return test_r_sq
    

## Variable Selection and K-Fold Cross Validation

In [72]:
""" R^2 variable selection with k-fold cross validation.
""" 
regression_model = LinearRegression(fit_intercept=False)
kf = KFold(n_splits=8)

#Begins by considering the set with all possible predictors together
#X_train = add_constant(train[all_predictors].values, prepend=True)
#X_test = add_constant(test[all_predictors].values, prepend=True)
train_rsq, test_rsq = get_Rsquared(X_train, y_train, X_test, y_test)
#test_rsq = get_test_Rsquared(X_train, y_train, X_test, y_test)
print ("Original train and test r^2:", train_rsq, test_rsq)

#K-fold cross validation
valscores = []
for train_index, val_index in kf.split(X_train):
    X_training, X_valid = X_train[train_index], X_train[val_index]
    y_training, y_valid = y_train[train_index], y_train[val_index]

    regression_model.fit(X_training, y_training)
    valscores.append(regression_model.score(X_valid, y_valid))

Original train and test r^2: 0.445261443859 0.337524169459


In [73]:
our_tuple = (all_predictors, train_rsq, test_rsq, np.mean(valscores))
predictor_sets = [list(our_tuple)]

#Removes a predictor, examines all possible combinations of remaining predictors
for k in range(len(all_predictors), 1, -1):   
    best_k_predictors = predictor_sets[-1][0] 
    train_R_squares = []
    test_R_squares = []
    x_val_scores = []
    
    for predictor in best_k_predictors:
        k_minus_1 = list(set(best_k_predictors) - set([predictor]))
        X_train = train[k_minus_1].values
        X_test = test[k_minus_1].values
        
        train_rsq, test_rsq = get_Rsquared(X_train, y_train, X_test, y_test)
        train_R_squares.append(train_rsq)
        test_R_squares.append(test_rsq)
        
        #K-fold cross-validation
        validation_R_sqs = []
        for train_index, val_index in kf.split(X_train):
            X_training, X_valid = X_train[train_index], X_train[val_index]
            y_training, y_valid = y_train[train_index], y_train[val_index]

            regression_model.fit(X_training, y_training)
            validation_R_sqs.append(regression_model.score(X_valid, y_valid))
        
        x_val_scores.append(np.mean(validation_R_sqs))
    
    
    max_test = max(test_R_squares)
    ind = test_R_squares.index(max_test)
    max_valscore = max(x_val_scores)
    ind2 = x_val_scores.index(max_valscore)
    
    #Appends to predictor_sets the set of best performing model with the given numer of parameters being examined
    #Also appends the training R^2, testing R^2 and validation score R^2
    best_k_minus_1 = list(set(best_k_predictors) - set([best_k_predictors[ind2]]))
    our_tuple = (best_k_minus_1, np.max(train_R_squares), max_test, max_valscore )
    predictor_sets.append(list(our_tuple))

print("done")

done


In [74]:
'''Holds the best model by each number of predictors.
predictor_sets[x][0]: list which has the combination of most significant predictors for a model with that number of predictors
    (i.e, a list with 7 predictors in it represents the best model of 7 predictors)
predictor_sets[x][1]: training R^2 of that model
predictor_sets[x][2]: testing R^2 of that model
predictor_sets[x][3]: validation R^2 of the model
'''
predictor_sets

[[['forest',
   'open_land',
   'water',
   'wetland',
   'transitional',
   'urban_public_institution',
   'commercial',
   'transportation',
   'crop_land',
   'medium_density_residential',
   'industrial',
   'outdoor_temperature',
   'solar_radiation',
   'wind_speed_resultant'],
  0.44526144385897043,
  0.33752416945905528,
  0.40707198419229074],
 [['commercial',
   'transportation',
   'industrial',
   'solar_radiation',
   'wind_speed_resultant',
   'open_land',
   'water',
   'urban_public_institution',
   'medium_density_residential',
   'wetland',
   'crop_land',
   'forest',
   'transitional'],
  0.44521146790168742,
  0.33922255088603448,
  0.40876703243456047],
 [['transportation',
   'industrial',
   'commercial',
   'solar_radiation',
   'wind_speed_resultant',
   'open_land',
   'water',
   'urban_public_institution',
   'medium_density_residential',
   'wetland',
   'crop_land',
   'forest'],
  0.44345131844399788,
  0.3329735108932701,
  0.41109180656200806],
 [['ind

In [75]:
'''Selects the best of these "best models" by validation R^2. '''
best_predictor = sorted(predictor_sets, key=lambda t: t[3])[-1]
best_predictor

[['industrial',
  'transportation',
  'commercial',
  'solar_radiation',
  'wind_speed_resultant',
  'water',
  'urban_public_institution',
  'medium_density_residential',
  'forest'],
 0.43911778874382329,
 0.37391837406071604,
 0.41662429992918526]

In [76]:
'''Makes predictions on the data set, using the specific parameters of the best model.'''
x_train1 = add_constant(train[best_predictor[0]], prepend=True)
x_test1 = add_constant(test[best_predictor[0]], prepend=True)

x1 = add_constant(pm[best_predictor[0]], prepend=True)

best_model = LinearRegression(fit_intercept=True)
best_model.fit(x_train1, y_train)
predictions = best_model.predict(x1)

In [77]:
mean_squared_error(pm['ppm'].values, predictions)
# mean_squared_error(pm['Arithmetic Mean'].values, predictions)

0.44337121068556123

In [143]:
predictions

array([ 4.69219694,  4.69219694,  4.69219694, ...,  4.69219694,
        4.69219694,  4.69219694])