In [69]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split
from statsmodels.tools import add_constant
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import statsmodels.api as sm
import pickle
import os
from sklearn import preprocessing
sns.set(style="ticks")
%matplotlib inline

In [70]:
no2 = pd.read_csv("merged_no2weather.csv")
so2 = pd.read_csv("merged_so2weather.csv")

In [71]:
#scale data
del no2["Latitude"]
del no2["Longitude"]
del so2["Latitude"]
del so2["Longitude"]

no2_scaled = no2.iloc[:, 3:]
no2_scaled = preprocessing.minmax_scale(no2_scaled)
no2_scaled = pd.DataFrame(no2_scaled)
add = no2.iloc[:, :3]
no2_scaled = add.join(no2_scaled)
cols = list(no2.columns)
no2_scaled.columns = cols

#splits train, test sets
train_no2, test_no2 = train_test_split(no2_scaled, test_size=.30, random_state=0)
#Scales y data (ppm values)
y_train_no2 = train_no2['ppm'].values * 100
y_test_no2 = test_no2['ppm'].values * 100
test_no2 = test_no2.iloc[:, 3:]

In [72]:
#scale data
so2_scaled = so2.iloc[:, 3:]
so2_scaled = preprocessing.minmax_scale(so2_scaled)
so2_scaled = pd.DataFrame(so2_scaled)
add = so2.iloc[:, :3]
so2_scaled = add.join(so2_scaled)
cols = list(so2.columns)
so2_scaled.columns = cols

#splits train, test sets
train_so2, test_so2 = train_test_split(so2_scaled, test_size=.30, random_state=0)
#Scales y data (ppm values)
y_train_so2 = train_so2['ppm'].values * 100
y_test_so2 = test_so2['ppm'].values * 100
test_so2 = test_so2.iloc[:, 3:]

In [73]:
'''Method that takes in a list of all the predictors left in the model, and the model, 
spits out the lowest p valued predictor and lowest p value'''
def find_low_p(all_preds, model_OLS):
    #initiates lowest p value
    maxp = max(model_OLS.pvalues[1:])  
    index = list(model_OLS.pvalues[1:]).index(maxp)
    lowestlu = all_preds[index]

    return maxp , lowestlu

In [74]:
"""Input: y data set, list of predictor variables
Action: X data set formed, runs a MLR through data set, computes pvalue, deletes predictor with least pvalue
returns: models - one for each new data set created after deleting
"""
def backwards_var_sel(y, all_predictors, train):
    models = [] #to hold all the models
    x = train[all_predictors].values
    x = add_constant(x, has_constant='add')
    indices = []
    copy = all_predictors

    for i in range(len(copy)):
        #fit a MLR to data set
        OLS_model = sm.OLS(y, x).fit()

        #finds lowest pvalue and index of lowest pvalue, excluding constant term

        maxp, lowestlu = find_low_p(all_predictors, OLS_model)
        #removes variable with lowest p value
        index_td = copy.index(lowestlu)
        indices.append(index_td)
        index = all_predictors.index(lowestlu)
        all_predictors = all_predictors[:index] + all_predictors[index+1 :]
        
        #updates list of x values, not including variable with lowest pvalue
        x = train[all_predictors].values
        x = add_constant(x, prepend=True)
        #save new model in a list
        models.append(OLS_model)
        
        
    return models, indices

predictors = ['forest',
       'open_land', 'water', 'wetland', 'transitional',
       'urban_public_institution', 'commercial', 'transportation', 'crop_land',
       'medium_density_residential', 'industrial', 
       'outdoor_temperature', 'solar_radiation',
       'wind_speed_resultant']

#models, index_ls = backwards_var_sel(y_train_no2, predictors, train_no2)
models, index_ls = backwards_var_sel(y_train_so2, predictors, train_so2)

In [62]:
best_models = []
index_of_model = []
for i in range(len(models)):
    if np.average(models[i].pvalues[1:]) < .005:
        best_models.append(models[i])
        #print(np.average(models[i].pvalues[1:]))
        index_of_model.append(i)
model2_pos = index_of_model[0]
#must subtract 1. index ls tells what column to remove, but models list includes the model with no column removed at all
new_index_list = index_ls[:model2_pos]
test1 = test_so2.drop(test_so2.columns[new_index_list], axis=1)
best_models[0].summary()
test1.columns
#test1 = add_constant(test1, has_constant = "add")
#np.mean((best_models[0].predict(test1)-y_test_so2)**2)
##linmodel = LinearRegression(fit_intercept=True)
#linmodel.fit(test1,y_test_so2)
#linmodel.score(test1, y_test_so2)

Index(['commercial', 'transportation', 'medium_density_residential',
       'industrial', 'solar_radiation'],
      dtype='object')

In [63]:
best_models = []
index_of_model = []
for i in range(len(models)):
    if np.average(models[i].pvalues[1:]) < .00001:
        best_models.append(models[i])
        #print(np.average(models[i].pvalues[1:]))
        index_of_model.append(i)
model2_pos = index_of_model[0]
#must subtract 1. index ls tells what column to remove, but models list includes the model with no column removed at all
new_index_list = index_ls[:model2_pos]
test1 = test_no2.drop(test_no2.columns[new_index_list], axis=1)
best_models[0].summary()
#test1 = add_constant(test1)
#np.mean((best_models[0].predict(test1)-y_test_no2)**2)
#linmodel = LinearRegression(fit_intercept=True)
#linmodel.fit(test1,y_test_no2)
#linmodel.score(test1, y_test_no2)
test1.columns

Index(['transportation', 'industrial', 'solar_radiation'], dtype='object')

In [87]:
predictors = ['forest',
       'open_land', 'water', 'wetland', 'transitional',
       'urban_public_institution', 'commercial', 'transportation', 'crop_land',
       'medium_density_residential', 'industrial',
       'outdoor_temperature', 'solar_radiation',
       'wind_speed_resultant']

"""Input: y data set, list of predictor variables
Action: X data set formed, runs a MLR through data set, computes AIC, deletes predictor with maximum AIC
returns: models - list of remaining predictors after each iteration
"""
def backwards_var_sel_aic(y, all_predictors, train):
    preds = [] #to hold all the list of predictors
    copy = all_predictors #keeps copy of full predictors list #holds AIC values for each iteration
    models = []
    indices = []
    for j in range(len(copy)):
        t = all_predictors
        AIC = []
        for i in range(len(t)):
            #removes one predictor in each iteration
            predictors = all_predictors[:i] + all_predictors[i+1 :]
            #print(predictors)
            #train and fit a model on those predictors
            x = train[predictors].values
            x = add_constant(x, has_constant='add')
            OLS_model = sm.OLS(y, x).fit()

            #append AIC value to AIC list for each iteration
            AIC.append(OLS_model.aic)
        #finds the largest AIC and the index at which it exists
        worstAIC = max(AIC)
        index = AIC.index(max(AIC))
        worstLU = all_predictors[index]
        indices.append(copy.index(worstLU))
        #reassign all_predictors to a new list with the predictor corresponding to the worst AIC removed
        all_predictors = all_predictors[:index] + all_predictors[index+1 :]
        #append a list of the remaining predictors to the models list
        preds.append(all_predictors)
        models.append(OLS_model)
        
    return models, indices, preds


models, indices, predicts = backwards_var_sel_aic(y_train_so2, predictors, train_so2)

In [89]:
# relevant for 
best_models = []
for i in range(len(models)):
    #find all models where the average pvalue is less than 0.09
    best_models.append(models[i].aic)
m = min(best_models)
#finds minimum aic among each model
#gets the index of every instance where the minimum AIC occurs
index_of_model = best_models.index(m)
models[0].summary()
test2 = test_no2.loc[:, predicts[0]]
test2 = add_constant(test2, has_constant="add")
#test2["const"] = np.repeat(1, len(test2.index))
no2preds = test2.columns

boston = pd.read_csv("final_boston.csv")
np.mean((models[0].predict(test2)-y_test_no2)**2)
linmodel = LinearRegression(fit_intercept=True)
linmodel.fit(test2,y_test_no2)
linmodel.score(test2, y_test_no2)

finaltest = boston[list(no2preds)[1:]]

finaltest = preprocessing.minmax_scale(finaltest)
#turns array back into dataframe
finaltest = pd.DataFrame(finaltest)
finaltest = add_constant(finaltest, has_constant="add")

boston_aic_preds = models[0].predict(finaltest)/100
def parse_str(str_edit):
#   print str_edit
    str_edit = str_edit.replace('[', '')
    str_edit = str_edit.replace('(', '')
    str_edit = str_edit.replace(']', '')
    str_edit = str_edit.replace(')', '')
    str_edit = str_edit.replace(',', '')
    list = str_edit.split(' ')
#   print list
    return [[np.float64(list[0]), np.float64(list[1])], [np.float64(list[2]), np.float64(list[3])],
      [np.float64(list[4]), np.float64(list[5])], [np.float64(list[6]), np.float64(list[7])]]

loc = []
for i in boston["Site"]:
    loc.append(parse_str(i))
lats, longs = [], []
for i in np.arange(len(loc)):
    lats.append((loc[i][0][0], loc[i][0][1], boston_aic_preds[i]))
    #longs.append(i[0][1])
w = pd.DataFrame(pd.Series(lats))
w.to_csv("boston_no2_LUR_preds.csv", index=False)
w.to_json("boston_no2_LUR_preds.json")
boston_aic_preds

array([ 0.0187404 ,  0.01498904,  0.01132031, ...,  0.00540185,
        0.00540185,  0.00540185])

In [28]:
# relevant for 
best_models = []
for i in range(len(models)):
    #find all models where the average pvalue is less than 0.09
    best_models.append(models[i].aic)
m = min(best_models)
#finds minimum aic among each model
#gets the index of every instance where the minimum AIC occurs
index_of_model = best_models.index(m)
models[0].summary()

#test2.columns

0,1,2,3
Dep. Variable:,y,R-squared:,0.613
Model:,OLS,Adj. R-squared:,0.608
Method:,Least Squares,F-statistic:,121.2
Date:,"Fri, 04 Aug 2017",Prob (F-statistic):,1.4500000000000002e-194
Time:,13:43:15,Log-Likelihood:,-1105.9
No. Observations:,1009,AIC:,2240.0
Df Residuals:,995,BIC:,2309.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1694,0.154,1.099,0.272,-0.133 0.472
x1,-1.0584,0.372,-2.845,0.005,-1.788 -0.328
x2,-1.6311,0.380,-4.294,0.000,-2.376 -0.886
x3,0.3065,0.517,0.593,0.554,-0.708 1.321
x4,-1.5820,0.737,-2.146,0.032,-3.029 -0.135
x5,-1.7591,0.672,-2.618,0.009,-3.078 -0.440
x6,-0.5411,0.272,-1.990,0.047,-1.075 -0.007
x7,0.6120,0.146,4.187,0.000,0.325 0.899
x8,2.8961,0.299,9.696,0.000,2.310 3.482

0,1,2,3
Omnibus:,301.407,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1495.116
Skew:,1.293,Prob(JB):,0.0
Kurtosis:,8.373,Cond. No.,39.6
