In [49]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split
from statsmodels.tools import add_constant
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import statsmodels.api as sm
import pickle
import os
from sklearn import preprocessing
sns.set(style="ticks")
%matplotlib inline

predictors = ['forest',
       'open_land', 'water', 'wetland', 'transitional',
       'urban_public_institution', 'commercial', 'transportation', 'crop_land',
       'medium_density_residential', 'industrial',
       'outdoor_temperature', 'solar_radiation', 
       'wind_speed_resultant']

In [50]:
pm = pd.read_csv("merged_pmweather.csv")
boston = pd.read_csv("final_boston.csv")
'''Rename columns for a consistent naming format'''
boston
def rename_again(df):
    df=df.rename(columns = {'Saltwater Sandy Beach':'saltwater_sandybeach'})
    df=df.rename(columns = {'Golf Course':'golfcourse'})
    df=df.rename(columns = {'urban_public/institutional':'urban_public_institution'})
    return df
boston=rename_again(boston)
#del boston["Cropland"]
"""Delete Variables that do not contribute to ppm"""
#Deletions only need to be made for PM
#del boston["saltwater_sandybeach"]
#del boston["recreational"]
#del boston["marina"]
#del boston["mining"]
#del boston["waste"]
#del boston["cemetary"]
#del boston["golfcourse"]
#del boston["high_density_residential"]
#del boston["low_density_residential"]

'Delete Variables that do not contribute to ppm'

In [51]:
boston.to_csv("final_boston.csv", index=False)
boston=pd.read_csv("final_boston.csv")
boston

Unnamed: 0,Site,commercial,crop_land,forest,industrial,medium_density_residential,open_land,transitional,transportation,urban_public_institution,water,wetland,outdoor_temperature,wind_speed_resultant,solar_radiation
0,"[(-71.201970000000003, 42.291440999999999), (-...",0.96,0.00,0.04,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,23.776190,8.497024,1.059524
1,"[(-71.197266400000004, 42.291440999999999), (-...",0.23,0.00,0.19,0.06,0.00,0.00,0.00,0.00,0.32,0.00,0.00,23.776190,8.497024,1.059524
2,"[(-71.192562800000005, 42.291440999999999), (-...",0.00,0.00,0.08,0.00,0.00,0.00,0.00,0.00,0.09,0.00,0.00,23.776190,8.497024,1.059524
3,"[(-71.187859200000005, 42.291440999999999), (-...",0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.04,0.00,0.00,23.776190,8.497024,1.059524
4,"[(-71.183155600000006, 42.291440999999999), (-...",0.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,23.776190,8.497024,1.059524
5,"[(-71.178452000000007, 42.291440999999999), (-...",0.00,0.00,0.04,0.00,0.00,0.00,0.00,0.00,0.03,0.00,0.00,23.776190,8.497024,1.059524
6,"[(-71.173748400000008, 42.291440999999999), (-...",0.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,23.776190,8.497024,1.059524
7,"[(-71.169044800000009, 42.291440999999999), (-...",0.00,0.00,0.07,0.00,0.00,0.00,0.00,0.00,0.02,0.00,0.00,23.776190,8.497024,1.059524
8,"[(-71.16434120000001, 42.291440999999999), (-7...",0.04,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,23.776190,8.497024,1.059524
9,"[(-71.159637599999996, 42.291440999999999), (-...",0.00,0.00,0.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,0.00,23.776190,8.497024,1.059524


In [52]:
"""Multiple linear Regression"""
def reg_m(y, x):
    model = sm.OLS(y, x.astype(float)).fit()
    #fits simple ordinary least squares model
    predictions = model.predict(x)
    #makes predictions for y based on x
    return(model.summary())

In [53]:
#scale data
del pm["Latitude"]
del pm["Longitude"]
#removes columns that should not be scaled (site keys, ppm, for example)
pm_scaled = pm.iloc[:, 3:]
#scales rest of predictors, which turns it into an array
pm_scaled = preprocessing.minmax_scale(pm_scaled)
#turns array back into dataframe
pm_scaled = pd.DataFrame(pm_scaled)
#grabs the columns I removed
add = pm.iloc[:, :3]
#adds them back to the scaled data
pm_scaled = add.join(pm_scaled)
#fix column names
cols = list(pm.columns)
#adds columns back
pm_scaled.columns = cols

#splits train, test sets
train, test = train_test_split(pm_scaled, test_size=.30, random_state=0)
#Scales y data (ppm values)
y_train = train['ppm'].values * 100
y_test = test['ppm'].values * 100
#removes site keys, ppm, etc from test df
test = test.iloc[:, 3:]

In [54]:
'''Method that takes in a list of all the predictors left in the model, and the model, 
spits out the lowest p valued predictor and lowest p value'''
def find_low_p(all_preds, model_OLS):
    #finds max p value, excluding the first column of constants
    maxp = max(model_OLS.pvalues[1:])  
    #gets index where max p value exists
    index = list(model_OLS.pvalues[1:]).index(maxp)
    #gets corresponding predictor
    lowestlu = all_preds[index]

    return maxp , lowestlu

In [55]:
"""Input: y data set, list of predictor variables
Action: X data set formed, runs a MLR through data set, computes pvalue, deletes predictor with max pvalue
returns: models - a linear regression object for each iteration and indices-the index of predictors to remove from the 
whole predictors list
"""
def backwards_var_sel(y, all_predictors, train):
    models = [] #to hold all the models
    x = train[all_predictors]
    x = add_constant(x, has_constant="add")
    indices = []
    copy = all_predictors

    for i in range(len(copy)):
        #fit a MLR to data set
        OLS_model = sm.OLS(y, x).fit()

        #finds lowest pvalue and index of lowest pvalue, excluding constant term

        maxp, lowestlu = find_low_p(all_predictors, OLS_model)
        #removes variable with lowest p value
        index_td = copy.index(lowestlu)
        indices.append(index_td)
        index = all_predictors.index(lowestlu)
        all_predictors = all_predictors[:index] + all_predictors[index+1 :]
        
        #updates list of x values, not including variable with lowest pvalue
        x = train[all_predictors]
        x = add_constant(x, has_constant="add")
        #save new model in a list
        models.append(OLS_model)
        
        
    return models, indices



In [56]:
models, index_ls = backwards_var_sel(y_train, predictors, train)

In [57]:
#relevant for 
best_models = []
index_of_model = []
for i in range(len(models)):
    #find all models where the average pvalue is less than 0.09
    if np.average(models[i].pvalues[1:]) < .05:
        #append model to the list best models
        best_models.append(models[i])
        #append the index of the model to the list index_of_model
        index_of_model.append(i)
best_models[0].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.211
Model:,OLS,Adj. R-squared:,0.21
Method:,Least Squares,F-statistic:,154.6
Date:,"Fri, 04 Aug 2017",Prob (F-statistic):,8.439999999999999e-89
Time:,13:57:04,Log-Likelihood:,-4443.6
No. Observations:,1739,AIC:,8895.0
Df Residuals:,1735,BIC:,8917.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.7421,0.164,4.537,0.000,0.421 1.063
open_land,1.8031,0.543,3.323,0.001,0.739 2.867
crop_land,0.7448,0.365,2.041,0.041,0.029 1.461
outdoor_temperature,4.9128,0.246,19.939,0.000,4.430 5.396

0,1,2,3
Omnibus:,1216.536,Durbin-Watson:,2.033
Prob(Omnibus):,0.0,Jarque-Bera (JB):,77075.142
Skew:,2.607,Prob(JB):,0.0
Kurtosis:,35.195,Cond. No.,8.64


In [58]:
#must subtract 1. index ls tells what column to remove, but models list includes the model with no column removed at all
model2_pos = index_of_model[0]
#gets index of predictors to remove
new_index_list = index_ls[:model2_pos]
#make new test set
test1 = test.drop(test.columns[new_index_list], axis=1)
test1 = add_constant(test1)
np.mean((best_models[0].predict(test1)-y_test)**2)
finaltest = boston[['open_land', 'crop_land', 'outdoor_temperature']]
finaltest = preprocessing.minmax_scale(finaltest)
#turns array back into dataframe
finaltest = pd.DataFrame(finaltest)
finaltest = add_constant(finaltest, has_constant="add")
#test1.columns#best_models[0].summary()
#linmodel = LinearRegression(fit_intercept=True)
#linmodel.fit(test1,y_test)
final_pm_predictions = best_models[0].predict(finaltest)/100

In [59]:

"""Input: y data set, list of predictor variables
Action: X data set formed, runs a MLR through data set, computes AIC, deletes predictor with maximum AIC
returns: models - list of remaining predictors after each iteration
"""
def backwards_var_sel_aic(y, all_predictors, train):
    preds = [] #to hold all the list of predictors
    copy = all_predictors #keeps copy of full predictors list #holds AIC values for each iteration
    models = []
    indices = []
    for j in range(len(copy)):
        t = all_predictors
        removed = []
        AIC = []
        for i in range(len(t)):
            #removes one predictor in each iteration
            predictors = all_predictors[:i] + all_predictors[i+1 :]
            #print(predictors)
            #train and fit a model on those predictors
            x = train[predictors].values
            x = add_constant(x, has_constant = "add")
            OLS_model = sm.OLS(y, x).fit()

            #append AIC value to AIC list for each iteration
            AIC.append(OLS_model.bic)
        #finds the largest AIC and the index at which it exists
        worstAIC = max(AIC)
        index = AIC.index(max(AIC))
        worstLU = all_predictors[index]
        indices.append(copy.index(worstLU))
        #reassign all_predictors to a new list with the predictor corresponding to the worst AIC removed
        all_predictors = all_predictors[:index] + all_predictors[index+1 :]
        #append a list of the remaining predictors to the models list
        preds.append(all_predictors)
        models.append(OLS_model)
        
    return models, indices, preds

models, indices, predicts = backwards_var_sel_aic(y_train, predictors, train)

In [60]:
#relevant for 
best_models = []
for i in range(len(models)):
    #find all models where the average pvalue is less than 0.09
    best_models.append(models[i].aic)
m = min(best_models)
#finds minimum aic among each model
#gets the index of every instance where the minimum AIC occurs
index_of_model = best_models.index(m)
models[0].summary()
test2 = test.loc[:, predicts[0]]
test2 = add_constant(test2, has_constant="add")
np.mean((models[0].predict(test2)-y_test)**2)
finaltest = boston[['forest', 'open_land', 'water', 'wetland', 'transitional',
       'urban_public_institution', 'commercial', 'transportation', 'crop_land',
       'medium_density_residential', 'industrial', 'solar_radiation',
       'wind_speed_resultant']]
finaltest = preprocessing.minmax_scale(finaltest)
#turns array back into dataframe
finaltest = pd.DataFrame(finaltest)
finaltest = add_constant(finaltest, has_constant="add")
#test1.columns#best_models[0].summary()
#linmodel = LinearRegression(fit_intercept=True)
#linmodel.fit(test1,y_test)
boston_aic_preds = models[0].predict(finaltest)/100
#this is just constant model
#get MSE
#np.mean((np.repeat(8.6342, len(y_test))-y_test)**2)
#linmodel = LinearRegression(fit_intercept=True)
#linmodel.fit(test2,y_test)
#linmodel.score(test2, y_test)
boston["Site"][0]

'[(-71.201970000000003, 42.291440999999999), (-71.197266400000004, 42.291440999999999), (-71.197266400000004, 42.29402374), (-71.201970000000003, 42.29402374)]'

In [71]:
def parse_str(str_edit):
#   print str_edit
    str_edit = str_edit.replace('[', '')
    str_edit = str_edit.replace('(', '')
    str_edit = str_edit.replace(']', '')
    str_edit = str_edit.replace(')', '')
    str_edit = str_edit.replace(',', '')
    list = str_edit.split(' ')
#   print list
    return [[np.float64(list[0]), np.float64(list[1])], [np.float64(list[2]), np.float64(list[3])],
      [np.float64(list[4]), np.float64(list[5])], [np.float64(list[6]), np.float64(list[7])]]

loc = []
for i in boston["Site"]:
    loc.append(parse_str(i))
lats, longs = [], []
for i in np.arange(len(loc)):
    lats.append((loc[i][0][0], loc[i][0][1], boston_aic_preds[i]))
    #longs.append(i[0][1])
w = pd.DataFrame(pd.Series(lats))
w.to_csv("boston_pm_LUR_preds.csv", index=False)

In [72]:
boston_aic_preds

array([ 0.00149578,  0.00409976,  0.00624479, ...,  0.02231452,
        0.02231452,  0.02231452])

In [None]:
## random boston gridding stuff, can ignore for now

x = [ -71.20197, -70.96679]
y = [42.291441, 42.420578]
x_cell = 50
y_cell = 50


x_min = -71.20197 #x corresponds to longitude
x_max = -70.96679
y_min = 42.291441 #y corresponds to latitude
y_max = 42.420578

#Create ticks
x_s = np.linspace(x_min, x_max, x_cell + 1)
y_s = np.linspace(y_min, y_max, y_cell + 1)

#Create grid
x_coord, y_coord = np.meshgrid(x_s, y_s)


'''Function Description: given the x and y coordinates of a grid, turns each grid element into a polygon. I will move from left to right, bottom to top

Input: x-coords, y-coords is a list of lists of x and y coordinates for the grid

Output: a list of polygons, one for each grid '''

def make_polygon(x_coords, y_coords):
    gr = []
    for b in range(0, len(y_coords)-1, 1):
        y = tuple([y[0] for y in y_coords[b:b+2]])
        #grabs each x-coordinate pair for each grid element
        for a in range(0, len(x_coords[0])-1, 1):
            grid = []
            x = x_coords[0][a:a+2]
            #initiate list to store points for each grid element
            #for each x-coordinate pair, get all corresponding y-coordinates
            for j in range(0, len(x), 1):
                grid.append((x[j], y[0]))
                grid.append((x[j], y[1]))
            g = [grid[0]] + grid[2:4] + [grid[1]] 
            gr.append(g)
                #append grid points of polygon into the grid list 
        #turn grid into shapely polygon and append to polygon list
    return gr
gr = make_polygon(x_coord, y_coord)

In [12]:
boston = pd.read_csv("boston_props_table.csv")
boston.insert(0, "Site", gr)
boston.to_csv("boston_site_LU.csv", encoding = "utf-8")
#all_predictors = ["forest","commercial","transportation","medium_density_residential","industrial"]
#b=boston.loc[:, ["forest","commercial","transportation","medium_density_residential","industrial"]]
#est_models[0].predict(b.iloc[:, 1:])
#OLS_model = sm.OLS(y, x).fit()

#pm_boston = np.repeat(1.8039, len(b.index)) -0.7702*b.iloc[:,0]-0.9776*b.iloc[:,1]-0.6511*b.iloc[:,2]-1.0344*b.iloc[:, 3] -1.5905*b.iloc[:,4]
#pd.DataFrame(data={})
#bleh = pd.DataFrame(data={"Grids":gr,"ppm*100":pm_boston})
#bleh.to_csv("boston_preds.csv", encoding = "utf-8")