## Implements Gaussian on PM, NO, OZO dataframes that contain Landuse and Time

In [37]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn import mixture
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from statsmodels.tools import add_constant
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ConstantKernel as C
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, RationalQuadratic, DotProduct
import statsmodels.api as sm
sns.set(style="ticks")
import random
from matplotlib import pyplot as plt
import os
import pickle
%matplotlib inline

## Cleaning Boston PM Data Frame 

In [38]:
aq_df = pd.read_csv("appended_pm.csv")
df_w_cells = pd.read_csv("boston_site_LU.csv")

lat = df["Latitude"]
lon = df["Longitude"]

df['lat_lon'] = list(zip(lat, lon))
lat_lon_pts = df['lat_lon']
del df['lat_lon']

cells = df_cells["Site"]

AQ_width = df.shape[1]
LU_width = df_cells.shape[1]

## Functions that add Land Use columns to Data Frame 

In [39]:
def parse_str(str_edit):
    str_edit = str_edit.replace('[', '')
    str_edit = str_edit.replace('(', '')
    str_edit = str_edit.replace(']', '')
    str_edit = str_edit.replace(')', '')
    str_edit = str_edit.replace(',', '')
    list = str_edit.split(' ')
    return [(np.float64(list[0]), np.float64(list[1])), (np.float64(list[2]), np.float64(list[3])),
            (np.float64(list[4]), np.float64(list[5])), (np.float64(list[6]), np.float64(list[7]))]

In [40]:
def in_cell(lat, lon, cell):
    cell_points = parse_str(cell)
    lon_bool = lon >= cell_points[0][1] and lon <= cell_points[2][1]
    lat_bool = lat >= cell_points[0][0] and lat <= cell_points[2][0]
    if lat_bool and lon_bool:
        return True
    else:
        return False

In [41]:
def add_columns(df):
    df['Cropland'] = 0
    df['Golf Course'] = 0
    df['Saltwater Sandy Beach'] = 0
    df['cemetary'] = 0
    df['commercial'] = 0
    df['crop_land'] = 0
    df['forest'] = 0
    df['high_density_residential'] = 0
    df['industrial'] = 0
    df['low_density_residential'] = 0
    df['marina'] = 0
    df['medium_density_residential'] = 0
    df['mining'] = 0
    df['open_land'] = 0
    df['recreational'] = 0
    df['transitional'] = 0
    df['transportation'] = 0
    df['urban_public/institutional'] = 0
    df['waste'] = 0
    df['water'] = 0
    df['wetland'] = 0
    return df

In [42]:
def update_proportions(AQ_df, LU_df, AQ_index, LU_index):
    #AQ_df is the Boston AQ hourly
    #LU_df contains the cells and the landuse
    #AQ_index is the row of the cell that contains the point in the Boston AQ hourly df
    #LU_index is the row of the cell that contains the point in the cell and landuse df
    for col in range(0, LU_width - 2):#df.loc[i, 'forest'] = df.loc[i, 'forest'] + my_dict['forest']
#         print (AQ_index, AQ_width + col, LU_index, col + 1), "\n"
#         print AQ_df.iloc[AQ_index, col + AQ_width], "LU:", LU_df.iloc[LU_index, col+2]
        AQ_df.iloc[AQ_index, col + AQ_width] = LU_df.iloc[LU_index, col + 2]
    return AQ_df

In [44]:
def update_df_w_landuse(lat_lon_pts, cells, aq_df, df_w_cells):
    for pt in range(len(lat_lon_pts)): # pt is row for df
        for cell in range(len(cells)): #cell is row for df_cells
            if in_cell(lat_lon_pts[pt][1], lat_lon_pts[pt][0], df_w_cells["Site"][cell]):
                update_proportions(aq_df, df_w_cells, pt, cell)
    return aq_df

In [45]:
aq_df = add_columns(aq_df)
aq_df = update_df_w_landuse(lat_lon_pts, cells, aq_df, df_w_cells)

## Pickles Boston PM data frame with land use columns

In [51]:
def pickle_df(df):
    df.to_pickle("boston_aq_w_landuse.csv")
    df2 = pickle.load(open('boston_aq_w_landuse.csv', 'rb'))
    return df2

In [52]:
aq_df2 = pickle_df(aq_df)

## Gaussian function and plotting function

In [16]:
""" Creates Gaussian model, computs r squarred, MSE and log liklihood.
Given x_train, y_train, x_test, and y_test data, and numerical alpha value.
Prints train and test R^2. """

def gaussian(x_train, y_train, x_test, y_test, alpha):
    
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    kern = RBF(length_scale = 1)

    gp = GaussianProcessRegressor(kernel=kern, alpha=alpha, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=1, normalize_y=False, copy_X_train=False, random_state=None)
    gp.fit(x_train, y_train.reshape(train_size, 1))

    y_train_pred, sigma_train = gp.predict(x_train, return_std=True)
    y_test_pred, sigma_test = gp.predict(x_test, return_std=True)

    # get R^2
    r2 = gp.score(x_train, y_train.reshape(train_size, 1))
    r2_t = gp.score(x_test, y_test.reshape(test_size, 1))

    # get MSE measurements
    MSE_test = np.mean((y_test_pred - y_test.reshape(test_size, 1))**2)
    MSE_train = np.mean((y_train_pred - y_train.reshape(train_size, 1))**2)

    # get log likelihood
    t=gp.log_marginal_likelihood()

    # calculate AIC
    AIC = 2*len(x_test) - 2*np.log(-t)
    
    # print R^2 values
    print('mean squared error of train data with model = ' + str(MSE_train))
    print('mean squared error of test data with model = ' + str(MSE_test))
    print('Akaike information criterion = ' + str(AIC))
    print('log likelihood of model = ' + str(t))
    print('training R^2 value = ' + str(r2))
    print('testing R^2 value = ' + str(r2_t)), "\n \n"
    
    return y_train_pred, y_test_pred, gp

In [17]:
"""Plots predicted y values for testing set"""
def plot_predictions(x_test, y_test, y_test_pred, x_label, y_label):
#     print x_test.shape, y_test_pred.shape
    plt.figure(figsize=(13,8))
    plt.plot(x_test, y_test, ".", color="r",)
    plt.scatter(x_test, y_test_pred)
#     plt.xlabel("Time Occurred")
#     plt.title("Predicted Boston CO2 level")
#     plt.show()

## Splits DF into train and test sets

In [53]:
def split_train_test(df):
    train, test = train_test_split(df)
    x_train = train[col_names].values
    y_train = train["Measure Value"]
    x_test = test[col_names].values
    y_test = test["Measure Value"]
    return x_train, x_test, y_train, y_test

## Running Gaussian on Boston PM

In [56]:
pollutants = ['CO', 'SO', 'NO', 'OZO', 'PM']

col_names = ['Cropland', 'Golf Course',
       'Saltwater Sandy Beach', 'cemetary', 'commercial', 'crop_land',
       'forest', 'high_density_residential', 'industrial',
       'low_density_residential', 'marina', 'medium_density_residential',
       'mining', 'open_land', 'recreational', 'transitional', 'transportation',
       'urban_public/institutional', 'waste', 'water', 'wetland', 'Sample Collection Start Time']

x_label = 'Time'
y_label = 'Pollutant'

alpha = 0.0008

x_train, x_test, y_train, y_test = split_train_test(aq_df2)
y_train_pred, y_test_pred, gp = gaussian(x_train, y_train, x_test, y_test, alpha)

  app.launch_new_instance()


mean squared error of train data with model = 0.00121528233664
mean squared error of test data with model = 0.00226526227097
Akaike information criterion = nan
log likelihood of model = 520.032418602
training R^2 value = 0.727308972494
testing R^2 value = 0.54956932534 
 





## NO

In [61]:
no_df = pd.read_csv("appendedNO.csv")

no_lat = no_df["Latitude"]
no_lon = no_df["Longitude"]

no_df['lat_lon'] = list(zip(no_lat, no_lon))
no_lat_lon_pts = no_df['lat_lon']
del no_df['lat_lon']

NO_width = no_df.shape[1]
LU_width = df_cells.shape[1]
no_df = add_columns(no_df)
no_df = update_df_w_landuse(no_lat_lon_pts, cells, no_df, df_w_cells)

In [63]:
no_df2 = pickle_df(no_df)

In [68]:
alpha = 0.0009

x_train, x_test, y_train, y_test = split_train_test(no_df2)
y_train_pred, y_test_pred, gp = gaussian(x_train, y_train, x_test, y_test, alpha)

  app.launch_new_instance()


mean squared error of train data with model = 3.25633017e-05
mean squared error of test data with model = 2.58392107433e-05
Akaike information criterion = nan
log likelihood of model = 2170.78950866
training R^2 value = 0.0175989462569
testing R^2 value = 0.0192736529715 
 





## OZO Data frame

In [70]:
ozo_df = pd.read_csv("appendedNO.csv")

ozo_lat = no_df["Latitude"]
ozo_lon = no_df["Longitude"]

ozo_df['lat_lon'] = list(zip(ozo_lat, ozo_lon))
ozo_lat_lon_pts = ozo_df['lat_lon']
del ozo_df['lat_lon']

OZO_width = no_df.shape[1]
LU_width = df_cells.shape[1]
ozo_df = add_columns(ozo_df)
ozo_df = update_df_w_landuse(ozo_lat_lon_pts, cells, ozo_df, df_w_cells)

In [72]:
ozo_df2 = pickle_df(ozo_df)

In [73]:
alpha = 0.0009

x_train, x_test, y_train, y_test = split_train_test(ozo_df2)
y_train_pred, y_test_pred, gp = gaussian(x_train, y_train, x_test, y_test, alpha)

  app.launch_new_instance()


mean squared error of train data with model = 3.13437658171e-05
mean squared error of test data with model = 2.96779368081e-05
Akaike information criterion = nan
log likelihood of model = 2171.36993931
training R^2 value = 0.0104307214472
testing R^2 value = 0.0352364905692 
 



