In [1]:
import pandas as pd
import numpy as np
import sys
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any pandas deprecation or copy warnings that
# the cells below may emit, so problems there will not be visible.
warnings.filterwarnings('ignore')

In [2]:
# Load the California ozone table from the compressed CSV. index_col=None means
# no column is used as the index, so rows get the default integer index.
ozone_csv_path = "../../data/cal_o3.csv.gz"
california_ozone = pd.read_csv(ozone_csv_path, index_col=None)

In [3]:
# Keep only observations whose local time lies in the 10:00-18:00 window, both
# ends included. 'Time Local' is compared as text, which assumes zero-padded
# "HH:MM" strings -- TODO confirm against the raw file.
in_time_window = california_ozone['Time Local'].between('10:00', '18:00')
cadf = california_ozone[in_time_window]

In [4]:
# Build one key per row that identifies the monitoring site from its county
# code and site number. The two parts are joined with a separator because plain
# concatenation is ambiguous: (county 1, site 11) and (county 11, site 1) would
# both become "111" and be merged by the later groupby on 'ID'.
county_codes = cadf['County Code'].values
site_code = cadf['Site Num'].values
site_id = [f"{county}-{site}" for county, site in zip(county_codes, site_code)]

In [5]:
# Attach the per-row site key as a new 'ID' column (returns a new frame rather
# than writing into the filtered one).
cadf = cadf.assign(ID=site_id)

In [6]:
# Inspect the column labels; 'ID' should now appear alongside the raw columns.
cadf.columns

Index(['State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC',
       'Latitude', 'Longitude', 'Datum', 'Parameter Name', 'Date Local',
       'Time Local', 'Date GMT', 'Time GMT', 'Sample Measurement',
       'Units of Measure', 'MDL', 'Uncertainty', 'Qualifier', 'Method Type',
       'Method Code', 'Method Name', 'State Name', 'County Name',
       'Date of Last Change', 'ID'],
      dtype='object')

In [7]:
# Per (site, day) flag: True when at least 8 non-null measurements fall inside
# the retained time window for that day.
readings_per_site_day = cadf.groupby(['ID', 'Date Local'])['Sample Measurement'].count()
mask = readings_per_site_day.ge(8)

In [8]:
# Average every numeric column per (site, day), then keep only the site-days
# flagged in `mask` (those with enough readings).
# numeric_only=True is required because the frame still carries text columns
# (e.g. 'Datum', 'Parameter Name'): without it, pandas >= 2.0 raises a
# TypeError, while older versions silently dropped those columns. Passing the
# flag gives the same result on both.
cadf = cadf.groupby(['ID', 'Date Local']).mean(numeric_only=True)[mask]

In [9]:
# Turn the ('ID', 'Date Local') group index back into ordinary columns.
cadf = cadf.reset_index()

In [10]:
# Narrow the table to the site key, the date, the coordinates and the ozone
# value; the remaining averaged columns are not used below.
keep_columns = ['ID', 'Date Local', 'Latitude', 'Longitude', 'Sample Measurement']
cadf = cadf[keep_columns]

In [11]:
# Restrict to 1990-05-01 .. 1990-09-30, both ends included. The dates are
# compared as text, which assumes ISO "YYYY-MM-DD" strings -- TODO confirm.
m1 = cadf['Date Local'].ge('1990-05-01')
m2 = cadf['Date Local'].le('1990-09-30')
mf = m1 & m2
cadf = cadf.loc[mf]

In [12]:
# Element-wise boolean frame (one row per site, one column per remaining
# column): True where the site has at least 115 non-null values in that column.
# The date window kept above spans 153 days, so 115 is roughly 75% coverage --
# presumably the intended completeness threshold; confirm.
m4 = cadf.groupby('ID').count().ge(115)

In [13]:
# Average the numeric columns per site over the retained days.
# numeric_only=True is required because 'Date Local' is text: without it,
# pandas >= 2.0 raises a TypeError, while older versions silently dropped the
# column. Passing the flag gives the same result on both.
# `m4` is an element-wise boolean frame, so indexing with it does not drop
# rows; it replaces the cells of under-covered sites with NaN. Those rows are
# removed by the dropna step that follows.
cadf = cadf.groupby('ID').mean(numeric_only=True)[m4]

In [14]:
# Discard every site row that contains a NaN in any column.
cadf = cadf.dropna()

In [15]:
# Keep an independent (deep) snapshot of the state-wide per-site table.
ca_df = cadf.copy(deep=True)

## Leave-one-out cross-validation over the Southern California sites.
## Southern California is defined here as Latitude <= 34.81.

In [16]:
# Southern California subset: sites at or below latitude 34.81.
is_southern = cadf['Latitude'].le(34.81)
socaldf = cadf.loc[is_southern]

In [17]:
# (rows, columns) of the Southern California subset; one row per site ID.
socaldf.shape

(73, 3)

In [21]:
# Feature matrix: one (longitude, latitude) row per site.
X = socaldf[['Longitude', 'Latitude']].to_numpy()
# Target kept as a 2-D column (n_sites, 1) and scaled by 1000 -- presumably a
# ppm -> ppb conversion; confirm against 'Units of Measure'.
y = socaldf[['Sample Measurement']].to_numpy() * 1000

In [22]:
from polire import Kriging, SpatialAverage
from polire import CustomInterpolator
from sklearn.neighbors import KNeighborsRegressor

In [25]:
# Leave-one-out cross-validation: each site is held out in turn, the three
# interpolators are fitted on the remaining sites, and the held-out site is
# predicted. Every prediction is stored as a length-1 array so each list can be
# passed directly as an (n_sites, 1) regressor input later.
kriging_predictions, knn_predictions, spatial_predictions = [], [], []
for ix in range(len(X)):
    # Training data = all rows except row `ix`; test data = row `ix` alone.
    X_train = np.delete(X, ix, axis=0)
    y_train = np.delete(y, ix, axis=0)
    X_test = X[ix:ix + 1]
    y_test = y[ix:ix + 1]

    # Ordinary interpolators expect a flat target vector.
    krig = Kriging(variogram_model='spherical', coordinate_type='Geographic')
    krig.fit(X_train, y_train.ravel())
    kriging_predictions.append(krig.predict(X_test).reshape(1))

    # Nearest neighbour: the held-out site takes the value of the closest site.
    knn = CustomInterpolator(KNeighborsRegressor(n_neighbors=1))
    knn.fit(X_train, y_train.ravel())
    knn_predictions.append(knn.predict(X_test).reshape(1))

    # NOTE(review): radius=16.1 is presumably kilometres (about 10 miles) --
    # confirm against the polire SpatialAverage documentation.
    spatial = SpatialAverage(coordinate_type='Geographic', radius=16.1)
    spatial.fit(X_train, y_train.ravel())
    spatial_predictions.append(spatial.predict(X_test).reshape(1))

In [26]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

## Kriging

In [27]:
# Regress the observed values on the held-out Kriging predictions and report
# the fitted slope together with the R^2 of that regression.
linear = LinearRegression().fit(kriging_predictions, y)
kriging_fitted = linear.predict(kriging_predictions)
linear.coef_, r2_score(y, kriging_fitted)

(array([[1.06633059]]), 0.7807285408889516)

## Spatial Averaging

In [28]:
# Some spatial-average predictions are NaN (presumably when no training site
# lies within the radius -- confirm), so only the non-NaN pairs are scored.
spatial_predictions = np.array(spatial_predictions)
mask = np.logical_not(np.isnan(spatial_predictions))
# Boolean indexing with the 2-D mask flattens both arrays to 1-D.
y_spt = y[mask]
spatial_predictions_1 = spatial_predictions[mask]
# Regress observed values on the predictions; report slope and R^2.
linear = LinearRegression().fit(spatial_predictions_1.reshape(-1, 1), y_spt)
linear.coef_, r2_score(y_spt, linear.predict(spatial_predictions_1.reshape(-1, 1)))

(array([0.96384508]), 0.7832038030560716)

## Nearest Neighbor

In [29]:
# Regress the observed values on the held-out nearest-neighbour predictions and
# report the fitted slope together with the R^2 of that regression.
linear = LinearRegression().fit(knn_predictions, y)
knn_fitted = linear.predict(knn_predictions)
linear.coef_, r2_score(y, knn_fitted)

(array([[0.80732258]]), 0.6783862462470494)