### Package Imports

In [1]:
# Start the wall-clock timer before the heavier imports so that the final
# "Done in ... minutes" report at the end of the notebook includes import time.
from time import time 
init = time()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation and runtime warnings raised by the cells below.
warnings.filterwarnings("ignore")

### Data loading and filtering

In [2]:
# Load the California O3 measurement table.
cadf = pd.read_csv("../../data/cal_o3.csv.gz", index_col=None)

# Keep readings taken between 10:00 and 18:00 local time (both ends inclusive).
# The comparison is done on text, which assumes 'Time Local' holds zero-padded
# 'HH:MM' strings -- TODO confirm against the raw file.
# .copy() detaches the result from the frame returned by read_csv so that the
# column assignment below writes to an independent frame.
in_window = (cadf['Time Local'] >= '10:00') & (cadf['Time Local'] <= '18:00')
cadf = cadf[in_window].copy()

# Per-monitor key built from county code and site number. The '-' separator
# keeps the key unambiguous: without it, county 1 / site 11 and
# county 11 / site 1 would both collapse to "111" and be merged by the
# groupby calls further down.
cadf['ID'] = cadf['County Code'].apply(str) + '-' + cadf['Site Num'].apply(str)

In [3]:
# Display the column labels of the filtered frame (includes the derived 'ID' column).
cadf.columns

Index(['State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC',
       'Latitude', 'Longitude', 'Datum', 'Parameter Name', 'Date Local',
       'Time Local', 'Date GMT', 'Time GMT', 'Sample Measurement',
       'Units of Measure', 'MDL', 'Uncertainty', 'Qualifier', 'Method Type',
       'Method Code', 'Method Name', 'State Name', 'County Name',
       'Date of Last Change', 'ID'],
      dtype='object')

### Preprocessing

In [4]:
# Only these numeric columns are aggregated. Averaging the whole frame would
# also hit text columns (e.g. 'Datum', 'Method Name', 'Date Local'), which
# pandas >= 2.0 rejects with a TypeError instead of silently dropping them.
value_cols = ['Latitude', 'Longitude', 'Sample Measurement']

# Step 1: one row per (site, day) holding the mean of that day's readings,
# kept only when the day has at least 8 non-null measurements.
site_day = cadf.groupby(['ID', 'Date Local'])
has_enough_readings = site_day['Sample Measurement'].count() >= 8
cadf = site_day[value_cols].mean()[has_enough_readings].reset_index()

cadf = cadf[['ID', 'Date Local'] + value_cols]

# Step 2: restrict to 1 May - 30 September 1990. 'Date Local' is compared as
# text, which is order-preserving for ISO 'YYYY-MM-DD' strings.
cadf = cadf[(cadf['Date Local'] >= '1990-05-01') & (cadf['Date Local'] <= '1990-09-30')]

# Step 3: one row per site (indexed by ID) with the mean over its retained
# days, kept only when the site has at least 115 such days. The element-wise
# mask blanks out sites that fall short and dropna() then removes them.
per_site = cadf.groupby('ID')[value_cols]
mask = per_site.count() >= 115
cadf = per_site.mean()[mask].dropna()

## Summary Statistics for Kriging Interpolation in California

In [5]:
def summary_stats(x):
    """Summarise a set of predictions by its mean and quartiles.

    Parameters
    ----------
    x : numpy.ndarray
        Values to summarise. Must expose a ``.mean()`` method.

    Returns
    -------
    tuple
        ``(mean, 25th percentile, 50th percentile, 75th percentile)``.
    """
    lower_q, median, upper_q = np.percentile(x, [25, 50, 75])
    return x.mean(), lower_q, median, upper_q

# Empty results table: one column per statistic returned by summary_stats;
# rows are added per interpolation method via res_df.loc[...] in later cells.
stat_columns = ['mean', '25%', '50%', '75%']
res_df = pd.DataFrame(columns=stat_columns)

In [6]:
import geopandas as gpd

# Block-group shapefile for California (presumably the 2016 TIGER/Line release,
# going by the 'tl_2016_06_bg' file name -- confirm with the data source).
block_group_shp = "../../data/block_groups/california/tl_2016_06_bg.shp"
g_df = gpd.read_file(block_group_shp)

In [7]:
# Latitude extent of the block groups as (max, min). INTPTLAT is stored as
# text (e.g. '+41.9466483'), so max/min here compare strings, not numbers.
intpt_lat = g_df['INTPTLAT']
(intpt_lat.max(), intpt_lat.min())

('+41.9466483', '+32.5447420')

In [8]:
# One centroid per distinct block-group geometry, stored as an (n, 2) float
# array of [x, y] = [longitude, latitude].
# The coordinates are read straight from each centroid Point instead of being
# sliced out of its WKT text: that avoids depending on the exact
# "POINT (x y)" layout and on the precision of the text rendering.
shapes = g_df['geometry'].unique()
centroid_points = [shape.centroid for shape in shapes]
centroids = np.array([[point.x, point.y] for point in centroid_points])

In [9]:
# Column-wise bounds of the centroid array: ([min x, min y], [max x, max y]).
(np.min(centroids, axis=0), np.max(centroids, axis=0))

(array([-124.27033744,   32.54474204]), array([-114.29862718,   41.94353816]))

In [10]:
# Training inputs: monitor coordinates as [longitude, latitude] rows.
station_coords = cadf[['Longitude', 'Latitude']]
X = station_coords.values

# Targets as a single-column 2-D array, scaled by 1000
# (presumably ppm -> ppb; confirm against 'Units of Measure').
station_o3 = cadf[['Sample Measurement']]
y = station_o3.values * 1000

# Show (number of monitors, 2) as a sanity check.
X.shape

(154, 2)

In [11]:
from polire import Kriging

# As the original note on this cell says, PyKrige takes longitudes in the
# 0-360 range. Wrap BOTH the training and the prediction coordinates so they
# share one convention; previously only the prediction points were shifted
# while the monitors kept their negative longitudes.
X_krig = X.copy()
X_krig[:, 0] = X_krig[:, 0] % 360
X_test_krig = centroids.copy()
X_test_krig[:, 0] = X_test_krig[:, 0] % 360

# Spherical-variogram kriging on geographic (lon/lat) coordinates.
# y is a single-column 2-D array here, so flatten it to 1-D for fitting.
krig = Kriging(variogram_model='spherical', coordinate_type='Geographic')
krig.fit(X_krig, y.ravel())

krig_predictions = krig.predict(X_test_krig)

res_df.loc['Kriging', :] = summary_stats(krig_predictions)

## IDW, $k$-NN, and Spatial Averaging (SA) for the USA

In [12]:
# USA-wide O3 table; rows with any missing value are discarded.
usa_o3_raw = pd.read_csv("../../data/usa_o3.csv.gz")
df = usa_o3_raw.dropna()

# Peek at the first two rows.
df.head(2)

Unnamed: 0,Latitude,Longitude,Sample Measurement
0,37.858932,-87.575291,0.040264
1,40.049604,-75.241209,0.046745


In [13]:
# NOTE: this rebinds `centroids`, replacing the California array built earlier
# with the precomputed USA-wide centroids used by the remaining cells.
usa_centroids_path = "../../data/block_groups/usa_centroids.npy"
centroids = np.load(usa_centroids_path)

In [14]:
# Rebind X / y to the USA data: [longitude, latitude] rows and a 1-D target
# scaled by 1000 (same scaling as the California cell).
usa_coords = df[['Longitude', 'Latitude']]
X = usa_coords.values
usa_o3 = df['Sample Measurement']
y = usa_o3.values * 1000

## Spatial Averaging

In [15]:
from polire import SpatialAverage

# NOTE(review): radius=16.1 is presumably kilometres (about 10 miles) --
# confirm the unit polire expects for geographic coordinates.
spatial = SpatialAverage(
    radius=16.1,
    coordinate_type='Geographic',
)
spatial.fit(X, y)

SpatialAverage

In [16]:
# Predict at every centroid, then drop NaN predictions before summarising
# (presumably centroids with no monitor inside the radius -- confirm).
spatial_raw = spatial.predict(centroids)
has_value = ~np.isnan(spatial_raw)
spatial_predictions = spatial_raw[has_value]

spatial_summary = summary_stats(spatial_predictions)
res_df.loc['Spatial Average', :] = spatial_summary

## IDW

In [17]:
from polire import IDW

# Library defaults are used for everything except the coordinate type.
idw_options = {'coordinate_type': 'Geographic'}
idw = IDW(**idw_options)
idw.fit(X, y)

IDW

In [18]:
# Predict at every centroid and record the IDW row in the results table.
idw_predictions = idw.predict(centroids)
idw_summary = summary_stats(idw_predictions)
res_df.loc['IDW', :] = idw_summary

### $k$-NN, with $k = 1$.

In [19]:
from polire import CustomInterpolator
from sklearn.neighbors import KNeighborsRegressor

# Single-nearest-neighbour regressor (k = 1) wrapped in polire's
# CustomInterpolator so it is fitted and queried like the other methods.
nearest_neighbour = KNeighborsRegressor(n_neighbors=1)
knn = CustomInterpolator(nearest_neighbour)
knn.fit(X, y)

knn_predictions = knn.predict(centroids)
knn_summary = summary_stats(knn_predictions)
res_df.loc['KNN', :] = knn_summary

## Results

In [20]:
# Emit the results table as Markdown text so it can be pasted elsewhere.
results_markdown = res_df.to_markdown()
print(results_markdown)

|                 |    mean |     25% |     50% |     75% |
|:----------------|--------:|--------:|--------:|--------:|
| Kriging         | 51.1912 | 42.3414 | 52.5939 | 60.1207 |
| Spatial Average | 45.1846 | 39.9437 | 44.3338 | 49.2456 |
| IDW             | 45.7411 | 42.2394 | 45.5186 | 49.0094 |
| KNN             | 45.228  | 39.0644 | 45.1592 | 50.2772 |


In [21]:
# Wall-clock time since `init` was set in the first cell, reported in minutes.
elapsed_minutes = (time() - init) / 60
print('Done in', elapsed_minutes, 'minutes')

Done in 0.6720613718032837 minutes
