In [9]:
import numpy as np
from netCDF4 import Dataset
import cartopy
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import wrf
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
# Open the training dataset read-only; the handle `ncf` is what the following
# cells pull variables from, so it is intentionally left open.
data_dir = '/home/jcurtis2/hackathon_data/'
wrf_filename = '%straining.nc' % data_dir
ncf = Dataset(wrf_filename, mode="r", format="NETCDF4")

In [11]:
# Read one element (index 0 on all four axes) of the 'pmc_SO4' variable.
# NOTE(review): the result is stored as `lats` although it comes from
# 'pmc_SO4', not a latitude field — confirm the intended variable/name.
_so4_variable = ncf.variables['pmc_SO4']
lats = _so4_variable[0, 0, 0, 0].data

In [12]:
# Variable catalogues and the subset actually used for the regression.
#
# `all_input_vars` lists every candidate predictor; `out_vars` lists every
# candidate target. The model below only uses `input_vars` / `output_vars`,
# which are explicit subsets of those catalogues. (Previously the full
# predictor list was itself assigned to `input_vars` and then silently
# overwritten by the subset, which made it look as if all predictors were used.)

all_input_vars = [
    'TOT_NUM_CONC',
    'TOT_MASS_CONC',
    'pmc_SO4',
    'pmc_NO3',
    'pmc_Cl',
    'pmc_NH4',
    'pmc_ARO1',
    'pmc_ARO2',
    'pmc_ALK1',
    'pmc_OLE1',
    'pmc_API1',
    'pmc_API2',
    'pmc_LIM1',
    'pmc_LIM2',
    'pmc_OC',
    'pmc_BC',
    'pmc_H2O',
    'TEMPERATURE',
    'REL_HUMID',
    'ALT',
    'Z',
    'XLAT',
    'XLONG',
    'h2so4',
    'hno3',
    'hcl',
    'nh3',
    'no',
    'no2',
    'no3',
    'n2o5',
    'hono',
    'hno4',
    'o3',
    'o1d',
    'O3P',
    'oh',
    'ho2',
    'h2o2',
    'co',
    'so2',
    'ch4',
    'c2h6',
    'ch3o2',
    'ethp',
    'hcho',
    'ch3oh',
    'ANOL',
    'ch3ooh',
    'ETHOOH',
    'ald2',
    'hcooh',
    'RCOOH',
    'c2o3',
    'pan',
    'aro1',
    'aro2',
    'alk1',
    'ole1',
    'api1',
    'api2',
    'lim1',
    'lim2',
    'par',
    'AONE',
    'mgly',
    'eth',
    'OLET',
    'OLEI',
    'tol',
    'xyl',
    'cres',
    'to2',
    'cro',
    'open',
    'onit',
    'rooh',
    'ro2',
    'ano2',
    'nap',
    'xo2',
    'xpar',
    'isop',
    'isoprd',
    'isopp',
    'isopn',
    'isopo2',
    'api',
    'lim',
    'dms',
    'msa',
    'dmso',
    'dmso2',
    'ch3so2h',
    'ch3sch2oo',
    'ch3so2',
    'ch3so3',
    'ch3so2oo',
    'ch3so2ch2oo',
    'SULFHOX',
    'P',
    'PB',
]

# Every candidate target variable (kept under its original name `out_vars`).
out_vars = [
    'ccn_001',
    'ccn_003',
    'ccn_006',
    'CHI',
    'CHI_CCN',
    'D_ALPHA',
    'D_GAMMA',
    'D_ALPHA_CCN',
    'D_GAMMA_CCN',
    'PM25',
]

# Predictors actually fed to the model.
input_vars = [
    'isop',
    'isoprd',
    'isopp',
    'isopn',
    'isopo2',
]

# Targets actually fitted.
output_vars = ['PM25', 'D_GAMMA_CCN']

# Catch typos early: every selected name must exist in its catalogue.
assert set(input_vars) <= set(all_input_vars), "unknown name in input_vars"
assert set(output_vars) <= set(out_vars), "unknown name in output_vars"


In [13]:
# flatten out data

# Number of trailing flattened samples held back for testing.
N_HOLDOUT = 100
# Number of copies of the 3-D XLAT/XLONG fields needed to match a 4-D variable.
N_LEVELS = 133


def _flatten_variable(dataset, name, n_levels=N_LEVELS):
    """Return variable `name` of `dataset` as a 1-D array.

    Most variables are read with four indices and flattened directly.
    'XLAT' and 'XLONG' are read with three indices, so they are repeated
    `n_levels` times along a new second axis before flattening so that they
    have as many samples as a 4-D variable.

    NOTE(review): this assumes the 4-D variables are ordered
    (time, level, y, x) with `n_levels` levels, and XLAT/XLONG are
    (time, y, x) — TODO confirm against the file's dimensions.

    Parameters
    ----------
    dataset : object exposing a `variables` mapping (here the open `ncf`).
    name : str
        Key into `dataset.variables`.
    n_levels : int
        Repeat count used only for 'XLAT' / 'XLONG'.

    Returns
    -------
    1-D array of the variable's values.
    """
    variable = dataset.variables[name]
    if name in ('XLAT', 'XLONG'):
        field = variable[:, :, :]
        # Insert the missing axis and repeat along it; the previous code tried
        # `list.append(...)` here, which cannot work on an array (or on the
        # builtin `list` type) and raised as soon as this branch was reached.
        return np.repeat(field[:, np.newaxis, :, :], n_levels, axis=1).flatten()
    return variable[:, :, :, :].flatten()


def _split_train_test(dataset, names, n_holdout=N_HOLDOUT):
    """Flatten each variable in `names` and split off the last `n_holdout` values.

    Returns a `(train, test)` pair of dicts keyed by variable name, in the
    order given by `names`. `n_holdout` must be positive.
    """
    train, test = {}, {}
    for name in names:
        values = _flatten_variable(dataset, name)
        train[name] = values[:-n_holdout]
        test[name] = values[-n_holdout:]
    return train, test


# training dictionaries plus the last N_HOLDOUT data points kept for testing
data_frame_input, test_input = _split_train_test(ncf, input_vars)
data_frame_output, test_output = _split_train_test(ncf, output_vars)

In [14]:

# Wrap the per-variable dictionaries in DataFrames (one column per variable):
# first the training split, then the held-out split.
X, y = (pd.DataFrame(columns) for columns in (data_frame_input, data_frame_output))
X_test, y_test = (pd.DataFrame(columns) for columns in (test_input, test_output))

# Show the held-out predictors as the cell output.
X_test

Unnamed: 0,isop,isoprd,isopp,isopn,isopo2
0,1.000076e-16,1.341576e-14,1.000015e-16,1.000015e-16,1.000028e-16
1,1.000060e-16,1.214021e-14,1.000028e-16,1.000026e-16,1.000038e-16
2,1.000093e-16,1.353928e-14,1.000066e-16,1.000060e-16,1.000071e-16
3,1.000047e-16,1.131719e-14,1.000047e-16,1.000049e-16,1.000046e-16
4,1.000022e-16,9.758905e-15,1.000040e-16,1.000042e-16,1.000035e-16
...,...,...,...,...,...
95,1.000252e-16,1.052882e-13,1.000069e-16,1.000059e-16,1.000114e-16
96,1.000251e-16,1.165845e-13,1.000060e-16,1.000054e-16,1.000107e-16
97,1.000253e-16,1.205351e-13,1.000059e-16,1.000055e-16,1.000104e-16
98,1.000294e-16,1.317641e-13,1.000064e-16,1.000058e-16,1.000104e-16


In [34]:
# with sklearn

# Build a linear regression estimator and fit it on the training frames.
# `y` is passed as a whole DataFrame, so the model is fitted against every
# column of `y` at once.
regr = linear_model.LinearRegression()
regr.fit(X=X, y=y)

LinearRegression()

In [36]:
# prediction with sklearn

# Predict for the first held-out sample. Passing a one-row DataFrame (instead
# of a hand-built [[x1, ..., x5]] list) keeps the column names and order the
# model was fitted with, so this cell does not need editing when `input_vars`
# changes.
sample = X_test.iloc[[0]]
prediction = regr.predict(sample)

# `regr` was fitted on every column of `y`, so each prediction row holds one
# value per target. Select the PM25 column so the printed number matches the
# label and the reference value printed below it.
pm25_col = y.columns.get_loc('PM25')
print ('Predicted PM25 at some time: \n', prediction[:, [pm25_col]])
print(y_test['PM25'].iloc[0])

Predicted PM25 at some time: 
 [[2.7381142e-10]]
2.4822937e-11




isop      1.000076e-16
isoprd    1.341576e-14
isopp     1.000015e-16
isopn     1.000015e-16
isopo2    1.000028e-16
Name: 0, dtype: float32