In [1]:
import pandas as pd
import numpy as np

import pickle

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by the
# libraries used below - confirm the blanket filter is intended.
warnings.filterwarnings('ignore')

In [2]:
from glob import glob

# Station ids are the first five characters of every pickle found directly
# under ./dl_models/.
# NOTE(review): glob() does not guarantee an ordering, and the seeded sampling
# in the next cell depends on the order of this list - confirm the split is the
# same on every machine that runs this notebook.
dl_pickle_paths = glob("./dl_models/*.pkl")
station_names = [pickle_path.split("/")[-1][:5] for pickle_path in dl_pickle_paths]

In [3]:
# Fixed seed so the same six stations are sampled for training on every run.
np.random.seed(46)
train_stations = list(np.random.choice(station_names, size=6, replace=False))

# Everything that was not sampled is held out for testing. Filtering the
# original listing (de-duplicated with dict.fromkeys) keeps the held-out
# stations in a deterministic order; a set difference would order them by
# string hash, which changes between interpreter sessions.
test_stations = [name for name in dict.fromkeys(station_names) if name not in train_stations]

In [4]:
# Show the six stations sampled for training.
train_stations

['KA004', 'KA003', 'KA006', 'KA007', 'KA009', 'KA011']

In [5]:
# Show the stations left out of the training sample.
test_stations

['KA002']

In [6]:
import configparser

config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so an empty result means the configuration is missing.
# Fail here instead of with an unrelated KeyError on the first lookup below.
if not config.read("./config.ini"):
    raise FileNotFoundError("./config.ini could not be read")

city = "Bengaluru"

# Bounds of the evaluation window; used below to slice the date-indexed frames.
START_DATE = config['period']['test_start_date']
END_DATE   = config['period']['test_end_date']

In [7]:
# Station metadata for the whole country, narrowed to the stations used here.
india_stations = pd.read_csv("../2015-2020-pm25/india_stations-corrected.csv")
is_study_station = india_stations["StationId"].isin(train_stations + test_stations)
bengaluru_stations = india_stations[is_study_station]

In [8]:
def get_lat_lng(station_id):
    """Return ``(latitude, longitude)`` of a station listed in `bengaluru_stations`.

    The first row whose ``StationId`` equals `station_id` is used; an
    ``IndexError`` is raised when no row matches.
    """
    matches = bengaluru_stations.loc[bengaluru_stations['StationId'] == station_id]
    latitude = matches['Latitude'].values[0]
    longitude = matches['Longitude'].values[0]
    return latitude, longitude

In [9]:
# Folder holding one "<station>_spt.pkl" spatial model per training station.
SPATIAL_MODELS_DIR = "./spt_models/"

# One entry per model family; "dir" is the folder its artefacts are read from.
# Per-model entries are added next to "dir" by the loading cell below.
sequence = {
    category: {"dir": directory}
    for category, directory in (
        ("Machine Learning", "./ml_models/"),
        ("Deep Learning", "./dl_models/"),
        ("Statistical", "./stat_models/"),
    )
}

sequence_keys = [*sequence]

In [10]:
# Step 1: register every model folder found under each category directory.
# Machine-learning folders get a " Regression" suffix appended to their name.
for key in sequence_keys:
    name_suffix = " Regression" if key == "Machine Learning" else ""
    for model_dir in glob(sequence[key]["dir"] + "*/"):
        sequence[key][model_dir.split("/")[-2] + name_suffix] = {
            "aod_values"   : {},
            "predictions"  : {}
        }

# Step 2: for every training station, attach each model's AOD values and the
# category-level predictions frame, both restricted to the test period.
for station in train_stations:
    for key in sequence_keys:
        name_suffix = " Regression" if key == "Machine Learning" else ""

        for file in glob(sequence[key]["dir"] + "*" + "/" + station + "_aod.pkl"):
            model = file.split("/")[-2] + name_suffix

            # Context managers close the handles deterministically; the bare
            # open() calls used before left them to the garbage collector.
            with open(file, "rb") as aod_file:
                aod_values = pickle.load(aod_file)
            with open(sequence[key]["dir"] + station + "_predictions.pkl", "rb") as predictions_file:
                predictions = pickle.load(predictions_file)

            # The AOD values carry no index of their own, so they borrow the
            # index of the predictions frame before being sliced by date.
            aod_values = pd.DataFrame(aod_values, index=predictions.index.values, columns=['aod'])[START_DATE:END_DATE]
            predictions = predictions[START_DATE:END_DATE]

            sequence[key][model]['aod_values'][station]  = aod_values
            sequence[key][model]['predictions'][station] = predictions

In [11]:
# Observed values ('Actual' column) of every held-out station over the test
# period, keyed by station id.
test_sequence = {}

for station in test_stations:
    # `with` closes the pickle file as soon as it has been read.
    with open("./ml_models/" + station + "_predictions.pkl", "rb") as predictions_file:
        predictions = pickle.load(predictions_file)
    test_sequence[station] = predictions[START_DATE:END_DATE]['Actual']

In [12]:
import datetime

# One datetime per time step of the test period. The index is taken from the
# stored series of the last test station (the same frame the previous cell
# loaded last) instead of from a variable leaked out of that cell's loop.
# pandas does the conversion, which avoids the deprecated
# datetime.utcfromtimestamp and the float-seconds round trip.
dates = list(pd.DatetimeIndex(test_sequence[test_stations[-1]].index.values).to_pydatetime())

In [13]:
def hypotenuse_distance(x, y):
    """Return the length of the hypotenuse of a right triangle with legs `x` and `y`.

    Works element-wise when `x` and `y` are numpy arrays.
    """
    squared_length = x**2 + y**2
    return np.sqrt(squared_length)

In [14]:
def get_weighted_value(y):
    """Blend values with inverse-squared-distance weights.

    Parameters
    ----------
    y : array-like of shape (n, 2)
        Column 0 holds the values, column 1 the distance attached to each value.

    Returns
    -------
    float
        ``sum(v / d**2) / sum(1 / d**2)``. When one or more distances are
        exactly zero, the mean of the values at those zero distances is
        returned instead; dividing by zero there would produce an infinite
        weight and a NaN result.
    """
    y = np.asarray(y, dtype=float)
    values, distances = y[:, 0], y[:, 1]

    coincident = distances == 0
    if coincident.any():
        return values[coincident].mean()

    weights = 1.0 / distances**2
    return np.sum(weights * values) / np.sum(weights)
    
def mix(y_list):
    """Apply `get_weighted_value` to every entry of `y_list`.

    Returns a numpy array with one blended value per entry.
    """
    blended = [get_weighted_value(value_distance_pairs) for value_distance_pairs in y_list]
    return np.array(blended)

In [15]:
def model_predict(X, regressor, scaler):
    """Predict with a spatial model and pair each prediction with its distance.

    Parameters
    ----------
    X : array-like of shape (n, 3)
        Feature rows; columns 1 and 2 are passed to `hypotenuse_distance` to
        obtain the distance attached to each row.
    regressor : object
        Fitted estimator exposing ``predict``.
    scaler : None, (sc_X, sc_y) pair, or object exposing ``transform``
        Falsy: `X` is passed to the regressor unchanged.
        Tuple/list: `X` goes through ``sc_X.transform`` and the prediction
        through ``sc_y.inverse_transform``.
        Anything else: `X` goes through ``scaler.transform`` only.

    Returns
    -------
    numpy.ndarray of shape (n, 2)
        Column 0 is the prediction, column 1 the distance.
    """
    # Dispatch on the scaler's type instead of wrapping the whole prediction in
    # a broad ``except Exception``: a genuine failure inside the (sc_X, sc_y)
    # path now surfaces as itself rather than being replaced by an unrelated
    # AttributeError from the fallback branch.
    if not scaler:
        y_pred = regressor.predict(X)
    elif isinstance(scaler, (tuple, list)):
        sc_X, sc_y = scaler
        y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X)))
    else:
        y_pred = regressor.predict(scaler.transform(X))

    rows = np.asarray(X)
    distances = hypotenuse_distance(rows[:, 1], rows[:, 2])

    # Flatten so that an (n, 1) prediction still pairs one value per row.
    return np.column_stack((np.ravel(y_pred), distances))

def aod2pm25_model_predict(X, regressor, scaler):
    """Run the AOD-to-PM2.5 regressor on `X`, applying its scaler if it has one.

    Parameters
    ----------
    X : array-like
        Feature rows handed to the scaler / regressor.
    regressor : object
        Fitted estimator exposing ``predict``.
    scaler : None, (sc_X, sc_y) pair, or object exposing ``transform``
        Falsy: `X` is passed to the regressor unchanged.
        Tuple/list: `X` goes through ``sc_X.transform`` and the prediction
        through ``sc_y.inverse_transform``.
        Anything else: `X` goes through ``scaler.transform`` only.

    Returns
    -------
    Whatever the regressor (or ``sc_y.inverse_transform``) returns.
    """
    # Dispatch on the scaler's type instead of wrapping the whole prediction in
    # a broad ``except Exception``: a genuine failure inside the (sc_X, sc_y)
    # path now surfaces as itself rather than being replaced by an unrelated
    # AttributeError from the fallback branch.
    if not scaler:
        return regressor.predict(X)

    if isinstance(scaler, (tuple, list)):
        sc_X, sc_y = scaler
        return sc_y.inverse_transform(regressor.predict(sc_X.transform(X)))

    return regressor.predict(scaler.transform(X))

In [16]:
# (latitude, longitude) of every test station, one row per station, in the
# same order as `test_stations`.
TEST_STATION_COORDS = np.array(list(map(get_lat_lng, test_stations)))

In [17]:
import multiprocessing as mp

# Manager-backed dictionaries that the worker processes write their output to.
manager = mp.Manager()
scores = manager.dict()
results = manager.dict()

# One nested manager dictionary per model category in both containers.
for category in sequence:
    scores[category] = manager.dict()
    results[category] = manager.dict()

In [18]:
from pyhdf import SD
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def perform_task(category, model):
    """Score one (category, model) pair on the held-out test stations.

    For every entry of `dates`, the AOD value stored for each training station
    is fed through that station's spatial model to estimate AOD at every test
    station. The per-station estimates are blended with `mix`, converted to
    PM2.5 with the model stored at ``config['convert']['aod_pm25']`` and
    compared with the observations in `test_sequence`.

    Parameters
    ----------
    category : str
        Key of `sequence`.
    model : str
        Model key inside ``sequence[category]``.

    Returns
    -------
    None
        The metrics are written to ``scores[category][model]``; the metrics
        together with ``y_pred`` and ``y_test`` are written to
        ``results[category][model]``.
    """
    n_test_stations = len(test_stations)

    # ------------------ Spatial models: load once ---------------------
    # Nothing loaded here depends on the date, so each pickle is read a single
    # time (and its handle closed) instead of once per date and station.
    spatial_inputs = []
    for station_id in train_stations:
        with open(f"{SPATIAL_MODELS_DIR}/{station_id}_spt.pkl", "rb") as spt_file:
            best = pickle.load(spt_file)

        # NOTE(review): the lookup uses the literal key 'station_id', not the
        # value of the loop variable - confirm this matches how the spatial
        # pickles were written.
        anchor = best['station_id']

        spatial_inputs.append((
            sequence[category][model]['aod_values'][station_id]['aod'].values,
            anchor['nearest_lat'] - TEST_STATION_COORDS[:, 0],
            anchor['nearest_lon'] - TEST_STATION_COORDS[:, 1],
            best['regressor'],
            best['scaler'],
        ))

    # ------------------ Interpolation Begin ---------------------------

    y_aod_predictions = []

    for idx in range(len(dates)):
        y_sp_interpolate = []

        for aod_series, lat_offsets, lon_offsets, regressor, scaler in spatial_inputs:
            # One feature row per test station: (AOD, latitude offset, longitude offset).
            X_sp_interpolate_sub = np.column_stack((
                np.full(TEST_STATION_COORDS.shape[0], aod_series[idx]),
                lat_offsets,
                lon_offsets,
            ))
            y_sp_interpolate.append(model_predict(X_sp_interpolate_sub, regressor, scaler))

        # (train station, test station, 2) -> (test station, train station, 2)
        # so that `mix` blends the training-station estimates per test station.
        y_sp_interpolate = np.array(y_sp_interpolate).transpose(1, 0, 2)
        y_aod_predictions.append(mix(y_sp_interpolate).round(3))

    # ------------------ Interpolation End -----------------------------

    # ------------------ AOD to PM2.5 Begin ----------------------------

    y_aod_predictions = np.array(y_aod_predictions)
    y_aod_predictions = y_aod_predictions.reshape(y_aod_predictions.shape[0], n_test_stations)

    with open(config['convert']["aod_pm25"], "rb") as model_file:
        aod2pm25_model = pickle.load(model_file)

    y_pm25_predictions = aod2pm25_model_predict(
        y_aod_predictions.reshape(-1, 1),
        aod2pm25_model['regressor'],
        aod2pm25_model['scaler'],
    ).reshape(y_aod_predictions.shape[0], n_test_stations)

    # Rows are dates and columns are test stations. `y_test` below is built
    # station by station, so flatten column-wise (transpose first) to keep the
    # two vectors aligned when there is more than one test station.
    y_pred = y_pm25_predictions.T.reshape(-1, 1)

    # ------------------ AOD to PM2.5 End ------------------------------

    # ------------------ Scores Begin ----------------------------------

    y_test = np.array([])

    for key in test_stations:
        y_test = np.append(y_test, test_sequence[key].values)

    y_test = y_test.reshape(-1, 1)

    score = {
            "r2_score": r2_score(y_test, y_pred),
            "mae": mean_absolute_error(y_test, y_pred),
            "rmse": np.sqrt(mean_squared_error(y_test, y_pred)),
            "mean": np.mean(y_test)
        }

    scores[category][model] = score

    results[category][model] = {
        "score": score,
        "y_pred": y_pred,
        "y_test": y_test
    }

    # ------------------ Scores End ------------------------------------

In [19]:
# Every (category, model) combination; the "dir" entry of each category is a
# path, not a model, and is skipped.
pairs = [
    (category, model)
    for category, models in sequence.items()
    for model in models
    if model != 'dir'
]

pool = mp.Pool(mp.cpu_count())
try:
    pool.starmap(perform_task, pairs)
finally:
    # Always release the workers, including when a task raised, and wait for
    # them to exit before the next cell reads the shared dictionaries.
    pool.close()
    pool.join()

In [20]:
# Copy the manager-backed dictionaries (outer and nested level) into plain
# dicts so they can be pickled and handed to pandas.
scores = {category: dict(per_model) for category, per_model in dict(scores).items()}
results = {category: dict(per_model) for category, per_model in dict(results).items()}

In [21]:
import pickle

# Persist the outcome of the run. The context managers flush and close each
# file as soon as it is written instead of leaving that to garbage collection.
with open("final_results.pkl", "wb") as results_file:
    pickle.dump(results, results_file, protocol=4)

with open("final_scores.pkl", "wb") as scores_file:
    pickle.dump(scores, scores_file, protocol=4)

In [22]:
# Flatten {category: {model: metrics}} into {(category, model): metrics}.
reform = {}
for outerKey, innerDict in scores.items():
    for innerKey, values in innerDict.items():
        reform[(outerKey, innerKey)] = values

In [23]:
# One row per (category, model) pair, one column per metric.
result_df = pd.DataFrame(reform).T

In [24]:
# Leave r2_score out of the displayed table. drop(..., errors='ignore') keeps
# the cell re-runnable; `del` raised KeyError on a second execution.
result_df = result_df.drop(columns=['r2_score'], errors='ignore')

In [25]:
# Show the remaining metrics (mae, rmse, mean) for every (category, model) pair.
result_df

Unnamed: 0,Unnamed: 1,mae,rmse,mean
Machine Learning,Polynomial Regression,10.399981,12.36935,27.755
Machine Learning,SVR Regression,10.365733,12.368533,27.755
Machine Learning,Linear Regression,10.318772,12.333228,27.755
Machine Learning,Decision Tree Regression,10.30087,12.324597,27.755
Machine Learning,Random Forest Regression,10.412176,12.437565,27.755
Deep Learning,BiDirectional LSTM,10.31981,12.302492,27.755
Deep Learning,LSTM AutoEncoder,10.335377,12.328911,27.755
Deep Learning,Multi Layer Perceptron,10.316437,12.290038,27.755
Statistical,Holt-Winters,10.308913,12.333158,27.755
Statistical,ARIMA,10.306578,12.298804,27.755
