In [1]:
import pandas as pd
import numpy as np

import pickle
import glob

import warnings
warnings.filterwarnings('ignore')

In [2]:
import configparser

# Parse the notebook settings from ./config.ini. Later cells read the
# 'convert' section (hdf_path, start_date, end_date, aod_pm25, pm25_aod).
# `read` returns the list of files it parsed, which is what this cell displays.
config = configparser.ConfigParser()
config.read("./config.ini")

['./config.ini']

In [3]:
# Load the pickled MCD19A2 object and pull out its longitude/latitude arrays.
# The file is opened in a `with` block so the handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code - only load files you produced yourself.
with open("./mcd19a2.pkl", "rb") as mcd19a2_file:
    mcd19a2_obj = pickle.load(mcd19a2_file)
mcd19a2_longitude, mcd19a2_latitude = mcd19a2_obj['longitude'], mcd19a2_obj['latitude']

In [4]:
# Load the station metadata table and keep only the rows whose City is 'Bengaluru'.
# NOTE(review): read_pickle unpickles the file - only use trusted inputs.
stations_df = pd.read_pickle('../2015-2020-pm25/india_stations.pkl')
stations_df = stations_df[stations_df['City'] == 'Bengaluru']
stations_df

Unnamed: 0,StationName,StationId,City,Latitude,Longitude
77,"BTM Layout, Bengaluru - CPCB",KA002,Bengaluru,12.916576,77.610116
78,"BWSSB Kadabesanahalli, Bengaluru - CPCB",KA003,Bengaluru,12.960388,77.718993
79,"Bapuji Nagar, Bengaluru - KSPCB",KA004,Bengaluru,12.95678,77.539729
80,"City Railway Station, Bengaluru - KSPCB",KA005,Bengaluru,12.97584,77.565756
81,"Hebbal, Bengaluru - KSPCB",KA006,Bengaluru,13.0354,77.5988
82,"Hombegowda Nagar, Bengaluru - KSPCB",KA007,Bengaluru,12.9375,77.5949
83,"Peenya, Bengaluru - CPCB",KA009,Bengaluru,13.028513,77.519676
84,"Sanegurava Halli, Bengaluru - KSPCB",KA010,Bengaluru,12.915518,77.585666
85,"Silk Board, Bengaluru - KSPCB",KA011,Bengaluru,12.91771,77.623786


In [5]:
# A station id is the first five characters of each model file name in ./dl_models.
# glob returns paths in arbitrary (filesystem-dependent) order, so they are sorted
# here to make the seeded train/test split below reproducible on every machine.
# NOTE(review): sorting may select different stations than an earlier unsorted run did.
station_ids = [file.split("/")[-1][:5] for file in sorted(glob.glob("./dl_models/*.pkl"))]

In [6]:
# Seeded split: six station ids are drawn without replacement for training,
# every remaining id is held out for testing.
np.random.seed(46)
train_stations = list(np.random.choice(station_ids, size=6, replace=False))
# sorted() instead of list(): set iteration order depends on string hashing and
# can change between kernel sessions, which made the displayed order unstable.
test_stations  = sorted(set(station_ids) - set(train_stations))

In [7]:
# Show the station ids drawn for training.
train_stations

['KA004', 'KA003', 'KA006', 'KA007', 'KA009', 'KA011']

In [8]:
# Show the held-out station ids.
test_stations

['KA002']

In [9]:
# Maps StationId -> (StationName, Latitude, Longitude) for the Bengaluru stations.
LOCATIONS = {}

def add_in_dict(row):
    """Register one (StationName, StationId, Latitude, Longitude) row in LOCATIONS."""
    station_name, station_id = row[0], row[1]
    latitude, longitude = row[2], row[3]
    LOCATIONS[station_id] = (station_name, latitude, longitude)

# Fill the lookup table, one station per row of the selected columns.
for row in stations_df[['StationName', 'StationId', 'Latitude', 'Longitude']].values:
    add_in_dict(row)

In [10]:
def get_nearest_point_idx(latitude, longitude, user_lat, user_lon):
    """Return the (row, col) index of the grid cell closest to a query point.

    Distances are computed with the haversine formula on a sphere of radius
    6371000 (metres).

    Parameters
    ----------
    latitude, longitude : 2-D arrays of grid coordinates in degrees.
    user_lat, user_lon : coordinates of the query point in degrees.

    Returns
    -------
    (x, y) : indices into the grid arrays of the minimum-distance cell.
    """
    earth_radius = 6371000

    phi_user = np.radians(user_lat)
    phi_grid = np.radians(latitude)
    half_dlat_sin = np.sin(np.radians(latitude - user_lat) / 2)
    half_dlon_sin = np.sin(np.radians(longitude - user_lon) / 2)

    # Haversine term; the factors are multiplied in the same left-to-right order
    # as before so the floating-point result is unchanged.
    a = half_dlat_sin * half_dlat_sin + np.cos(phi_user) * np.cos(phi_grid) * half_dlon_sin * half_dlon_sin
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = earth_radius * central_angle

    # argmin works on the flattened array; convert back to 2-D indices.
    x, y = np.unravel_index(distance.argmin(), distance.shape)
    return x, y

In [11]:
def get_nearest_3x3_grid(data, x, y):
    """Summarise the 3x3 neighbourhood around grid cell (x, y).

    The window centre is clamped to the range [1, size - 2] on each axis, so
    the window never runs off the grid and always contains the requested cell.
    (The previous code moved the centre back by two cells at the upper
    boundary, which left the requested cell outside the window.)

    Parameters
    ----------
    data : 2-D array of values; NaN marks missing cells.
    x, y : row and column index of the cell of interest.

    Returns
    -------
    dict
        Always contains "x" and "y" (the window centre actually used).
        When at least one cell in the window is not NaN it also contains
        "data" (the float window), "average", "std" and "median", all
        computed while ignoring NaNs.
    """
    # min() first, then max(): for grids narrower than three cells the centre
    # falls back to 1 and the slice below simply returns the cells that exist.
    x = max(1, min(x, data.shape[0] - 2))
    y = max(1, min(y, data.shape[1] - 2))

    three_by_three = data[x-1:x+2, y-1:y+2].astype(float)

    not_nans = np.count_nonzero(~np.isnan(three_by_three))

    if not_nans == 0:
        # No valid observation in the window: report only the location.
        return {
            "x": x,
            "y": y,
        }

    return {
        "x": x,
        "y": y,
        "data": three_by_three,
        "average": np.nanmean(three_by_three),
        "std": np.nanstd(three_by_three),
        "median": np.nanmedian(three_by_three),
    }

In [12]:
def nearest_lat_lon(station_lat, station_lon):
    """Locate the MCD19A2 grid cell closest to a station.

    Returns a 4-tuple: the row and column index of the nearest cell, followed
    by that cell's latitude and longitude rounded to 8 decimals.
    """
    row, col = get_nearest_point_idx(mcd19a2_latitude, mcd19a2_longitude, station_lat, station_lon)

    nearest_lat = np.round(mcd19a2_latitude[row, col], 8)
    nearest_lon = np.round(mcd19a2_longitude[row, col], 8)

    return row, col, nearest_lat, nearest_lon

In [13]:
import datetime

def extract_date_from_file_name(FILE_NAME):
    """Parse the acquisition date encoded in a file name.

    The second dot-separated field of the base name is expected to be one
    prefix character followed by year and day-of-year (e.g. "A2019276").
    Returns a `datetime.datetime` at midnight of that day.
    """
    base_name = FILE_NAME.split('/')[-1]
    date_field = base_name.split('.')[1]
    # Drop the one-character prefix; "%Y%j" is year followed by day of year.
    return datetime.datetime.strptime(date_field[1:], "%Y%j")

In [14]:
# Glob pattern for the input HDF files: the configured prefix followed by "*.hdf".
# NOTE(review): plain string concatenation - presumably `hdf_path` ends with a
# path separator; confirm in config.ini.
PATH = config['convert']['hdf_path'] + "*.hdf"
# glob gives no ordering guarantee; perform_task addresses files by index into this list.
FILE_LIST = glob.glob(PATH)

In [15]:
from pyhdf import SD
import multiprocessing as mp

# Manager-backed list so that worker processes running perform_task can all
# append their result rows to the same container.
manager = mp.Manager()
rows = manager.list()


def perform_task(idx):
    """Extract per-station AOD from one HDF file and append it to `rows`.

    Parameters
    ----------
    idx : int
        Position of the file to process in the module-level FILE_LIST.

    Side effects
    ------------
    For every id in `station_ids` whose 3x3 neighbourhood holds at least one
    valid value, appends
    [date "YYYY-MM-DD", station_id, mean AOD rounded to 3 decimals,
     nearest grid latitude, nearest grid longitude]
    to the shared `rows` list. Returns None.
    """

    # ------------- Data Loading Start ------------------

    SDS_NAME = "Optical_Depth_047"
    FILE_NAME = FILE_LIST[idx]

    # Release the dataset and file handles even if reading fails; they were
    # previously left open for the lifetime of the worker.
    hdf = SD.SD(FILE_NAME)
    try:
        sds = hdf.select(SDS_NAME)
        try:
            data = sds.get()
            attributes = sds.attributes()
        finally:
            sds.endaccess()
    finally:
        hdf.end()

    scale_factor = attributes['scale_factor']
    fv = attributes['_FillValue']

    # Replace fill values with NaN, then average over the leading axis.
    # NOTE(review): presumably the leading axis indexes the overpasses stored in
    # the file - confirm against the product specification.
    data = data.astype(float)
    data[data == fv] = np.nan
    data = np.nanmean(data, axis=0)

    scaled_data = data * scale_factor

    # ------------- Data Loading End ------------------

    date_label = extract_date_from_file_name(FILE_NAME).strftime("%Y-%m-%d")

    # ------------- AOD Extraction Start --------------

    for station_id in station_ids:

        station_name, station_lat, station_lon = LOCATIONS[station_id]
        x_coord, y_coord, nearest_lat, nearest_lon = nearest_lat_lon(station_lat, station_lon)

        grid = get_nearest_3x3_grid(scaled_data, x_coord, y_coord)

        # 'average' is absent when every cell of the window is NaN. That was the
        # case the old bare `except: pass` hid; skip it explicitly so genuine
        # errors are no longer swallowed.
        if 'average' not in grid:
            continue

        fix_station_aod = grid['average'].round(3)
        rows.append([date_label, station_id, fix_station_aod, nearest_lat, nearest_lon])

    # ------------- AOD Extraction End --------------

In [16]:
# Process every HDF file in parallel, one task per index into FILE_LIST.
# The context manager tears the pool down even when a task raises; `map`
# blocks until all tasks are done, so no work is lost when the block exits.
with mp.Pool(mp.cpu_count()) as pool:
    pool.map(perform_task, range(len(FILE_LIST)))

In [17]:
# Copy the shared manager list into a plain NumPy array.
# NOTE(review): each row mixes strings and numbers, so NumPy presumably stores
# everything as strings here; the numeric columns are cast back with
# `.astype("float64")` when merged_df is built. Re-running this cell re-wraps
# the already converted array.
rows = np.array(list(rows))

In [18]:
# Build the AOD table: parse the date strings, then index by (date, station_id)
# and sort on that index.
aod_df = (
    pd.DataFrame(rows, columns=['date', 'station_id', 'aod', 'latitude', 'longitude'])
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    .set_index(['date', 'station_id'])
    .sort_index()
)

In [19]:
# Preview the extracted AOD table.
aod_df

Unnamed: 0_level_0,Unnamed: 1_level_0,aod,latitude,longitude
date,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-10-03,KA004,0.71,12.96080067,77.54263828
2019-10-03,KA006,0.813,13.03586322,77.60128382
2019-10-25,KA006,0.426,13.03586322,77.60128382
2019-10-25,KA009,0.423,13.02752294,77.51918007
2019-10-28,KA002,0.53,12.91909925,77.61301293
...,...,...,...,...
2019-12-29,KA004,0.993,12.96080067,77.54263828
2019-12-29,KA007,1.027,12.93577982,77.58955471
2019-12-29,KA009,1.021,13.02752294,77.51918007
2019-12-30,KA002,1.135,12.91909925,77.61301293


In [20]:
# Load the per-station PM2.5 frames and keep only the stations that are in
# station_ids. Each frame is identified by the StationId of its first row.
# NOTE(review): read_pickle unpickles the file - only use trusted inputs.
stations_df_pm25_list = pd.read_pickle('../2015-2020-pm25/india_stations_pm25.pkl')
pm25_values_df_list = [
    station_frame
    for station_frame in stations_df_pm25_list
    if station_frame['StationId'].iloc[0] in station_ids
]

In [21]:
# Bounds used to slice each station's PM2.5 frame (`[START:END]`); configparser
# returns them as strings.
START = config['convert']['start_date']
END = config['convert']['end_date']

In [22]:
def previous_window_impute(station_df, window_length=14):
    """Fill missing PM2.5 values with the value `window_length` rows earlier.

    Rows are processed in order, so a value that was itself imputed can be
    reused further down. The first `window_length` rows have no earlier row to
    copy from and are left as NaN. (The previous implementation indexed with a
    negative position there, which could pull values from the END of the
    series, and hid every error behind a bare `except`.)

    Parameters
    ----------
    station_df : pandas.DataFrame with a numeric "PM2.5" column.
    window_length : int, default 14
        How many rows to look back; must be at least 1.

    Returns
    -------
    The same DataFrame object; its "PM2.5" column is updated in place.

    Raises
    ------
    ValueError
        If `window_length` is smaller than 1.
    """
    column = "PM2.5"
    if window_length < 1:
        raise ValueError("window_length must be a positive integer")

    # Work on a positional float copy: no chained assignment and no dependence
    # on the type of the frame's index.
    values = station_df[column].to_numpy(dtype=float, copy=True)
    for idx in range(window_length, len(values)):
        if np.isnan(values[idx]):
            values[idx] = values[idx - window_length]

    # Write back into the caller's frame; callers rely on the in-place update.
    station_df[column] = values
    return station_df

In [23]:
# Impute missing PM2.5 values per station, print the ids of stations that still
# have gaps afterwards, and trim every frame to the configured START..END range.
for i, df in enumerate(pm25_values_df_list):

    if df['PM2.5'].isna().sum():
        df = previous_window_impute(df)
        if df['PM2.5'].isna().sum():
            print(df['StationId'].values[0])

    # Slice the frame returned by the imputation step. Previously the list
    # element was sliced instead, which worked only because the helper also
    # mutates its argument in place.
    pm25_values_df_list[i] = df[START:END]

In [24]:
# Empty frame with the same columns as the per-station PM2.5 frames.
pm25_df = pd.DataFrame(columns=pm25_values_df_list[0].columns)

In [25]:
# Stack all per-station frames in one call. Appending inside a loop copies the
# growing frame on every iteration, and DataFrame.append is no longer available
# in pandas 2.x. This also makes the cell safe to re-run.
pm25_df = pd.concat(pm25_values_df_list)

In [26]:
# Move the index into a column, rename the three columns, then index by
# (date, station_id) and sort - the same layout as aod_df.
pm25_df = (
    pm25_df
    .reset_index()
    .set_axis(["date", "station_id", "PM2.5"], axis=1)
    .set_index(['date', 'station_id'])
    .sort_index()
)

In [27]:
# Align AOD and PM2.5 side by side on the shared (date, station_id) index, drop
# every row with a missing value in any column, cast the value columns to
# float64, and move the index levels back into ordinary columns.
merged_df = pd.concat([aod_df, pm25_df], axis=1).dropna().astype("float64").reset_index()

In [28]:
import random

# Seed Python's `random` module; `random_state` is passed to the tree-based
# regressors defined below.
random.seed(42)
random_state = 42

# NOTE(review): `ratio` is not referenced by any other cell visible in this notebook.
ratio = 0.2

In [29]:
# Station-wise split: rows from the held-out stations form the test set,
# all other rows form the training set.
train_df = merged_df[~merged_df['station_id'].isin(test_stations)]
test_df  = merged_df[merged_df['station_id'].isin(test_stations)]

In [30]:
# First experiment: predict PM2.5 from AOD.
X_columns = ['aod']
y_columns = ['PM2.5']

# The model factories below read these four module-level arrays when they are called.
# Targets are kept as (n, 1) column vectors.
X_train, y_train = train_df[X_columns].values, train_df[y_columns].values.reshape(-1,1)
X_test, y_test = test_df[X_columns].values, test_df[y_columns].values.reshape(-1,1)

In [31]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

In [32]:
def _regression_scores(y_true, y_pred):
    """Return the evaluation metrics reported for every model in this notebook."""
    return {
        "r2_score": r2_score(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSLE": mean_squared_log_error(y_true, y_pred),
        "MdAbsE": median_absolute_error(y_true, y_pred),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "mean": np.mean(y_true)
    }


# Every factory below takes no arguments and reads the module-level
# X_train / y_train / X_test / y_test at call time, so re-assigning those arrays
# and calling the factory again trains on the new data. Each returns
# (fitted regressor, score dict, preprocessing object or None).

def get_polynomial_reg_model():
    """Fit a linear regression on PolynomialFeatures (default settings) of X_train.

    Returns (regressor, scores, fitted PolynomialFeatures transformer).
    """
    poly_reg = PolynomialFeatures()
    X_poly = poly_reg.fit_transform(X_train)

    regressor = LinearRegression()
    regressor.fit(X_poly, y_train)

    y_pred = regressor.predict(poly_reg.transform(X_test))

    return regressor, _regression_scores(y_test, y_pred), poly_reg

def get_linear_reg_model():
    """Fit an ordinary linear regression. Returns (regressor, scores, None)."""
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    return regressor, _regression_scores(y_test, y_pred), None

def get_decision_tree_reg_model():
    """Fit a decision tree seeded with `random_state`. Returns (regressor, scores, None)."""
    regressor = DecisionTreeRegressor(random_state=random_state)
    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    return regressor, _regression_scores(y_test, y_pred), None

def get_random_forest_reg_model():
    """Fit a random forest seeded with `random_state`. Returns (regressor, scores, None)."""
    regressor = RandomForestRegressor(random_state=random_state)
    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    return regressor, _regression_scores(y_test, y_pred), None

def get_svr_reg_model():
    """Fit an RBF-kernel SVR on standardised inputs and targets.

    Predictions are mapped back to the original target scale before scoring.
    Returns (regressor, scores, (input scaler, target scaler)).
    """
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    X_train_sc = sc_X.fit_transform(X_train)
    y_train_sc = sc_y.fit_transform(y_train)

    regressor = SVR(kernel='rbf')
    # SVR expects a 1-D target; flatten the (n, 1) column explicitly.
    regressor.fit(X_train_sc, y_train_sc.ravel())

    # predict() returns a 1-D array, but the scaler works on 2-D input (recent
    # scikit-learn releases reject 1-D). Reshape for the inverse transform,
    # then flatten so y_pred keeps its previous shape.
    y_pred_sc = regressor.predict(sc_X.transform(X_test))
    y_pred = sc_y.inverse_transform(y_pred_sc.reshape(-1, 1)).ravel()

    return regressor, _regression_scores(y_test, y_pred), (sc_X, sc_y)

In [33]:
# (display name, factory) pairs evaluated by the model-selection cells. Each
# factory is called with no arguments and returns
# (regressor, score dict, preprocessing object or None).
MODEL_LIST = [
    ('Polynomial', get_polynomial_reg_model),
    ('Linear', get_linear_reg_model),
    ('Decision Tree', get_decision_tree_reg_model),
    ('Random Forest', get_random_forest_reg_model),
    ('SVR', get_svr_reg_model),
]

In [34]:
# Train every candidate on the current X/y arrays, collect its error metrics in
# `rows`, and remember the model with the lowest test RMSE in `best`.
best, best_score, rows = {}, 9e9, []

for name, model_fn in MODEL_LIST:
    regressor, score, scaler = model_fn()
    label = f"{name} Regression"
    rows.append([label, score['MAE'], score['RMSE'], score['MSLE'], score['MdAbsE']])

    # Only a strictly lower RMSE replaces the current best, so on a tie the
    # model listed first in MODEL_LIST is kept.
    if not (score['RMSE'] < best_score):
        continue

    best.update({
        'name': label,
        'regressor': regressor,
        'scaler': scaler,
        'score': score,
    })
    best_score = score['RMSE']

model_dfs = pd.DataFrame(rows, columns=["model", "MAE", "RMSE", "MSLE", "MdAbsE"])
model_dfs

Unnamed: 0,model,MAE,RMSE,MSLE,MdAbsE
0,Polynomial Regression,14.22661,20.02047,0.259888,10.728918
1,Linear Regression,14.22882,19.397858,0.258909,12.114153
2,Decision Tree Regression,17.380402,21.284743,0.367165,15.74
3,Random Forest Regression,16.191672,20.538998,0.318362,14.3783
4,SVR Regression,13.735576,20.209101,0.248096,9.318453


In [35]:
# Emit the comparison table as LaTeX source, rounded to three decimals.
print(model_dfs.round(3).to_latex())

\begin{tabular}{llrrrr}
\toprule
{} &                     model &     MAE &    RMSE &   MSLE &  MdAbsE \\
\midrule
0 &     Polynomial Regression &  14.227 &  20.020 &  0.260 &  10.729 \\
1 &         Linear Regression &  14.229 &  19.398 &  0.259 &  12.114 \\
2 &  Decision Tree Regression &  17.380 &  21.285 &  0.367 &  15.740 \\
3 &  Random Forest Regression &  16.192 &  20.539 &  0.318 &  14.378 \\
4 &            SVR Regression &  13.736 &  20.209 &  0.248 &   9.318 \\
\bottomrule
\end{tabular}



In [36]:
import pickle

# Persist the best AOD -> PM2.5 bundle (name, regressor, scaler, score) to the
# path configured as [convert] aod_pm25, using pickle protocol 4.
with open(config['convert']["aod_pm25"], "wb") as file:
    pickle.dump(best, file, protocol=4)

In [37]:
# Second experiment: swap the roles and predict AOD from PM2.5.
y_columns = ['aod']
X_columns = ['PM2.5']

# Rebinding these module-level arrays is what redirects the model factories,
# which read them when they are called.
X_train, y_train = train_df[X_columns].values, train_df[y_columns].values.reshape(-1,1)
X_test, y_test = test_df[X_columns].values, test_df[y_columns].values.reshape(-1,1)

In [38]:
# Same selection procedure as for the first experiment, run on the swapped
# arrays: collect each candidate's metrics and keep the lowest-RMSE model.
best, best_score, rows = {}, 9e9, []

for name, model_fn in MODEL_LIST:
    regressor, score, scaler = model_fn()
    label = f"{name} Regression"
    rows.append([label, score['MAE'], score['RMSE'], score['MSLE'], score['MdAbsE']])

    # Strict comparison: on a tie the model listed first in MODEL_LIST is kept.
    if not (score['RMSE'] < best_score):
        continue

    best.update({
        'name': label,
        'regressor': regressor,
        'scaler': scaler,
        'score': score,
    })
    best_score = score['RMSE']

model_dfs = pd.DataFrame(rows, columns=["model", "MAE", "RMSE", "MSLE", "MdAbsE"])
model_dfs

Unnamed: 0,model,MAE,RMSE,MSLE,MdAbsE
0,Polynomial Regression,0.101746,0.151215,0.008278,0.072327
1,Linear Regression,0.100525,0.147894,0.007978,0.074721
2,Decision Tree Regression,0.132345,0.17844,0.012106,0.088
3,Random Forest Regression,0.117805,0.163011,0.009981,0.09126
4,SVR Regression,0.088498,0.142416,0.007156,0.053624


In [39]:
# Emit the PM2.5 -> AOD comparison table as LaTeX source, rounded to three decimals.
print(model_dfs.round(3).to_latex())

\begin{tabular}{llrrrr}
\toprule
{} &                     model &    MAE &   RMSE &   MSLE &  MdAbsE \\
\midrule
0 &     Polynomial Regression &  0.102 &  0.151 &  0.008 &   0.072 \\
1 &         Linear Regression &  0.101 &  0.148 &  0.008 &   0.075 \\
2 &  Decision Tree Regression &  0.132 &  0.178 &  0.012 &   0.088 \\
3 &  Random Forest Regression &  0.118 &  0.163 &  0.010 &   0.091 \\
4 &            SVR Regression &  0.088 &  0.142 &  0.007 &   0.054 \\
\bottomrule
\end{tabular}



In [40]:
import pickle

# Persist the best PM2.5 -> AOD bundle (name, regressor, scaler, score) to the
# path configured as [convert] pm25_aod, using pickle protocol 4.
with open(config['convert']["pm25_aod"], "wb") as file:
    pickle.dump(best, file, protocol=4)