In [228]:
### libraries and modules
from datetime import datetime
import matplotlib.pyplot as plt
import tqdm.notebook as tqdm
import pandas as pd
import numpy as np
import os
import copy
import torch
from torch.utils.data import TensorDataset, DataLoader
import pickle

In [229]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [230]:
#Normalization of data
def m_s_adjust(x):
    """
    Adjusts for mean and std for input/output normalization. Used in the next function.
    
    || INPUT ||
    x - dataset to standardize 
    
    || OUTPUT ||
    x - transformed dataset
    mx - mean of original dataset
    sx - std of original dataset
    """
    x = x.clone()
    mx = x.mean(0, keepdim=True)
    sx = x.std(0, unbiased=False, keepdim=True)
    x -= mx
    x /= sx
    return x, mx, sx

def normalize(train_tuple, test_tuple = None, ms = None):
    """
    Normalize the train and/or test datasets
    
    || INPUT ||
    train_tuple - (train features, train output variable) 
    test_tuple - (test features, test output variable) 
    
    || OUTPUT ||
    outputs transformed train and test tuples (tuples as defined above) along with learnt mean and std. deviations
    """
    train_x, train_y = train_tuple
    train_x, train_y  = train_x.clone(), train_y.clone() 
    
    if ms == None:
        train_x, mx, sx = m_s_adjust(train_x)
        train_y, my, sy = m_s_adjust(train_y)
        train_tuple = (train_x, train_y)
        
        if test_tuple:
            test_x, test_y = test_tuple
            test_x, test_y  = test_x.clone(), test_y.clone() 
            test_x -= mx
            test_x /= sx
            test_y -= my
            test_y /= sy
            test_tuple = (test_x, test_y)
        else:
            test_tuple = None
    else:
        ms_x, ms_y = ms
        mx, sx = ms_x
        my, sy = ms_y

        train_x -= mx
        train_x /= sx
        train_y -= my
        train_y /= sy
        train_tuple = (train_x, train_y)
        test_tuple = None
  
    return train_tuple, test_tuple, (mx, sx), (my, sy)
    
def norm(y):
    mean = np.mean(y)
    std = np.std(y)
    return (y-mean)/std, (mean,std)



In [231]:

def to_dt(time_string):
    return pd.to_datetime(time_string).tz_localize('UTC').tz_convert('Asia/Kolkata')


def train_test_valid(date, data_dir = "./data/", train_frac = 0.9, hour = None, valid_cvs = None, cvs = 1):
    file = date + "_all.csv"
    df = pd.read_csv(data_dir + file, index_col = 0, parse_dates = ["dateTime"])
    

    ### filter time for the current day...we should be able to remove this(just preprocessing step)
    df = df[(df.dateTime >= to_dt(date)) & (df.dateTime <= to_dt(date+ " 18:00:00"))].reset_index(drop = True)

    cv_dict = create_train_test(df, hour = hour, train_frac = train_frac, cvs = cvs)
    print("Total Sets Created:", cv_dict.keys())
    if valid_cvs:
        assert type(valid_cvs) == int
        cv_dict = create_validation(cv_dict, valid_cvs, train_frac)
    return cv_dict

def create_train_test(df, hour = [18], train_frac = 0.9, cvs = 1):
    '''
    The function will return a cv_dict with the keys: train, test.
    also it uses only 1 hour data for creating train, test.
    '''
    # day subset
    dfHour = df[['dateTime','lat','long','pm2_5']].copy()
    print(len(dfHour))
    # hour subset
    if hour:
        dfHour["hour"] = dfHour.dateTime.dt.hour
        dfHour = dfHour[dfHour.hour.isin(hour)]
        dfHour = dfHour.drop("hour", axis = 1)
  
    meaned = dfHour
    # convert into minutes
    meaned.dateTime = meaned.dateTime.dt.hour*60 + meaned.dateTime.dt.minute
    # take time from 9am to 10am, taking 1 hour time in consideration for model
    meaned = meaned[(meaned.dateTime >= 540) & (meaned.dateTime <= 600)] 
    meaned = meaned.sort_values(by = ['dateTime','lat','long'])
    


    cv_dict = {}
    original_cols = ["dateTime", "lat", "long", "pm2_5"]
    ### you can change the number of cross validations here
    for i in range(cvs):
        cv_dict[i] = {}
        dfHour = meaned.reset_index(drop = True).copy()

        # test data selection
        testIdx = dfHour.sample(frac = 1-train_frac, random_state = i).index.tolist()
        testData = dfHour.iloc[testIdx, :].loc[:, original_cols]
        dfHour = dfHour.drop(testIdx, axis=0).reset_index(drop = True)
        
        # original train data retention 
        trainData = dfHour.loc[:, original_cols].copy()

        # getting numpy arrays
        testData = testData.values
        trainData = trainData.values

        cv_dict[i]["train"] = trainData
        cv_dict[i]["test"] = testData

    return cv_dict

def create_validation(cv_dict, valid_cvs, train_frac):
    for main_cv in cv_dict.keys():
        train_df = pd.DataFrame(cv_dict[main_cv]["train"])
        cv_dict[main_cv]["train"] = []
        cv_dict[main_cv]["valid"] = []
        for cv in range(valid_cvs):
            validIdx = train_df.sample(frac = 1-train_frac, random_state = cv).index.tolist()
            validData = train_df.iloc[validIdx, :]
            train_df = train_df.drop(validIdx, axis=0).reset_index(drop = True)
            
            cv_dict[main_cv]["train"].append(train_df.values)
            cv_dict[main_cv]["valid"].append(validData.values)

    return cv_dict


def downsize(df, spatial_round = 3, temporal_round = None):    
    df = pd.DataFrame(df.copy())
    df.columns = ["dateTime", "lat", "long", "pm"]
    df.lat, df.long = df.lat.round(spatial_round), df.long.round(spatial_round)
    
    if temporal_round:
        temp_arr = []
        for h in range(9, 21):
          for j in range(0, 60, temporal_round):
              temp_arr.append(h*60 + j)
        temp_arr = np.array(temp_arr)
        df.dateTime = [temp_arr[temp_arr <= x][-1] for x in df.iloc[:, 0]]

    df = df.groupby(['dateTime','lat','long']).mean().reset_index()
    df = df.values
    return df

def keep_best(prev_train, prev_test, train_rmse, test_rmse):
    """
    This function helps us keep the best model learnt till now. It checks how current train and test rmse are compared to 
    previous iterations train and test rmse. Returns True if both are lower than previous.  
    """
    if train_rmse <= prev_train:
        if test_rmse <= prev_test:
            result = True
        else:
            result = False
    else:
        result = False
    return result


# new function
def deNormalize(y,y_ms):
    '''to denormalize'''
    # print(y.is_cuda,y_ms.is_cuda)
    y_denorm = y*y_ms[1] + y_ms[0]
    return y_denorm


def rmse(y_pred,y_true):
    '''This function calculate RMSE'''
    rmse = torch.sqrt(torch.mean(torch.square(y_pred-y_true)))
    return rmse

def evaluateRMSE(y_pred,y_test,y_ms):
    y_pred_denorm = deNormalize(y_pred,y_ms)
    y_test_denorm = deNormalize(y_test,y_ms)
    test_rmse = rmse(y_pred_denorm,y_test_denorm)
    return test_rmse



In [232]:
class MLP(torch.nn.Module):
        def __init__(self, input_size, hidden_size):
            super(MLP, self).__init__()
            self.input_size = input_size
            self.hidden_size  = hidden_size
            self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
            self.relu = torch.nn.ReLU()
            self.fc2 = torch.nn.Linear(self.hidden_size, 1)
#             self.sigmoid = torch.nn.Sigmoid()
        def forward(self, x):
            hidden = self.fc1(x)
            relu = self.relu(hidden)
            output = self.fc2(relu)
#             output = self.sigmoid(output)
            return output
# def fit_model()

torch.manual_seed(0)
def init_weights(m):
    if type(m) == torch.nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        m.bias.data.fill_(0.01)
        
def fit_model(date,cv,data_tuple, n_epochs, normalizer = True, downsize_bool = False, spatial_round = 5, converge_after = 50):
    # Data preprocessing
    
    # forming needed tuples
    train_x, train_y, test_x, test_y = data_tuple    
    if downsize_bool:
        df = downsize(np.hstack([train_x.numpy(), train_y.reshape(-1,1).numpy()]), spatial_round = spatial_round, temporal_round = 15)
        train_original_x, train_original_y = train_x.clone(), train_y.clone()
        train_x, train_y = torch.Tensor(df[:, :3]), torch.Tensor(df[:, 3])
        print("Pre and post downsizing: {}, {}".format(train_original_x.shape, train_x.shape))
    train_x, train_y, test_x, test_y = train_x.to(device),train_y.to(device),test_x.to(device),test_y.to(device)

    train_tuple = train_x, train_y
    test_tuple = test_x, test_y
    # normalizing
    start_time = datetime.now()
    if normalizer:
        train_tuple, test_tuple, x_ms, y_ms = normalize(train_tuple, test_tuple)
        x_train, y_train = train_tuple
        x_test, y_test = test_tuple
    else:
        x_ms = (0,1)
        y_ms = (0, 1)

    ms = (x_ms, y_ms)

    # Model initialization
    model = torch.nn.Sequential(
            torch.nn.Linear(3, 200),
#             torch.nn.BatchNorm1d(10),
            torch.nn.ReLU(),
            torch.nn.Linear(200, 200),
#             torch.nn.BatchNorm1d(10),
            torch.nn.ReLU(),
             torch.nn.Linear(200, 200),
#             torch.nn.BatchNorm1d(20),
            torch.nn.ReLU(),
             torch.nn.Linear(200, 200),
#             torch.nn.BatchNorm1d(20),
            torch.nn.ReLU(),
            torch.nn.Linear(200, 1)
        )
    model.apply(init_weights)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
    decayRate = 0.9995
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer
                                                      , gamma=decayRate)
    
    if downsize_bool:
        train_original_x, train_original_y = train_original_x.to(device), train_original_y.to(device)
        train_original_tuple = (train_original_x, train_original_y)
        train_original_tuple, _, __, ___ = normalize(train_original_tuple, ms = ms)
        train_original_x, train_original_y = train_original_tuple
        train_original_tuple = (train_original_x, train_original_y)          

#     for param in model.parameters():
#         print(param.data)
    #Training
    
    #Transfering model and train test splits to gpu
    model.to(device)
    x_train, y_train, x_test, y_test = x_train.to(device), y_train.to(device), x_test.to(device), y_test.to(device)
    # x_ms, y_ms = x_ms.to(device), y_ms.to(device)


    model.train()
    epoch = n_epochs
    best_train = np.inf # these are to aid us in the selection of the best model
    best_test = np.inf
    best_model = None
    train_error, test_error =[],[]
    best_before = 0
    for epoch in range(epoch):

        optimizer.zero_grad()
        # Forward pass
        y_pred = model(x_train)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_train)
        train_rmse = evaluateRMSE(y_pred.squeeze(),y_train,y_ms)
        y_pred = model(x_test)
        test_rmse = evaluateRMSE(y_pred.squeeze(),y_test,y_ms)
        
        print('Epoch {}: tr : {:.3f} , ts {:.3f} , ls {:.5f}'.format(epoch, train_rmse.item(),
                                                         test_rmse.item(),loss.item()))
        if epoch%10 == 0 and downsize_bool:
            y_pred = model(train_original_x)
            train_original_rmse = evaluateRMSE(y_pred.squeeze(),train_original_y,y_ms)
            print('Train original rmse ', train_original_rmse.item())

        train_error.append(train_rmse.item())
        test_error.append(test_rmse.item())
        # Backward pass
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # saving best model
        model_best = keep_best(best_train, best_test, train_rmse, test_rmse)
        if model_best:
            best_model = copy.deepcopy(model)
            best_train = train_rmse
            best_test = test_rmse
            best_before = 0
        else:
            best_before += 1
        if best_before == converge_after:
            print('Flag')
            break
        
    #Evaluate
    model = copy.deepcopy(best_model)
    #global i
    dictionary ={'train_error': train_error, 'test_error': test_error}
    dataframe = pd.DataFrame(dictionary)
    name = ''+str(date)+'_'+str(cv)
    dataframe.to_csv("output_graphs_mlp/"+str(name)+'.csv')

    
    model.eval()
    y_pred = model(x_train)
    train_rmse = evaluateRMSE(y_pred.squeeze(),y_train,y_ms)
    y_pred = model(x_test)
    test_rmse = evaluateRMSE(y_pred.squeeze(),y_test,y_ms)
    #Timings
    end_time = datetime.now()
    total_time = (end_time-start_time).total_seconds()
    if downsize_bool:
        y_pred = model(train_original_x)
        train_original_rmse = evaluateRMSE(y_pred.squeeze(),train_original_y,y_ms)

        print('Train original rmse ', train_original_rmse.item())
        train_rmse = train_original_rmse
    
    
    return model , train_rmse.item(),test_rmse.item(),total_time, ms
    


In [233]:
all_cols = ["lat", "long", "datetime", "pm10"] 
x_cols = ["lat", "long", "datetime"]
y_cols = ["pm10"]

In [234]:
#dictionary to store RMSE values for particular date
rmse_dict = {}


#these three list will contain traintime,trainRmse and testRMSE value for each cv
all_time = []
all_train = []
all_test = []
n_epochs = 1000




date = "2020-12-24"
print(date)

rmse_dict[date] = {}
variations[date] = {}
cv_dict = train_test_valid(date, "PM Datasets/", hour = None, train_frac = 0.8, valid_cvs = 1, cvs=3)

rmse_dict[date]["time"] = 0
rmse_dict[date]["train_rmse"] =  0
rmse_dict[date]["test_rmse"] = 0


for cv in cv_dict.keys():
    print(f"|| SETCV: {cv}||")
    sub_test = cv_dict[cv]["test"]
    x_test, y_test = sub_test[:, :3], sub_test[:, 3].flatten()
    x_test, y_test = torch.Tensor(x_test), torch.Tensor(y_test)
    for valid_cv in range(len(cv_dict[cv]["train"])):
        print(f"|| CV: {valid_cv}||")
        sub_train = cv_dict[cv]["train"][valid_cv]
        sub_valid = cv_dict[cv]['valid'][valid_cv]
        x_train, y_train = sub_train[:, :3], sub_train[:, 3].flatten()
        x_valid, y_valid = sub_valid[:, :3], sub_valid[:, 3].flatten()
        x_train, y_train = torch.Tensor(x_train), torch.Tensor(y_train)
        x_valid, y_valid = torch.Tensor(x_valid), torch.Tensor(y_valid)

        data_tuple = (x_train, y_train, x_valid, y_valid)

        if valid_cv==0:
            print("Train Size: {}, Validation Size: {}, Test Size: {}".format(data_tuple[0].shape[0], data_tuple[2].shape[0], x_test.shape[0]))

     
        model , train_rmse,test_rmse,total_time, ms = fit_model(date,cv,data_tuple, n_epochs, normalizer = True, downsize_bool = True)

        # test
        x_test, y_test = x_test.to(device), y_test.to(device)
        test_tuple = (x_test, y_test)
        test_tuple, _, x_ms, y_ms = normalize(test_tuple, ms = ms)
        x_test, y_test = test_tuple

        model.eval()
        y_pred = model(x_test)
        test_rmse = evaluateRMSE(y_pred.squeeze(),y_test,y_ms)
        test_rmse = test_rmse.item()
        print("TEST:", test_rmse)
   



        rmse_dict[date]["time"] += total_time/3  # just the train time, spent in fit function
        rmse_dict[date]["train_rmse"] += train_rmse/3
        rmse_dict[date]["test_rmse"] += test_rmse/3


        all_time.append(total_time)
        all_train.append(train_rmse)
        all_test.append(test_rmse)

rmse_dict[date]["time"] = round(rmse_dict[date]["time"], 2)
rmse_dict[date]["train_rmse"] =  round(rmse_dict[date]["train_rmse"], 2)
rmse_dict[date]["test_rmse"] = round(rmse_dict[date]["test_rmse"], 2)



2020-12-24
139526
Total Sets Created: dict_keys([0, 1, 2])
|| SETCV: 0||
|| CV: 0||
Train Size: 5608, Validation Size: 1402, Test Size: 1753
Pre and post downsizing: torch.Size([5608, 3]), torch.Size([5067, 3])
Epoch 0: tr : 55.829 , ts 63.760 , ls 1.05813
Train original rmse  61.065330505371094
Epoch 1: tr : 76.148 , ts 69.908 , ls 1.96849
Epoch 2: tr : 61.090 , ts 72.011 , ls 1.26695
Epoch 3: tr : 46.695 , ts 52.490 , ls 0.74023
Epoch 4: tr : 53.212 , ts 57.313 , ls 0.96125
Epoch 5: tr : 55.909 , ts 62.130 , ls 1.06118
Epoch 6: tr : 48.159 , ts 53.461 , ls 0.78736
Epoch 7: tr : 43.267 , ts 45.682 , ls 0.63553
Epoch 8: tr : 42.441 , ts 44.185 , ls 0.61149
Epoch 9: tr : 42.285 , ts 44.942 , ls 0.60699
Epoch 10: tr : 43.201 , ts 45.834 , ls 0.63359
Train original rmse  43.522281646728516
Epoch 11: tr : 43.696 , ts 45.893 , ls 0.64818
Epoch 12: tr : 41.866 , ts 44.549 , ls 0.59504
Epoch 13: tr : 39.122 , ts 42.955 , ls 0.51959
Epoch 14: tr : 38.316 , ts 42.580 , ls 0.49841
Epoch 15: tr :

Train original rmse  40.996585845947266
Epoch 21: tr : 38.612 , ts 37.346 , ls 0.49945
Epoch 22: tr : 37.574 , ts 36.108 , ls 0.47295
Epoch 23: tr : 37.500 , ts 35.782 , ls 0.47109
Epoch 24: tr : 38.051 , ts 36.142 , ls 0.48504
Epoch 25: tr : 38.673 , ts 36.707 , ls 0.50101
Epoch 26: tr : 38.914 , ts 37.051 , ls 0.50729
Epoch 27: tr : 38.624 , ts 36.995 , ls 0.49976
Epoch 28: tr : 37.965 , ts 36.648 , ls 0.48284
Epoch 29: tr : 37.285 , ts 36.240 , ls 0.46570
Epoch 30: tr : 36.830 , ts 35.974 , ls 0.45440
Train original rmse  38.60698318481445
Epoch 31: tr : 36.679 , ts 35.942 , ls 0.45070
Epoch 32: tr : 36.757 , ts 36.089 , ls 0.45262
Epoch 33: tr : 36.896 , ts 36.297 , ls 0.45605
Epoch 34: tr : 36.938 , ts 36.419 , ls 0.45708
Epoch 35: tr : 36.807 , ts 36.368 , ls 0.45385
Epoch 36: tr : 36.528 , ts 36.143 , ls 0.44698
Epoch 37: tr : 36.203 , ts 35.820 , ls 0.43907
Epoch 38: tr : 35.965 , ts 35.502 , ls 0.43332
Epoch 39: tr : 35.946 , ts 35.275 , ls 0.43285
Epoch 40: tr : 36.046 , ts 3

Epoch 87: tr : 31.819 , ts 35.991 , ls 0.34813
Epoch 88: tr : 31.786 , ts 35.889 , ls 0.34743
Epoch 89: tr : 31.766 , ts 35.914 , ls 0.34699
Epoch 90: tr : 31.737 , ts 35.841 , ls 0.34635
Train original rmse  36.51469802856445
Epoch 91: tr : 31.721 , ts 35.781 , ls 0.34599
Epoch 92: tr : 31.707 , ts 35.888 , ls 0.34571
Epoch 93: tr : 31.697 , ts 35.726 , ls 0.34549
Epoch 94: tr : 31.679 , ts 35.924 , ls 0.34508
Epoch 95: tr : 31.643 , ts 35.797 , ls 0.34431
Epoch 96: tr : 31.615 , ts 35.848 , ls 0.34369
Epoch 97: tr : 31.604 , ts 35.937 , ls 0.34345
Epoch 98: tr : 31.599 , ts 35.775 , ls 0.34333
Epoch 99: tr : 31.587 , ts 36.006 , ls 0.34308
Epoch 100: tr : 31.568 , ts 35.813 , ls 0.34267
Train original rmse  36.51529312133789
Epoch 101: tr : 31.542 , ts 36.037 , ls 0.34210
Epoch 102: tr : 31.513 , ts 35.896 , ls 0.34149
Epoch 103: tr : 31.487 , ts 36.006 , ls 0.34091
Epoch 104: tr : 31.463 , ts 35.917 , ls 0.34040
Epoch 105: tr : 31.448 , ts 35.921 , ls 0.34006
Epoch 106: tr : 31.431 

In [235]:

###average RMSEs and Time for all CV
print('Training time ' + str(rmse_dict[date]['time']))
print('Train_rmse value ' + str(rmse_dict[date]['train_rmse']))
print('Test_rmse value ' + str(rmse_dict[date]['test_rmse']))


Training time 3.56
Train_rmse value 37.23
Test_rmse value 37.58
