In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm
from sklearn.neural_network import MLPRegressor
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
chunksize = 1000000
sliding_window_data = pd.read_csv("sliding_window_data.csv", chunksize=chunksize, iterator=True)

#### Setting Seed value

In [3]:
seed = 3101
np.random.seed(3101)

In [4]:
drop_cols = ["date","time","station"]
for i in range(1,7):
    for j in range(1,7):
        drop_cols.append(f"T{i}S{j}_station number")

In [5]:
regression_model = MLPRegressor(random_state=seed, shuffle=True)
for batch in tqdm(sliding_window_data):
    drop_cols = ["date","time","station"]
    for i in range(1,7):
        for j in range(1,7):
            drop_cols.append(f"T{i}S{j}_station number")
    batch.drop(drop_cols, axis=1, inplace=True)
    batch.fillna(value=0, inplace=True)
    regression_model.partial_fit(batch.iloc[:,1:], batch.iloc[:,0])

16it [22:43, 85.24s/it]


KeyboardInterrupt: 

In [None]:
random_skips = np.random.randint(1,chunksize,size=3101)
test_data = pd.read_csv("sliding_window_data.csv", header=0, nrows=chunksize, skiprows=random_skips)
drop_cols = ["date","time","station"]
for i in range(1,7):
    for j in range(1,7):
        drop_cols.append(f"T{i}S{j}_station number")
test_data.drop(drop_cols, axis=1, inplace=True)
test_data.fillna(value=0, inplace=True)

In [None]:
prediction = regression_model.predict(test_data.iloc[:,1:])

In [None]:
def display_metrics(y_pred,y_true):
    print(f'R2 score is : {sm.r2_score(y_true,y_pred)}')
    print(f'MAE is : {sm.mean_absolute_error(y_true,y_pred)}')
    print(f'MSE is : {sm.mean_squared_error(y_true,y_pred)}')
    
    threshold = 0.0
    temp = pd.concat([pd.DataFrame(prediction, columns=["predict"]),y_true], axis=1)
    temp["predict"].round(decimals=1)
    fn = temp[temp["value"] > threshold]
    fn = fn[fn["predict"] <= threshold ]
    p = y_true[y_true > threshold]
    fnr = len(fn)/len(p)
    recall = 1-fnr
    print(f'Recall is : {recall}')
    print(f'False negative rate is : {fnr}')

    tn = temp[temp["value"] <= threshold]
    tn = tn[tn["predict"] <= threshold ]
    fp = temp[temp["value"] <= threshold]
    fp = fp[fp["predict"] > threshold]
    fpr = len(tn)/(len(tn) + len(fp))
    print(f'False positive rate (FP/FP+TN) is : {fpr}')


In [None]:
display_metrics(prediction,test_data.iloc[:,0])

R2 score is : 0.03403117894629293
MAE is : 0.030919457142454622
MSE is : 0.06720268191664644
Recall is : 0.9926425943783707
False negative rate is : 0.0073574056216293435
False positive rate (FP/FP+TN) is : 0.0035288029539064262


#### Hyper parameter tuning
Training on a subset of the data for faster performance

In [8]:
sample_size = 1000000
sample_data = pd.read_csv("sliding_window_data.csv", header=0, nrows=sample_size, skiprows=range(1,3101888))

In [9]:
drop_cols = ["date","time","station"]
for i in range(1,7):
    for j in range(1,7):
        drop_cols.append(f"T{i}S{j}_station number")
sample_data.drop(drop_cols, axis=1, inplace=True)
sample_data.fillna(value=0, inplace=True)

In [10]:
train, test = train_test_split(sample_data, test_size=0.2, random_state=seed, shuffle=True)

In [11]:
train.iloc[:,1:].values

array([[-3.500000e+01,  1.176500e-04,  0.000000e+00, ..., -6.000000e+01,
         0.000000e+00,  0.000000e+00],
       [-3.500000e+01,  1.085396e-04,  0.000000e+00, ..., -6.000000e+01,
         0.000000e+00,  0.000000e+00],
       [-3.500000e+01,  1.176500e-04,  0.000000e+00, ..., -6.000000e+01,
         0.000000e+00,  0.000000e+00],
       ...,
       [-3.500000e+01,  8.021000e-05,  0.000000e+00, ..., -6.000000e+01,
         0.000000e+00,  0.000000e+00],
       [-3.500000e+01,  1.085396e-04,  0.000000e+00, ..., -6.000000e+01,
         0.000000e+00,  0.000000e+00],
       [-3.500000e+01,  1.176500e-04,  0.000000e+00, ..., -6.000000e+01,
         0.000000e+00,  0.000000e+00]])

In [12]:
mlpr = MLPRegressor(random_state=seed, shuffle=True)
param_list = {"hidden_layer_sizes": [(64,),(32,), (16,), (32,64), (16,32), (128,)], "activation": ["logistic", "relu"],
              "solver": ["sgd", "adam"]}

gridCV = GridSearchCV(estimator=mlpr, param_grid=param_list, verbose=10, return_train_score=True, n_jobs=3, refit=True)
gridCV.fit(train.iloc[:,1:].values, train.iloc[:,0].values)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [15]:
gridCV.cv_results_

{'mean_fit_time': array([ 174.67565598,  161.4877439 ,  175.65035949,  116.5701314 ,
         119.62938151,   47.94330602,  129.71486425,  139.04523106,
          71.09157314,   64.89616046,  153.55541716,  179.73648434,
         840.23184481,  584.70978827,  600.27960057,  428.92120748,
         510.81619568,   49.19486008,  942.15984325,  263.40833902,
          38.27595077,  137.87281265, 1147.1403965 ,  713.14008183]),
 'std_fit_time': array([1.11274859e+01, 1.41429465e+01, 4.91472794e+01, 9.39660073e+00,
        1.56367907e+01, 8.42186694e+00, 4.32783196e+00, 1.41824743e+01,
        3.49712551e+00, 1.40794710e+00, 4.50001439e+00, 1.66632470e+01,
        3.44349174e+01, 3.19805452e+01, 2.93086729e+01, 1.30732169e+01,
        1.43024580e+01, 1.72301977e+00, 4.38147313e+02, 3.51607835e+01,
        2.35853914e-01, 4.52815057e+01, 1.04614027e+01, 3.51501360e+01]),
 'mean_score_time': array([1.21868005, 1.38646202, 0.70611916, 0.69608545, 0.34127231,
        0.21599698, 1.50089884, 1.09

In [14]:
gridCV.best_params_

{'activation': 'relu', 'hidden_layer_sizes': (32,), 'solver': 'adam'}

In [None]:
chunksize = 1000000
sliding_window_data = pd.read_csv("sliding_window_data.csv", chunksize=chunksize, iterator=True)

In [None]:
regression_model_hyper = MLPRegressor(activation="relu", hidden_layer_sizes=(32,), solver="adam", random_state=seed, shuffle=True)
for batch in tqdm(sliding_window_data):
    drop_cols = ["date","time","station"]
    for i in range(1,7):
        for j in range(1,7):
            drop_cols.append(f"T{i}S{j}_station number")
    batch.drop(drop_cols, axis=1, inplace=True)
    batch.fillna(value=0, inplace=True)
    regression_model_hyper.partial_fit(batch.iloc[:,1:], batch.iloc[:,0])