In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm
from sklearn.model_selection import GridSearchCV
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


#### Train Test split
Total 25 million rows <br>
Split 0.8 Train : 0.2 Test

In [3]:
test_index = range(10_000_000,15_000_000+1)

In [4]:
chunksize = 1000000
sliding_window_data = pd.read_csv("sliding_window_data.csv", chunksize=chunksize, iterator=True, skiprows=test_index)

#### Setting Seed value

In [5]:
seed = 3101
np.random.seed(3101)

In [6]:
regression_model = PassiveAggressiveRegressor(random_state=seed, shuffle=True)
for batch in tqdm(sliding_window_data):
    drop_cols = ["date","time","station"]
    for i in range(1,7):
        for j in range(1,7):
            drop_cols.append(f"T{i}S{j}_station number")
    batch.drop(drop_cols, axis=1, inplace=True)
    batch.fillna(value=0, inplace=True)
    regression_model.partial_fit(batch.iloc[:,1:], batch.iloc[:,0])

21it [07:32, 21.55s/it]


In [7]:
def display_metrics(y_pred,y_true):
    print(f'R2 score is : {sm.r2_score(y_true,y_pred)}')
    print(f'MAE is : {sm.mean_absolute_error(y_true,y_pred)}')
    print(f'MSE is : {sm.mean_squared_error(y_true,y_pred)}')
    
    threshold = 0.0
    temp = pd.concat([pd.DataFrame(y_pred, columns=["predict"]),y_true], axis=1)
    temp["predict"].round(decimals=1)
    fn = temp[temp["value"] > threshold]
    fn = fn[fn["predict"] <= threshold ]
    p = y_true[y_true > threshold]
    fnr = len(fn)/len(p)
    recall = 1-fnr
    print(f'Recall is : {recall}')
    print(f'False negative rate is : {fnr}')

    tn = temp[temp["value"] <= threshold]
    tn = tn[tn["predict"] <= threshold ]
    fp = temp[temp["value"] <= threshold]
    fp = fp[fp["predict"] > threshold]
    fpr = len(tn)/(len(tn) + len(fp))
    print(f'False positive rate (FP/FP+TN) is : {fpr}')


In [8]:
test_dataset = pd.read_csv("sliding_window_data.csv", header=0, nrows=5_000_000, skiprows=range(1,10_000_000))
drop_cols = ["date","time","station"]
for i in range(1,7):
    for j in range(1,7):
        drop_cols.append(f"T{i}S{j}_station number")
test_dataset.drop(drop_cols, axis=1, inplace=True)
test_dataset.fillna(value=0, inplace=True)

In [9]:
prediction_base = regression_model.predict(test_dataset.iloc[:,1:])

In [14]:
display_metrics(prediction_base,test_dataset.iloc[:,0])

R2 score is : -0.021399897507405896
MAE is : 0.1137264995162555
MSE is : 0.07751863237041332
Recall is : 0.9979424016345676
False negative rate is : 0.0020575983654323475
False positive rate (FP/FP+TN) is : 0.0003951859317922659


#### Hyper parameter tuning

In [46]:
sample_size = 2000000
sample_data = pd.read_csv("sliding_window_data.csv", header=0, nrows=sample_size, skiprows=range(1,3101888))

In [47]:
drop_cols = ["date","time","station"]
for i in range(1,7):
    for j in range(1,7):
        drop_cols.append(f"T{i}S{j}_station number")
sample_data.drop(drop_cols, axis=1, inplace=True)
sample_data.fillna(value=0, inplace=True)

In [48]:
train, test = train_test_split(sample_data, test_size=0.2, random_state=seed, shuffle=True)

In [49]:
train.iloc[:,0].values

array([0., 0., 0., ..., 0., 0., 0.])

In [50]:
par = PassiveAggressiveRegressor(random_state=seed, shuffle=True)
param_list = {"C":[0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0]}

gridCV = GridSearchCV(estimator=par, param_grid=param_list, verbose=10, return_train_score=True, n_jobs=10, refit=True)
gridCV.fit(train.iloc[:,1:].values, train.iloc[:,0].values)


Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [51]:
gridCV.cv_results_

{'mean_fit_time': array([140.00676675, 232.16033106,  41.2871768 ,  32.2199295 ,
         27.02188406,  21.62916765,  14.64967709]),
 'std_fit_time': array([73.0610917 , 60.85381696,  3.82128928, 10.76089808,  5.13222943,
         5.31280306,  5.2167164 ]),
 'mean_score_time': array([4.43432217, 3.07508221, 0.42871757, 0.70129609, 2.45876045,
        1.4311995 , 0.19010077]),
 'std_score_time': array([3.03711246, 2.7261403 , 0.12104769, 0.52798516, 3.71874702,
        2.08347441, 0.01836753]),
 'param_C': masked_array(data=[0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0],
              mask=[False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.01},
  {'C': 0.05},
  {'C': 0.1},
  {'C': 0.5},
  {'C': 1.0},
  {'C': 1.5},
  {'C': 2.0}],
 'split0_test_score': array([-0.09911644, -0.09911644, -0.09911644, -0.09911644, -0.09911644,
        -0.09911644, -0.09911644]),
 'split1_test_score': array([0.0013623, 0.0013623, 0.0013623, 0.00136

In [54]:
gridCV.best_params_

{'C': 0.01}

#### Fitting a new Passive Aggressive Regressor with the best params from hyperparamater tuning

#### Re-generate train dataset iterator

In [11]:
sliding_window_data = pd.read_csv("sliding_window_data.csv", chunksize=chunksize, iterator=True, skiprows=test_index)

In [12]:
regression_model_hyper = PassiveAggressiveRegressor(C=0.01,random_state=seed, shuffle=True)
for batch in tqdm(sliding_window_data):
    drop_cols = ["date","time","station"]
    for i in range(1,7):
        for j in range(1,7):
            drop_cols.append(f"T{i}S{j}_station number")
    batch.drop(drop_cols, axis=1, inplace=True)
    batch.fillna(value=0, inplace=True)
    regression_model_hyper.partial_fit(batch.iloc[:,1:], batch.iloc[:,0])

21it [08:52, 25.35s/it]


In [13]:
prediction_hyper = regression_model_hyper.predict(test_dataset.iloc[:,1:])
display_metrics(prediction_hyper,test_dataset.iloc[:,0])

R2 score is : -0.021399897507405896
MAE is : 0.1137264995162555
MSE is : 0.07751863237041332
Recall is : 0.9979424016345676
False negative rate is : 0.0020575983654323475
False positive rate (FP/FP+TN) is : 0.0003951859317922659
