In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.linear_model import SGDRegressor, PassiveAggressiveRegressor
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm
from sklearn.neural_network import MLPRegressor
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
chunksize = 1000000
sliding_window_data = pd.read_csv("sliding_window_data.csv", usecols = [], chunksize=chunksize, iterator=True)

#### Setting Seed value

In [3]:
seed = 3101
np.random.seed(3101)

In [4]:
# train_data, test_data = train_test_split(sliding_window_data, test_size=0.2, random_state=seed)

In [5]:
# sliding_window_data.dtypes

In [15]:
drop_cols = ["date","time","station"]
for i in range(1,7):
    for j in range(1,7):
        drop_cols.append(f"T{i}S{j}_station number")

In [8]:
regression_model = SGDRegressor(random_state=seed, shuffle=True)
for batch in tqdm(sliding_window_data):
    drop_cols = ["date","time","station"]
    for i in range(1,7):
        for j in range(1,7):
            drop_cols.append(f"T{i}S{j}_station number")
    batch.drop(drop_cols, axis=1, inplace=True)
    batch.fillna(value=0, inplace=True)
    regression_model.partial_fit(batch.iloc[:,1:], batch.iloc[:,0])

24it [07:21, 18.41s/it]


In [14]:
random_skips = np.random.randint(1,chunksize,size=3101)
test_data = pd.read_csv("sliding_window_data.csv", header=0, nrows=chunksize, skiprows=random_skips)
drop_cols = ["date","time","station"]
for i in range(1,7):
    for j in range(1,7):
        drop_cols.append(f"T{i}S{j}_station number")
test_data.drop(drop_cols, axis=1, inplace=True)
test_data.fillna(value=0, inplace=True)

In [15]:
prediction = regression_model.predict(test_data.iloc[:,1:])

In [33]:
print()

             predict
0       4.123007e+12
1       4.123007e+12
2       4.123007e+12
3       4.123007e+12
4       4.123007e+12
...              ...
999995  4.501534e+12
999996  4.398269e+12
999997  4.143803e+12
999998  4.335334e+12
999999  4.316067e+12

[1000000 rows x 1 columns]


In [43]:
def display_metrics(y_pred,y_true):
    print(f'R2 score is : {sm.r2_score(y_true,y_pred)}')
    print(f'MAE is : {sm.mean_absolute_error(y_true,y_pred)}')
    print(f'MSE is : {sm.mean_squared_error(y_true,y_pred)}')
    
    threshold = 0.25
    temp = pd.concat([pd.DataFrame(prediction, columns=["predict"]),y_true], axis=1)
    fn = temp[temp["value"] > threshold]
    fn = fn[fn["predict"] <= threshold ]
    p = y_pred[y_pred > 0.25]
    fnr = len(fn)/len(p)
    recall = 1-fnr
    print(f'Recall is : {recall}')
    print(f'False negative rate is : {fnr}')

    tp = temp[temp["value"] > threshold]
    tp = tp[tp["predict"] > threshold ]
    fp = temp[temp["value"] <= threshold]
    fp = fp[fp["predict"] > threshold]
    precision = len(tp)/(len(tp) + len(fp))
    fdr = 1-precision
    print(f'Precision is : {precision}')
    print(f'False discovery rate (FP/FP+TP) is : {fdr}')


In [44]:
display_metrics(prediction,test_data.iloc[:,0])

R2 score is : -2.441630395623924e+26
MAE is : 4116997104511.3364
MSE is : 1.6966181490970053e+25
Recall is : 0.9999929998599972
False negative rate is : 7.000140002800056e-06
Precision is : 0.01384227684553691
False discovery rate (FP/FP+TP) is : 0.986157723154463
