In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import sklearn.metrics as skm
import pandas as pd

# Read the sliding-window dataset, capped at MAX_ROWS rows
CSV_PATH = 'sliding_window_data.csv'
MAX_ROWS = 1000000
data = pd.read_csv(CSV_PATH, nrows=MAX_ROWS)

In [2]:
# Replace every missing value in the frame with 0
data = data.fillna(value=0)

In [6]:
# Display the frame; the notebook renders a truncated preview (rows and columns elided with "...")
data

Unnamed: 0,date,time,station,value,T1S1_time,T1S1_station number,T1S1_dist,T1S1_value,T1S2_time,T1S2_station number,...,T6S4_dist,T6S4_value,T6S5_time,T6S5_station number,T6S5_dist,T6S5_value,T6S6_time,T6S6_station number,T6S6_dist,T6S6_value
0,2023-02-03,00:10:00,S77,0,-35,S223,0.000135,0,-35,S222,...,0.000434,0,-60,S79,0.000655,0,-60,S77,0,0
1,2023-02-03,00:15:00,S77,0,-35,S223,0.000135,0,-35,S222,...,0.000434,0,-60,S79,0.000655,0,-60,S77,0,0
2,2023-02-03,00:20:00,S77,0,-35,S223,0.000135,0,-35,S222,...,0.000434,0,-60,S79,0.000655,0,-60,S77,0,0
3,2023-02-03,00:25:00,S77,0,-35,S223,0.000135,0,-35,S222,...,0.000434,0,-60,S79,0.000655,0,-60,S77,0,0
4,2023-02-03,00:30:00,S77,0,-35,S223,0.000135,0,-35,S222,...,0.000434,0,-60,S79,0.000655,0,-60,S77,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2020-04-07,21:50:00,S90,0,-35,S120,0.000109,0,-35,S213,...,0.000579,0,-60,S223,0.000642,0,-60,S90,0,0
999996,2020-04-07,21:55:00,S90,0,-35,S120,0.000109,0,-35,S213,...,0.000579,0,-60,S223,0.000642,0,-60,S90,0,0
999997,2020-04-07,22:00:00,S90,0,-35,S120,0.000109,0,-35,S213,...,0.000579,0,-60,S223,0.000642,0,-60,S90,0,0
999998,2020-04-07,22:05:00,S90,0,-35,S120,0.000109,0,-35,S213,...,0.000579,0,-60,S223,0.000642,0,-60,S90,0,0


In [4]:
# Names of the 36 T<t>S<s>_time columns (t and s each run 1..6), in T-major order.
# These are the time variables whose dtype is changed below.
cols_to_change = [f'T{t}S{s}_time' for t in range(1, 7) for s in range(1, 7)]

# Cast the selected time columns to object dtype; all other columns keep their dtype.
# NOTE(review): object dtype is not a pandas categorical ("factor") - confirm this is the intended encoding.
data = data.astype({col: object for col in cols_to_change})

In [5]:
# Columns to binarize: 'value' plus every T<t>S<s>_value column (t and s each run 1..6)
cols_to_modify = ['value'] + [f'T{t}S{s}_value' for t in range(1, 7) for s in range(1, 7)]
# Binarize every column in cols_to_modify: 1 where the value is > 0, else 0.
# One vectorized comparison replaces the per-element Python list build, which
# was repeated for each column; the explicit int64 cast matches the dtype
# pandas infers from a list of Python ints.
data[cols_to_modify] = (data[cols_to_modify] > 0).astype('int64')

In [5]:
#data[cols_to_modify] = data[cols_to_modify].astype(object)

In [7]:
# Feature matrix (with the origin station data): drop the date/time/station
# identifiers, the target column, and all 36 'T<t>S<s>_station number' columns.
station_cols = [f'T{t}S{s}_station number' for t in range(1, 7) for s in range(1, 7)]
non_feature_cols = ['date', 'time', 'station', 'value'] + station_cols
X = data.drop(columns=non_feature_cols)  # Input features
y = data['value']  # Target variable
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [8]:

#X = pd.get_dummies(X) # One-hot encoding
# or
#from sklearn.preprocessing import LabelEncoder
#le = LabelEncoder()
#y = le.fit_transform(y)

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [19]:
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): the variable is named rf_100 but the forest is built with 40
# trees, and the classifier imported above is not used in this cell.
rf_params = {'n_estimators': 40, 'random_state': 42}
rf_100 = RandomForestRegressor(**rf_params)
# Kept as a separate last expression so the fitted estimator is echoed as the cell output
rf_100.fit(X_train, y_train)

RandomForestRegressor(n_estimators=40, random_state=42)

In [20]:
# Score the held-out rows; the regressor returns real-valued predictions, not class labels
y_pred = rf_100.predict(X_test)

In [19]:
#skm.confusion_matrix(y_pred, y_test)
#1-skm.recall_score(y_pred, y_test)

array([[386520,   8207],
       [  2174,   3099]])

In [21]:
# Pair each prediction with its true label; the index comes from y_test
result = pd.DataFrame({'predicted': y_pred, 'actual': y_test})
# Show only the rows whose predicted score exceeds 0.4
result.loc[result['predicted'].gt(0.4)]

Unnamed: 0,predicted,actual
974907,0.439584,0
320111,0.567857,0
354251,0.600000,1
112771,0.539899,0
524,0.450000,0
...,...,...
715181,0.550000,1
290068,0.542642,0
638295,0.525000,0
179257,0.447799,0
