In [231]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [232]:
label = 'COND_mS_m'
station = 'H033402'

columns = ['Mg_meq_L', 'TOTP_mg_L', 'pH', 'TEMP_°C', 'TSS_mg_L', 'DO_mg_L',
           'NH4N_mg_L', 'SO4_meq_L', 'Ca_meq_L', 'Cl_meq_L', 'NO32_mg_L', 'ALK_meq_L', label]
columns = list(set(columns))

train_path = '../data/train/'
test_path = '../data/test/'
train_files = os.listdir(train_path)
test_files = os.listdir(test_path)
train_df = pd.concat(
    [pd.read_csv(os.path.join(train_path, f)) for f in train_files])
test_df = pd.concat(
    [pd.read_csv(os.path.join(test_path, f)) for f in test_files])


In [233]:
station_train_df = train_df[train_df['STATID'] == station]
station_train_df

Unnamed: 0,MC,STATID,SDATE,Y,M,D,TEMP_°C,pH,TSS_mg_L,COND_mS_m,...,Na_meq_L,K_meq_L,ALK_meq_L,Cl_meq_L,SO4_meq_L,NO32_mg_L,NH4N_mg_L,TOTP_mg_L,DO_mg_L,CODMN_mg_L
0,Cambodia,H033402,2005-12-26,2005,12,26,27.8,6.35,42.00,8.50,...,0.292,0.043,0.757,0.088,0.144,0.089,0.056,0.022,6.581,2.002
1,Cambodia,H033402,2001-12-21,2001,12,21,29.6,7.70,12.00,9.56,...,0.184,0.035,0.768,0.040,0.149,0.153,0.002,0.014,5.700,4.320
2,Cambodia,H033402,2003-09-28,2003,9,28,29.0,6.22,110.00,10.49,...,0.180,0.027,0.746,0.022,0.232,0.130,0.002,0.088,6.000,1.400
3,Cambodia,H033402,2013-08-23,2013,8,23,29.5,7.42,52.00,10.49,...,0.173,0.038,0.765,0.121,0.181,0.318,0.066,0.200,7.290,1.566
4,Cambodia,H033402,2012-10-22,2012,10,22,31.0,6.75,88.50,10.07,...,0.254,0.022,0.729,0.103,0.209,0.033,0.082,0.010,6.370,2.853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,Cambodia,H033402,2006-07-19,2006,7,19,29.5,6.85,113.00,8.87,...,0.237,0.034,0.723,0.123,0.101,0.163,0.018,0.223,7.782,4.899
104,Cambodia,H033402,2003-02-22,2003,2,22,30.0,7.02,3.33,10.17,...,0.240,0.044,0.852,0.031,0.139,0.144,0.002,0.001,7.304,2.522
105,Cambodia,H033402,2004-05-19,2004,5,19,30.2,7.10,2.00,16.94,...,0.520,0.051,1.140,0.280,0.316,0.032,0.041,0.061,9.327,4.000
106,Cambodia,H033402,1999-10-14,1999,10,14,30.4,6.67,114.00,10.54,...,0.240,0.030,0.855,0.066,0.066,0.430,0.030,0.010,5.300,3.448


In [234]:
# remove rows where STATID is not station
train_df = train_df[train_df['STATID'] != station]

In [235]:
test_df = test_df[test_df['STATID'] == station]


train_df = train_df[columns]
test_df = test_df[columns]

X_train = train_df.drop(columns=[label])
y_train = train_df[label]
X_test = test_df.drop(columns=[label])
y_test = test_df[label]

In [237]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [238]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [239]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [240]:
r2_score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'R2 score: {r2_score}')
print(f'MSE: {mse}')

R2 score: 0.8738755847991352
MSE: 1.2287587092222216


In [241]:
# get the pre trained model to fit the train data of that station
station_train_df = station_train_df[columns]
X_station_train = station_train_df.drop(columns=[label])
y_station_train = station_train_df[label]

model.fit(X_station_train, y_station_train)

In [242]:
y_pred = model.predict(X_test)
y_test = y_test.values



In [243]:
y_test, y_pred

(array([15.44 ,  8.37 , 16.5  ,  8.9  , 11.25 , 10.2  ,  9.78 ,  9.2  ,
         9.34 , 13.28 ,  8.483,  8.54 , 16.52 , 12.4  , 13.48 ,  9.2  ,
         7.   , 10.6  , 13.8  , 12.9  , 11.67 , 16.6  ,  8.27 , 10.3  ,
        17.   , 17.6  ,  9.86 ]),
 array([10.3557 ,  7.82788,  9.03448,  7.8194 ,  7.9317 ,  7.82788,
         7.82898,  7.92534,  7.92534,  9.83314,  7.92856,  7.9271 ,
        10.665  ,  7.92534,  9.89104,  7.92534,  7.8194 ,  7.82788,
         9.86846,  7.82788,  7.82776,  9.05996,  7.82788,  9.83928,
        10.9488 , 11.00984,  7.89638]))

In [245]:
def r2_score(y_true, y_pred):
    y_mean = np.mean(y_true)
    ss_tot = np.sum((y_true - y_mean)**2)
    ss_res = np.sum((y_true - y_pred)**2)
    return 1 - ss_res/ss_tot

In [246]:
r2_score(y_test, y_pred)

-0.285240895175201