In [72]:
import pandas as pd
import numpy as np
import pyarrow
from sklearn.model_selection import train_test_split

#### Reading in the data from S3

In [None]:
import boto3
import pandas as pd

BUCKET = "bucket_name"
PREFIX = "prefix_name"

# Columns removed from the raw data before any analysis.
DROPPED_COLUMNS = ["snow", "wpgt", "tsun", "coco", "wdir"]

# Row filter bounds. NOTE(review): presumably plausibility limits for wind
# speed and pressure; the units are not stated in this notebook -- confirm.
MAX_WSPD = 156
MIN_PRES = 977

s3 = boto3.client("s3")

# A single list_objects_v2 call returns at most 1,000 keys, so walk every page
# rather than reading only the first response.
paginator = s3.get_paginator("list_objects_v2")
keys = [
    obj["Key"]
    for page in paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
    # "Contents" is absent from the response when nothing matches the prefix.
    for obj in page.get("Contents", [])
    if obj["Key"].endswith(".csv")
]

# Fail with a clear message instead of an opaque error from pd.concat.
if not keys:
    raise FileNotFoundError(f"No .csv objects found under s3://{BUCKET}/{PREFIX}")

# Read every CSV and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat(
    (pd.read_csv(f"s3://{BUCKET}/{key}") for key in keys),
    ignore_index=True,
)

df = df.drop(columns=DROPPED_COLUMNS)

# Comparisons against NaN evaluate to False, so rows with a missing wspd or
# pres are removed by this filter as well.
df = df[(df["wspd"] <= MAX_WSPD) & (df["pres"] >= MIN_PRES)].copy()
df.head()

Unnamed: 0,time,station_name,station_lat,station_lon,station_elevation,temp,dwpt,rhum,prcp,wspd,pres
1324,2000-01-01 17:00:00,Hayward / Russell City,37.6589,-122.1218,16.0,10.0,7.1,82.0,,7.6,1020.4
1325,2000-01-01 20:00:00,Hayward / Russell City,37.6589,-122.1218,16.0,11.0,5.9,71.0,,13.0,1020.4
1326,2000-01-02 17:00:00,Hayward / Russell City,37.6589,-122.1218,16.0,6.0,0.9,70.0,,9.4,1027.7
1327,2000-01-04 12:00:00,Hayward / Russell City,37.6589,-122.1218,16.0,5.0,2.0,81.0,,7.6,1032.0
1328,2000-01-04 17:00:00,Hayward / Russell City,37.6589,-122.1218,16.0,8.0,4.0,76.0,,0.0,1033.5


In [74]:
# Summary statistics restricted to the weather measurement columns.
measurement_columns = ["temp", "dwpt", "rhum", "prcp", "wspd", "pres"]
df.loc[:, measurement_columns].describe()

Unnamed: 0,temp,dwpt,rhum,prcp,wspd,pres
count,2942604.0,2941598.0,2941634.0,2594962.0,2943424.0,2943424.0
mean,14.93292,8.687834,69.72833,0.05249518,11.86738,1016.659
std,5.663648,4.355639,18.79068,0.3835878,9.299329,4.906063
min,-18.0,-45.0,0.0,0.0,0.0,977.0
25%,11.1,6.1,58.0,0.0,5.4,1013.3
50%,14.4,9.4,73.0,0.0,11.0,1016.2
75%,17.8,12.0,83.0,0.0,17.0,1019.9
max,45.2,27.0,100.0,24.4,143.0,1036.5


In [75]:
# Number of missing values in each column of the filtered frame.
missing_per_column = df.isna().sum()
missing_per_column

time                      0
station_name              0
station_lat               0
station_lon               0
station_elevation         0
temp                    820
dwpt                   1826
rhum                   1790
prcp                 348462
wspd                      0
pres                      0
dtype: int64

#### Dividing the data into training and test datasets

In [76]:
# Work on a de-duplicated copy so the filtered frame `df` stays untouched.
prcp_df = df.drop_duplicates().copy()
prcp_df["time"] = pd.to_datetime(prcp_df["time"])

# Chronological split: everything up to and including 2022 is training data,
# later observations form the test set.
observation_year = prcp_df["time"].dt.year
prcp_train = prcp_df.loc[observation_year <= 2022].copy()
prcp_test = prcp_df.loc[observation_year > 2022].copy()

test_share = len(prcp_test) / (len(prcp_train) + len(prcp_test)) * 100
print(f"The length of the test dataset is {test_share} % of the entire dataset")

The length of the test dataset is 22.99260996716749 % of the entire dataset


#### Imputation of the train set with historical monthly means

In [77]:
prcp_train["month"] = prcp_train["time"].dt.month

for feature in ["temp", "dwpt", "rhum", "prcp", "wspd", "pres"]:
    # Mean of the feature over all training rows in the same calendar month,
    # broadcast back to every row (missing values are ignored by the mean).
    monthly_mean = prcp_train.groupby(["month"])[feature].transform("mean")

    # Keep the value used for filling so the test set can reuse it later.
    prcp_train[f"{feature}_inputation_val"] = monthly_mean

    # Only missing entries are replaced; observed values stay unchanged.
    prcp_train[feature] = prcp_train[feature].fillna(monthly_mean)

#### Imputation of the test set with the train set's historical monthly means

In [78]:
imputed_features = ["temp", "dwpt", "rhum", "prcp", "wspd", "pres"]
lookup_columns = [f"{feature}_inputation_val" for feature in imputed_features]

# The train set carries the same monthly mean on every row of a month, so
# de-duplicating yields a small month -> fill-value lookup table.
test_inputation = prcp_train[["month"] + lookup_columns].drop_duplicates().copy()

prcp_test["month"] = prcp_test["time"].dt.month

# Attach the training-set monthly means to each test row (the left merge keeps
# every test row and gives the result a fresh 0..n-1 index).
prcp_test = prcp_test.merge(test_inputation, on="month", how="left")

# Fill gaps in the test set using statistics learned from the train set only.
for feature, lookup in zip(imputed_features, lookup_columns):
    prcp_test[feature] = prcp_test[feature].fillna(prcp_test[lookup])

# The helper columns are no longer needed once the gaps are filled.
prcp_test = prcp_test.drop(columns=lookup_columns)

In [79]:
# Remove the per-feature fill-value helper columns from the train set.
helper_columns = [
    f"{feature}_inputation_val"
    for feature in ["temp", "dwpt", "rhum", "prcp", "wspd", "pres"]
]
prcp_train = prcp_train.drop(columns=helper_columns)

In [None]:
import boto3

bucket = "bucket_name"
s3 = boto3.client("s3")

# Train and test stacked row-wise into a single cleaned frame.
train_test_clean = pd.concat([prcp_train, prcp_test], axis=0)

# (frame, local scratch file, destination key) for each artifact, in upload order.
uploads = [
    (prcp_train, "/tmp/train.csv", "train_test/train.csv"),
    (prcp_test, "/tmp/test.csv", "train_test/test.csv"),
    (train_test_clean, "/tmp/train_test_clean.csv", "train_test/train_test_clean.csv"),
]

for frame, local_path, key in uploads:
    # Write the CSV locally without the index column, then push it to S3.
    frame.to_csv(local_path, index=False)
    s3.upload_file(local_path, bucket, key)



#### Fitting a Random Forest Model

#### Preparing training data for Random Forest model

In [None]:
# Load the imputed training split and count any remaining missing values.
train = pd.read_csv("train_address")
train_missing_counts = train.isna().sum()
train_missing_counts

time                 0
station_name         0
station_lat          0
station_lon          0
station_elevation    0
temp                 0
dwpt                 0
rhum                 0
prcp                 0
wspd                 0
pres                 0
month                0
dtype: int64

In [None]:
# Order observations by station and timestamp, then parse the timestamps.
train = train.sort_values(by=["station_name", "time"])
train["time"] = pd.to_datetime(train["time"])

# Timestamp one hour ahead; its temperature becomes the prediction target.
train["time_plus_1"] = train["time"] + pd.Timedelta(hours=1)

# (station, timestamp) -> temperature lookup, renamed so it joins directly on
# the one-hour-ahead timestamp and brings in the target column.
next_hour_temp_train = train[["time", "station_name", "temp"]].rename(
    columns={"time": "time_plus_1", "temp": "future_temp"}
)
train = train.merge(next_hour_temp_train, on=["station_name", "time_plus_1"], how="left")
train = train.drop(columns=["time_plus_1"])

# Keep only rows whose station has an observation exactly one hour later.
train = train.dropna(subset=["future_temp"])

model_columns = ["station_elevation", "month", "temp", "dwpt", "rhum", "prcp", "wspd", "pres", "future_temp"]
train_red = train[model_columns].copy()

Unnamed: 0,station_elevation,month,temp,dwpt,rhum,prcp,wspd,pres,future_temp
0,24.0,1,12.0,6.1,67.0,0.093955,6.8,1020.9,13.0
1,24.0,1,13.0,5.2,59.0,0.000000,7.9,1020.7,13.0
2,24.0,1,13.0,5.2,59.0,0.000000,4.0,1021.4,8.0
3,24.0,1,8.0,5.1,82.0,0.000000,1.8,1022.0,6.0
4,24.0,1,6.0,4.0,87.0,0.000000,5.0,1022.2,6.0
...,...,...,...,...,...,...,...,...,...
1133321,19.0,12,15.6,13.3,86.0,0.164267,20.5,1008.6,15.6
1133322,19.0,12,15.6,12.7,83.0,1.300000,22.3,1008.2,15.0
1133323,19.0,12,15.0,12.9,87.0,0.800000,14.8,1004.9,15.0
1133324,19.0,12,15.0,12.9,87.0,0.500000,18.4,1002.9,15.0


#### Preparing test data for Random Forest model

In [None]:
# Load the imputed test split and count any remaining missing values.
test = pd.read_csv("test_address")
test_missing_counts = test.isna().sum()
test_missing_counts

time                 0
station_name         0
station_lat          0
station_lon          0
station_elevation    0
temp                 0
dwpt                 0
rhum                 0
prcp                 0
wspd                 0
pres                 0
month                0
dtype: int64

In [None]:
# Order observations by station and timestamp, then parse the timestamps.
test = test.sort_values(by=["station_name", "time"])
test["time"] = pd.to_datetime(test["time"])

# Timestamp one hour ahead; its temperature becomes the prediction target.
test["time_plus_1"] = test["time"] + pd.Timedelta(hours=1)

# (station, timestamp) -> temperature lookup, renamed so it joins directly on
# the one-hour-ahead timestamp and brings in the target column.
next_hour_temp_test = test[["time", "station_name", "temp"]].rename(
    columns={"time": "time_plus_1", "temp": "future_temp"}
)
test = test.merge(next_hour_temp_test, on=["station_name", "time_plus_1"], how="left")
test = test.drop(columns=["time_plus_1"])

# Keep only rows whose station has an observation exactly one hour later.
test = test.dropna(subset=["future_temp"])

model_columns = ["station_elevation", "month", "temp", "dwpt", "rhum", "prcp", "wspd", "pres", "future_temp"]
test_red = test[model_columns].copy()

Unnamed: 0,station_elevation,month,temp,dwpt,rhum,prcp,wspd,pres,future_temp
0,24.0,1,12.0,6.1,67.0,0.093955,6.8,1020.9,13.0
1,24.0,1,13.0,5.2,59.0,0.000000,7.9,1020.7,13.0
2,24.0,1,13.0,5.2,59.0,0.000000,4.0,1021.4,8.0
3,24.0,1,8.0,5.1,82.0,0.000000,1.8,1022.0,6.0
4,24.0,1,6.0,4.0,87.0,0.000000,5.0,1022.2,6.0
...,...,...,...,...,...,...,...,...,...
1133321,19.0,12,15.6,13.3,86.0,0.164267,20.5,1008.6,15.6
1133322,19.0,12,15.6,12.7,83.0,1.300000,22.3,1008.2,15.0
1133323,19.0,12,15.0,12.9,87.0,0.800000,14.8,1004.9,15.0
1133324,19.0,12,15.0,12.9,87.0,0.500000,18.4,1002.9,15.0


#### Fitting model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Predict the temperature one hour ahead from the current observation.
target = "future_temp"
features = ["station_elevation", "month", "temp", "dwpt", "rhum", "prcp", "wspd", "pres"]

X_train, y_train = train_red[features], train_red[target]
X_test, y_test = test_red[features], test_red[target]

# Fixed seed for reproducible trees; n_jobs=-1 uses every available core.
rf = RandomForestRegressor(n_jobs=-1, random_state=42, verbose=0)

rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [86]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Score the fitted forest on the held-out split.
y_pred = pd.Series(rf.predict(X_test))

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error (RMSE):", rmse)



Root Mean Squared Error (RMSE): 0.4911737486254741
