# ID5059 Group 4 Assignment

In [1]:
# Libraries Required for Project (Added as Project Progresses)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

## 1. Display the first few Observations  
The first step will be to load the dataset  
The head() and sample() methods will be used to look at the features and observations of the dataset.

In [2]:
# Load the dataset from CSV
train_df = pd.read_csv('train.csv')

# Confirm it's loaded by showing the shape
print(f"Data loaded: {train_df.shape[0]:,} rows and {train_df.shape[1]} columns")

# Display the first few rows
train_df.head()

# Display a random five rows
train_df.sample(5)

Data loaded: 13,288,920 rows and 13 columns


Unnamed: 0,id,valid_time,latitude,longitude,tp,u10,v10,sp,u100,v100,tcc,ptype,t2m
4614694,4614695,2018-05-07 17:00:00,50.0,-2.75,0.0,0.422897,-1.128342,101477.125,-0.582809,-0.239777,0.0,0.0,286.04297
11661644,11661645,2018-11-17 07:00:00,56.25,-4.5,7e-06,-4.839416,-0.319321,99037.19,-8.124847,-0.290573,0.996246,1.0,278.70483
10065540,10065541,2018-10-04 11:00:00,57.75,2.0,0.0,8.723419,6.990631,101337.81,12.268143,9.180862,0.220734,1.0,285.3452
9867429,9867430,2018-09-29 00:00:00,53.75,-8.0,0.0,1.102234,0.052383,102433.75,1.109055,-0.377686,0.532806,0.0,278.80933
2713978,2713979,2018-03-16 13:00:00,58.75,-2.0,5.6e-05,-14.05188,9.378448,101463.06,-16.914536,11.411743,1.0,1.0,277.0193


## 2. Missing Values and Duplicates  
Identify if there are any missing values or duplicates in the dataset as these could effect analysis

In [3]:
# Check for missing values
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0]

if missing_values.empty:
    print("No missing values in any column.")
else:
    print("Missing values found:")
    print(missing_values)

# Check for duplicates
duplicate_rows = train_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

No missing values in any column.
Number of duplicate rows: 0


## 3. Dataset Feature Descriptions  

id: Observation Identifier (Not Needed)  
valid_time: Timestamp of the observation (could convert to datetime)  
latitude: Geospatial location  
longitude: Geospatial location  
tp: Total precipitation (metres)  
u10: Wind u components at 10m height positive = eastward, negative = westward  
v10: Wind v components at 10m height positive = northward, negative = southward  
u100: Same as u10 but at 100m height  
v100: Same as v10 but at 100m height  
sp: Surface pressure (Pa)  
tcc: Total cloud cover (0 = clear, 1 = full cloud)  
ptype: Precipitation type (categorical)  
t2m: Temperature at 2 metres (in Kelvin — convert to Celsius?)

## 4. Simplify and Encode the feature ptype  

Precipitation Type Simplification and One-Hot Encoding  
The ptype column contains multiple weather conditions, but some are extremely rare. We grouped them into a few meaningful categories to reduce noise:  
0: No precipitation  
1: Regular rainfall  
2 - 11: Other 

In [4]:
# Define grouping function
def simplify_ptype(ptype):
    if ptype == 0:
        return 'none'
    elif ptype == 1:
        return 'rain'
    else:
        return 'other'

# Apply grouping
train_df['ptype_grouped'] = train_df['ptype'].apply(simplify_ptype)

# One-hot encode
ptype_dummies = pd.get_dummies(train_df['ptype_grouped'], prefix='ptype')
train_df = pd.concat([train_df, ptype_dummies], axis=1)

## 5. Convert Temperature from Kelvin to Celsius  
Temperature Conversion  
The t2m variable represents temperature in Kelvin, which is not ideal for interpretation or modelling. It will be converted to Celsius, which is more intuitive for most people and machine learning models.  

### I cannot see any client for this assignment and the predictions.csv file shows Kelvin for the temperature so we can keep it as Kelvin. Either way is fine.

In [5]:
train_df['t2m_C'] = train_df['t2m'] - 273.15

## 6. Drop the ID Feature  
Removing the id Column:  
The id column is a unique identifier for each row and does not carry any predictive information so it will be removed.

In [18]:
# Drop the 'id' column
train_df.drop(columns=['id'], inplace=True)

# Confirm removal
print("id column removed.")

id column removed.


## 7. Create Time-Based Features  
Creating Time-Based Features from valid_time  
The valid_time column contains timestamp information. This will be converted to datetime format and extract useful features like:  
hour: Hour of the day (0–23)  
month: Month of the year (1–12)  
dayofweek: Day of the week (0 = Monday, 6 = Sunday)  

These features may help capture daily and seasonal weather patterns.

### We could remove the dayofweek here, it isn't needed

In [6]:
# Convert 'valid_time' to datetime format
train_df['valid_time'] = pd.to_datetime(train_df['valid_time'])

# Extract time-based features
train_df['hour'] = train_df['valid_time'].dt.hour
train_df['month'] = train_df['valid_time'].dt.month
train_df['dayofweek'] = train_df['valid_time'].dt.dayofweek

# Display some examples
train_df[['valid_time', 'hour', 'month', 'dayofweek']].sample(10)

Unnamed: 0,valid_time,hour,month,dayofweek
2427222,2018-03-08 16:00:00,16,3,3
5850147,2018-06-10 16:00:00,16,6,6
3864974,2018-04-17 03:00:00,3,4,1
10781454,2018-10-24 03:00:00,3,10,2
464761,2018-01-13 18:00:00,18,1,5
10970289,2018-10-29 07:00:00,7,10,0
6772963,2018-07-06 00:00:00,0,7,4
7636443,2018-07-29 17:00:00,17,7,6
420322,2018-01-12 13:00:00,13,1,4
3445094,2018-04-05 14:00:00,14,4,3


## 8. Create Wind Speed Features  

Calculating Wind Speed from Wind Components. The wind is represented by two components:  

u = east–west (positive = eastward, negative = westward)  
v = north–south (positive = northward)  

To simplify these into one feature, we calculate wind speed using the formula:  
wind speed = sqrt(u2 + v2)  

### This can be done for both 10m and 100m wind levels or we could just do it for one of them and drop the other.

In [7]:
# Calculate wind speed at 10m and 100m
train_df['wind10_speed'] = np.sqrt(train_df['u10']**2 + train_df['v10']**2)
train_df['wind100_speed'] = np.sqrt(train_df['u100']**2 + train_df['v100']**2)

# Display a few rows to check
train_df[['u10', 'v10', 'wind10_speed', 'u100', 'v100', 'wind100_speed']].head()

Unnamed: 0,u10,v10,wind10_speed,u100,v100,wind100_speed
0,1.405868,-1.367767,1.961442,1.516388,-1.468002,2.11056
1,2.151962,-1.555267,2.655146,2.239044,-1.634994,2.772458
2,2.691025,-1.585541,3.123388,2.783966,-1.686752,3.255088
3,2.850204,-1.714447,3.326108,2.971466,-1.854721,3.502799
4,3.299423,-1.867767,3.791405,3.455841,-2.001205,3.993452


## 9. Wind Direction  

Calculating Wind Direction  

In addition to wind speed, we can calculate wind direction from the u and v components of wind.  

This will show where the wind is coming from, measured in degrees (0° = north, 90° = east, etc.)  
Wind direction may influence temperature, depending on regional wind patterns (e.g., warm air from the south, cold air from the north)  

What will be calculated:  

wind10_dir: Wind direction at 10 metres  
wind100_dir: Wind direction at 100 metres  

It will also convert the direction into sine and cosine components to handle the circular nature of angles (so the model understands that 0° and 360° are essentially the same).

### Again we can drop one of these remove the u100 and v100 features to try and save processing speed

In [8]:
# Wind direction (in degrees), where 0 = north, 90 = east, etc.
train_df['wind10_dir'] = (np.arctan2(train_df['u10'], train_df['v10']) * (180 / np.pi)) % 360
train_df['wind100_dir'] = (np.arctan2(train_df['u100'], train_df['v100']) * (180 / np.pi)) % 360

# Convert to radians for sine/cosine encoding
train_df['wind10_dir_rad'] = np.radians(train_df['wind10_dir'])

# Sine and cosine components
train_df['wind10_dir_sin'] = np.sin(train_df['wind10_dir_rad'])
train_df['wind10_dir_cos'] = np.cos(train_df['wind10_dir_rad'])

# Repeat for 100m wind in case it's useful
train_df['wind100_dir_rad'] = np.radians(train_df['wind100_dir'])
train_df['wind100_dir_sin'] = np.sin(train_df['wind100_dir_rad'])
train_df['wind100_dir_cos'] = np.cos(train_df['wind100_dir_rad'])

# Check the dataset
train_df[['u10', 'v10', 'wind10_dir', 'wind10_dir_sin', 'wind10_dir_cos', 'wind100_dir']].sample(10)

Unnamed: 0,u10,v10,wind10_dir,wind10_dir_sin,wind10_dir_cos,wind100_dir
650966,4.760696,2.263199,64.573941,0.90314,0.429346,66.026277
11172334,0.876099,9.998276,5.00775,0.08729,0.996183,5.211576
2897083,4.754364,-0.196503,92.366746,0.999147,-0.041296,93.497193
2441940,1.420212,1.56012,42.312297,0.673171,0.739487,62.648292
2433388,-2.655533,6.413925,337.509165,-0.382536,0.923941,338.113076
9029975,3.615951,-3.778641,136.260367,0.691382,-0.722489,138.324172
8767116,8.618744,-1.022278,96.764306,0.993039,-0.117785,97.117417
5874411,3.928131,-3.955841,135.201378,0.704617,-0.709588,135.674297
11396795,0.264923,5.76181,2.632557,0.045931,0.998945,4.498919
4746931,-5.028229,6.398361,321.837541,-0.617893,0.786262,326.753923


## 10. Scaling Numerical Features for Modelling  

It is planned to use linear regression and neural networks, so, it is important to scale our numerical features. These models are sensitive to differences in magnitude between features.  

For example:  

sp: (surface pressure) is around 100,000 Pa  
tp: (precipitation) is usually < 0.01
tcc:  (cloud cover) already ranges between 0 and 1  

If these features arenot scaled, models may unintentionally give more weight to features with larger values, even if they are not more important.  

Standard Scaling can be used, which transforms features to have:  
A mean of 0  
A standard deviation of 1  

The following will not scaled:

tcc: Already normalised between 0–1
One-hot encoded variables (e.g., precipitation type, day of week)
Wind direction sine/cosine: Already between -1 and 1  

These will be scaled:  
Surface pressure  
Wind speed  
Precipitation  
Time-based features (e.g., hour, month), optional, not sure but may be helpful

### Not sure if we need some of these features but I have normalised them anyway. We can drop the wind100_speed and dayofweek features from here as well

In [9]:
# List of features to scale
features_to_scale = [
    # Total precipitation
    'tp',
    # Surface pressure
    'sp',
    # Wind speed at 10m
    'wind10_speed',
    # Wind speed at 100m
    'wind100_speed',
    # Hour of day
    'hour',
    # Month of year
    'month',
    # Day of week
    'dayofweek',     
]

# Create a copy of the dataframe to preserve original features
train_scaled = train_df.copy()

# Initialise the scaler
scaler = StandardScaler()

# Apply scaling to selected features
train_scaled[features_to_scale] = scaler.fit_transform(train_scaled[features_to_scale])

# Display scaled values to confirm
train_scaled[features_to_scale].head()

Unnamed: 0,tp,sp,wind10_speed,wind100_speed,hour,month,dayofweek
0,-0.346954,-1.455549,-1.203479,-1.440209,-1.661325,-1.602745,-1.493348
1,-0.306935,-1.484714,-1.009811,-1.289457,-1.661325,-1.602745,-1.493348
2,-0.302488,-1.489575,-0.879087,-1.179535,-1.661325,-1.602745,-1.493348
3,-0.299524,-1.49079,-0.822491,-1.123118,-1.661325,-1.602745,-1.493348
4,-0.311381,-1.485929,-0.69259,-1.011368,-1.661325,-1.602745,-1.493348


## 11. Cyclical Time Feature Encoding  

Encoding Time Features as Cyclical Values  

Time-based features like hour, month, and dayofweek are cyclical, they repeat in a loop:  

Hour 23 is followed by hour 0 (midnight)  
December (12) is followed by January (1)  
Sunday (6) is followed by Monday (0)  

Treating these as plain numbers would mislead the model into thinking that the highest values are "further away" from the lowest, when in fact they are neighbours.  

To handle this, we transform each feature using sine and cosine. This places the values on a circle, helping the model understand the cyclical nature of time.

### I have kept these explanations in for our write up if we need it. Again no need for day of the week

In [10]:
# Encode 'hour' as cyclical
train_scaled['hour_sin'] = np.sin(2 * np.pi * train_scaled['hour'] / 24)
train_scaled['hour_cos'] = np.cos(2 * np.pi * train_scaled['hour'] / 24)

# Encode 'month' as cyclical
train_scaled['month_sin'] = np.sin(2 * np.pi * train_scaled['month'] / 12)
train_scaled['month_cos'] = np.cos(2 * np.pi * train_scaled['month'] / 12)

# Encode 'dayofweek' as cyclical (0=Monday, 6=Sunday)
train_scaled['dow_sin'] = np.sin(2 * np.pi * train_scaled['dayofweek'] / 7)
train_scaled['dow_cos'] = np.cos(2 * np.pi * train_scaled['dayofweek'] / 7)

# Check a few rows
train_scaled[['hour', 'hour_sin', 'hour_cos', 'month', 'month_sin', 'month_cos']].sample(10)

Unnamed: 0,hour,hour_sin,hour_cos,month,month_sin,month_cos
1211869,-0.794547,-0.206515,0.978443,-1.31271,-0.634478,0.772941
10480680,1.227936,0.315964,0.948771,1.007576,0.503431,0.864035
2384993,0.072232,0.018909,0.999821,-1.022674,-0.510246,0.860029
10817435,-1.372399,-0.351613,0.936146,1.007576,0.503431,0.864035
4353465,0.216695,0.0567,0.998391,-0.732638,-0.374269,0.92732
11393528,1.516862,0.386758,0.922181,1.297612,0.628348,0.777932
2352092,0.361158,0.09441,0.995533,-1.022674,-0.510246,0.860029
10445198,1.372399,0.351613,0.936146,1.007576,0.503431,0.864035
4455680,-0.361158,-0.09441,0.995533,-0.442602,-0.229677,0.973267
10097574,-0.505621,-0.131985,0.991252,1.007576,0.503431,0.864035


## I have deleted the rolling three hour feature and the lag feature of 1 hour as these seem unnecessary now

## 12. Feature Selection  

Selecting Input Features and Target Variable  

Define:

y: the target variable, temperature in Celsius (t2m_C)  
X: the input features we want the model to use to make predictions  

This includes weather features, time encodings, wind features, and one-hot precipitation types. 

In [15]:
# Define the target
y = train_scaled['t2m_C']

# Define input features for the model
feature_cols = [
    'tp', 'sp', 'wind10_speed', 'wind100_speed',
    'hour_sin', 'hour_cos',
    'month_sin', 'month_cos',
    'wind10_dir_sin', 'wind10_dir_cos',
    'ptype_none', 'ptype_rain', 'ptype_other',
]

# Create feature matrix
X = train_scaled[feature_cols]

print(f"Features and target selected. Feature matrix shape: {X.shape}")

Features and target selected. Feature matrix shape: (13288920, 13)


## Dataset with new features:

In [12]:
# Display 10 random rows from the processed dataset
train_scaled.sample(5, random_state=42)

Unnamed: 0,id,valid_time,latitude,longitude,tp,u10,v10,sp,u100,v100,...,wind10_dir_cos,wind100_dir_rad,wind100_dir_sin,wind100_dir_cos,hour_sin,hour_cos,month_sin,month_cos,dow_sin,dow_cos
4813198,4813199,2018-05-13 04:00:00,51.25,-7.25,-0.358812,5.886337,-0.648453,0.562722,6.440597,-0.707596,...,-0.1095,1.680222,0.994019,-0.109208,-0.279864,0.96004,-0.229677,0.973267,0.975237,0.221161
11648609,11648610,2018-11-16 22:00:00,52.5,-3.75,-0.34399,-4.333618,1.897888,-0.8026,-7.07869,3.492432,...,0.401161,5.170721,-0.896792,0.442453,0.386758,0.922181,0.628348,0.777932,0.436512,0.899698
1226538,1226539,2018-02-03 16:00:00,54.25,-2.25,0.171814,-1.242538,-1.477005,-2.438537,-1.713867,-2.03067,...,-0.765232,3.842588,-0.644978,-0.764201,0.169371,0.985552,-0.634478,0.772941,0.783175,0.621802
9860476,9860477,2018-09-28 19:00:00,50.0,-3.75,-0.358812,-7.468582,-4.135788,1.452785,-9.300003,-4.97821,...,-0.484441,4.220909,-0.881635,-0.471932,0.279864,0.96004,0.366927,0.93025,0.436512,0.899698
2016645,2016646,2018-02-25 09:00:00,55.75,-3.25,-0.358812,-2.91777,2.137604,-0.592904,-4.92923,4.259506,...,0.590987,5.425031,-0.756637,0.653835,-0.09441,0.995533,-0.634478,0.772941,0.975237,0.221161


## 13. Removal of Unwanted Features  

We can use this code to remove any features we don't want  

In the code the features dow_sin and dow_cos (days of week) have been removed, we can just add further features to drop here  

In [13]:
# Define for removal
features_to_drop = ['dow_sin', 'dow_cos']

# Drop the features
cleaned_df = train_scaled.drop(columns=features_to_drop)

# Display first 5 rows of the cleaned dataset
print("Preview of cleaned dataset:")
display(cleaned_df.head())

# Save the head (first 5 rows) to a CSV file
head_df = cleaned_df.head()
head_df.to_csv("cleaned_dataset_head.csv", index=False)

print("Saved 'cleaned_dataset_head.csv'")

Preview of cleaned dataset:


Unnamed: 0,id,valid_time,latitude,longitude,tp,u10,v10,sp,u100,v100,...,wind10_dir_rad,wind10_dir_sin,wind10_dir_cos,wind100_dir_rad,wind100_dir_sin,wind100_dir_cos,hour_sin,hour_cos,month_sin,month_cos
0,1,2018-01-01,59.0,-8.0,-0.346954,1.405868,-1.367767,-1.455549,1.516388,-1.468002,...,2.342458,0.716753,-0.697328,2.339983,0.718477,-0.695551,-0.42135,0.906898,-0.744106,0.668062
1,2,2018-01-01,59.0,-7.75,-0.306935,2.151962,-1.555267,-1.484714,2.239044,-1.634994,...,2.196609,0.810487,-0.585756,2.201517,0.807602,-0.589727,-0.42135,0.906898,-0.744106,0.668062
2,3,2018-01-01,59.0,-7.5,-0.302488,2.691025,-1.585541,-1.489575,2.783966,-1.686752,...,2.103234,0.861572,-0.507635,2.115529,0.855266,-0.518189,-0.42135,0.906898,-0.744106,0.668062
3,4,2018-01-01,59.0,-7.25,-0.299524,2.850204,-1.714447,-1.49079,2.971466,-1.854721,...,2.112331,0.856919,-0.515451,2.128804,0.848312,-0.529497,-0.42135,0.906898,-0.744106,0.668062
4,5,2018-01-01,59.0,-7.0,-0.311381,3.299423,-1.867767,-1.485929,3.455841,-2.001205,...,2.085908,0.870238,-0.492632,2.095691,0.865377,-0.501122,-0.42135,0.906898,-0.744106,0.668062


Saved 'cleaned_dataset_head.csv'


## 13: Feed Forward Neural Net (Multi-Layer Perception)    

In [16]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Set seeds for reproducibility
np.random.seed(99)
tf.random.set_seed(99)

# Take a 10% sample of the cleaned dataset
sampled_df = cleaned_df.sample(frac=0.10, random_state=99)

# Split into 80% for training and 20% for validation/testing
train_subset, validation_subset = train_test_split(
    sampled_df, test_size=0.2, random_state=99
)

# Use only the selected input features
X_train = train_subset[feature_cols]
y_train = train_subset['t2m_C']

X_val = validation_subset[feature_cols]
y_val_actual = validation_subset['t2m_C']

# Ensure only numeric features are used
X_train = X_train.select_dtypes(include=[np.number])
X_val = X_val.select_dtypes(include=[np.number])

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build neural network
model = keras.Sequential([
    keras.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error']
)

# Add early stopping
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Train model
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Predict on validation subset
y_val_pred = model.predict(X_val_scaled).flatten()

# Evaluate predictions
rmse = np.sqrt(mean_squared_error(y_val_actual, y_val_pred))
mae = mean_absolute_error(y_val_actual, y_val_pred)

print("\nNeural Network Evaluation on Validation Set:")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

Epoch 1/100
[1m13289/13289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 4ms/step - loss: 8.6124 - mean_absolute_error: 2.0775 - val_loss: 4.6204 - val_mean_absolute_error: 1.6426
Epoch 2/100
[1m13289/13289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - loss: 4.5385 - mean_absolute_error: 1.6262 - val_loss: 4.4245 - val_mean_absolute_error: 1.6023
Epoch 3/100
[1m13289/13289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 4ms/step - loss: 4.3876 - mean_absolute_error: 1.5940 - val_loss: 4.3156 - val_mean_absolute_error: 1.5794
Epoch 4/100
[1m13289/13289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 4ms/step - loss: 4.3054 - mean_absolute_error: 1.5763 - val_loss: 4.2572 - val_mean_absolute_error: 1.5679
Epoch 5/100
[1m13289/13289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - loss: 4.2487 - mean_absolute_error: 1.5642 - val_loss: 4.2112 - val_mean_absolute_error: 1.5579
Epoch 6/100
[1m13289/13289[0m [32m━━━━━━━━━━━━━

In [17]:
# Compare actual vs predicted temperature values
comparison_df = pd.DataFrame({
    'Actual': y_val_actual.values,
    'Predicted': y_val_pred
})

# View the first 20 rows
print("Sample of predicted vs actual temperatures:")
display(comparison_df.head(20))

Sample of predicted vs actual temperatures:


Unnamed: 0,Actual,Predicted
0,13.1984,11.506401
1,14.4933,10.689627
2,14.51382,15.212631
3,14.61733,12.689357
4,4.9108,5.518839
5,17.66787,14.597844
6,14.66714,15.730021
7,5.6653,5.653523
8,5.60952,8.076424
9,5.88125,5.80549


## 14: Random Forest Model  

In [24]:
# Set random seed for reproducibility
np.random.seed(99)

# Take a 10% sample of the dataset
sampled_df = cleaned_df.sample(frac=0.10, random_state=99)

# Split 10% sample into 80% train and 20% validation
train_subset, val_subset = train_test_split(sampled_df, test_size=0.2, random_state=99)

# Separate input features and target
X_train = train_subset[feature_cols]
y_train = train_subset['t2m_C']

X_val = val_subset[feature_cols]
y_val = val_subset['t2m_C']

# Not sure if this is needed
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Create and train the Random Forest model
rf_model = RandomForestRegressor(
    # Number of trees
    n_estimators=25,
    # Reproducibility
    random_state=99, 
    n_jobs=-1 
)
rf_model.fit(X_train_scaled, y_train)

# Predict on validation set
y_pred = rf_model.predict(X_val_scaled)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)

print("\nRandom Forest Evaluation on Validation Set:")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")


Random Forest Evaluation on Validation Set:
Root Mean Squared Error (RMSE): 1.63
Mean Absolute Error (MAE): 1.12


## 15: SDG Regressor Model  

In [26]:
# Set random seed
np.random.seed(99)

# Take a 10% sample of the cleaned dataset
sampled_df = cleaned_df.sample(frac=0.10, random_state=99)

# Split into training and validation subsets
train_subset, val_subset = train_test_split(sampled_df, test_size=0.2, random_state=99)

# Extract features and target
X_train = train_subset[feature_cols]
y_train = train_subset['t2m_C']

X_val = val_subset[feature_cols]
y_val = val_subset['t2m_C']

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialise and train the model
sgd_model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=99)
sgd_model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_pred = sgd_model.predict(X_val_scaled)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)

print("\nSGDRegressor Evaluation on Validation Set:")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")


 SGDRegressor Evaluation on Validation Set:
Root Mean Squared Error (RMSE): 3.15
Mean Absolute Error (MAE): 2.46
