# ID5059 Group 4 Assignment

## README

## Requirements

In [1]:
# Libraries Required for Project (Added as Project Progresses)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## 1. Display the first few Observations  
Yhe first step will be to load the dataset  
The head() and sample() methods will be used to look at the features and observations of the dataset.

In [2]:
# Load the dataset from CSV
train_df = pd.read_csv('train.csv')

# Confirm it's loaded by showing the shape
print(f"Data loaded: {train_df.shape[0]:,} rows and {train_df.shape[1]} columns")

# Display the first few rows
train_df.head()

# Display a random five rows
train_df.sample(5)

Data loaded: 13,288,920 rows and 13 columns


Unnamed: 0,id,valid_time,latitude,longitude,tp,u10,v10,sp,u100,v100,tcc,ptype,t2m
2081271,2081272,2018-02-27 03:00:00,50.25,-0.75,0.000195,-9.309677,-2.629562,102508.81,-10.13977,-2.660828,0.751587,6.0,273.9485
10381559,10381560,2018-10-13 03:00:00,54.75,-0.25,0.00053,0.881973,9.949997,100548.875,2.086655,13.832047,1.0,1.0,289.0625
9509915,9509916,2018-09-19 04:00:00,50.75,-6.5,5.7e-05,6.062439,11.890976,100656.75,8.148529,14.919464,0.910095,1.0,289.89258
13233492,13233493,2018-12-30 11:00:00,54.75,-7.0,0.0,3.567276,1.86322,101492.5,6.252212,2.711883,1.0,1.0,283.67627
3082423,3082424,2018-03-26 15:00:00,50.5,-7.5,0.000754,2.575195,9.029907,101291.31,4.211899,11.599335,1.0,1.0,282.72168


## 2. Understanding the Structure and Summary Statistics  
The describe() function gives count, mean, std dev, min, max, and quartiles for each numerical column. This is useful for spotting odd ranges or skewed distributions.

In [3]:
# View dataset structure
train_df.info()

# Summary statistics for all numeric columns
train_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13288920 entries, 0 to 13288919
Data columns (total 13 columns):
 #   Column      Dtype  
---  ------      -----  
 0   id          int64  
 1   valid_time  object 
 2   latitude    float64
 3   longitude   float64
 4   tp          float64
 5   u10         float64
 6   v10         float64
 7   sp          float64
 8   u100        float64
 9   v100        float64
 10  tcc         float64
 11  ptype       float64
 12  t2m         float64
dtypes: float64(11), int64(1), object(1)
memory usage: 1.3+ GB


Unnamed: 0,id,latitude,longitude,tp,u10,v10,sp,u100,v100,tcc,ptype,t2m
count,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0,13288920.0
mean,6644460.0,54.5,-3.0,0.0001154333,1.05547,1.496095,100530.4,1.569438,2.100029,0.6968492,0.8030135,283.3007
std,3836181.0,2.66927,2.95804,0.0003217102,5.055472,4.822915,1645.81,6.597733,6.325126,0.3447855,1.06984,4.899615
min,1.0,50.0,-8.0,0.0,-19.5634,-19.07394,90685.81,-24.0957,-24.3681,0.0,0.0,258.8645
25%,3322231.0,52.25,-5.5,0.0,-2.05159,-1.637405,99632.0,-2.824387,-2.27269,0.414978,0.0,279.8369
50%,6644460.0,54.5,-3.0,3.814697e-06,1.157776,1.34082,100808.9,1.856186,1.808136,0.8706665,1.0,283.2065
75%,9966690.0,56.75,-0.5,6.437302e-05,4.234451,4.46315,101707.0,6.024933,6.249405,1.0,1.0,286.6846
max,13288920.0,59.0,2.0,0.01098633,24.99036,23.15897,104371.6,31.78088,29.40465,1.0,8.0,308.0288


## 3. Missing Values and Duplicates  
It is important to identify if there are any missing values or duplicates in the dataset as these could effect analysis

In [4]:
# Check for missing values
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0]

if missing_values.empty:
    print("No missing values in any column.")
else:
    print("Missing values found:")
    print(missing_values)

# Check for duplicates
duplicate_rows = train_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

No missing values in any column.
Number of duplicate rows: 0


## 4. Dataset Feature Descriptions  

**id:** Observation Identifier (Not Needed)  
**valid_time:** Timestamp of the observation (could convert to datetime)  
**latitude:** Geospatial location  
**longitude:** Geospatial location  
**tp:** Total precipitation (metres)  
**u10:** Wind u components at 10m height positive = eastward, negative = westward  
**v10:** Wind v components at 10m height positive = northward, negative = southward  
**u100:** Same as u10 but at 100m height  
**v100:** Same as v10 but at 100m height  
**sp:** Surface pressure (Pa)  
**tcc:** Total cloud cover (0 = clear, 1 = full cloud)  
**ptype:** Precipitation type (categorical)  
**t2m:** Temperature at 2 metres (in Kelvin — convert to Celsius?)

## 5. Simplify and Encode the feature ptype  

Precipitation Type Simplification and One-Hot Encoding  
The ptype column contains multiple weather conditions, but some are extremely rare. We grouped them into a few meaningful categories to reduce noise:  
**none:** No precipitation  
**rain:** Regular rainfall  
**snow_mixed:** Snow, wet snow, or rain/snow mix  
**freezing_or_ice:** Freezing rain, ice pellets, or freezing drizzle  
These are then one-hot encoded, meaning we create separate columns like ptype_rain, which will be 1 when that condition is true, and 0 otherwise.  

In [5]:
# Define grouping function
def simplify_ptype(ptype):
    if ptype == 0:
        return 'none'
    elif ptype == 1:
        return 'rain'
    elif ptype in [5, 6, 7]:
        return 'snow_mixed'
    elif ptype in [3, 8, 12]:
        return 'freezing_or_ice'
    else:
        return 'other'

# Apply grouping
train_df['ptype_grouped'] = train_df['ptype'].apply(simplify_ptype)

# One-hot encode
ptype_dummies = pd.get_dummies(train_df['ptype_grouped'], prefix='ptype')
train_df = pd.concat([train_df, ptype_dummies], axis=1)

## 6. Convert Temperature from Kelvin to Celsius  
**Temperature Conversion**  
The t2m variable represents temperature in Kelvin, which is not ideal for interpretation or modelling. It will be converted to Celsius, which is more intuitive for most people and machine learning models.

In [6]:
train_df['t2m_C'] = train_df['t2m'] - 273.15

## 7. Drop the ID Feature  
Removing the id Column:  
The id column is a unique identifier for each row and does not carry any predictive information so it will be removed.

In [7]:
# Drop the 'id' column
train_df.drop(columns=['id'], inplace=True)

# Confirm removal
print("id column removed.")

id column removed.


## 8. Create Time-Based Features  
Creating Time-Based Features from valid_time  
The valid_time column contains timestamp information. This will be converted to datetime format and extract useful features like:  
**hour:** Hour of the day (0–23)  
**month:** Month of the year (1–12)  
**dayofweek:** Day of the week (0 = Monday, 6 = Sunday)  

These features may help capture daily and seasonal weather patterns.

In [8]:
# Convert 'valid_time' to datetime format
train_df['valid_time'] = pd.to_datetime(train_df['valid_time'])

# Extract time-based features
train_df['hour'] = train_df['valid_time'].dt.hour
train_df['month'] = train_df['valid_time'].dt.month
train_df['dayofweek'] = train_df['valid_time'].dt.dayofweek

# Display some examples
train_df[['valid_time', 'hour', 'month', 'dayofweek']].sample(10)

Unnamed: 0,valid_time,hour,month,dayofweek
4861270,2018-05-14 12:00:00,12,5,0
8966119,2018-09-04 06:00:00,6,9,1
10138609,2018-10-06 11:00:00,11,10,5
7539808,2018-07-27 02:00:00,2,7,4
3956945,2018-04-19 16:00:00,16,4,3
5069523,2018-05-20 05:00:00,5,5,6
1167617,2018-02-02 01:00:00,1,2,4
12191291,2018-12-01 20:00:00,20,12,5
3808702,2018-04-15 14:00:00,14,4,6
9703487,2018-09-24 12:00:00,12,9,0


## 9. Create Wind Speed Features  

Calculating Wind Speed from Wind Components. The wind is represented by two components:  

**u** = east–west (positive = eastward, negative = westward)  
**v** = north–south (positive = northward)  

To simplify these into one feature, we calculate wind speed using the formula:  
wind speed = sqrt(u2 + v2)  

This can be done for both 10m and 100m wind levels

In [9]:
# Calculate wind speed at 10m and 100m
train_df['wind10_speed'] = np.sqrt(train_df['u10']**2 + train_df['v10']**2)
train_df['wind100_speed'] = np.sqrt(train_df['u100']**2 + train_df['v100']**2)

# Display a few rows to check
train_df[['u10', 'v10', 'wind10_speed', 'u100', 'v100', 'wind100_speed']].head()

Unnamed: 0,u10,v10,wind10_speed,u100,v100,wind100_speed
0,1.405868,-1.367767,1.961442,1.516388,-1.468002,2.11056
1,2.151962,-1.555267,2.655146,2.239044,-1.634994,2.772458
2,2.691025,-1.585541,3.123388,2.783966,-1.686752,3.255088
3,2.850204,-1.714447,3.326108,2.971466,-1.854721,3.502799
4,3.299423,-1.867767,3.791405,3.455841,-2.001205,3.993452


## 10. Wind Direction  

Calculating Wind Direction  

In addition to wind speed, we can calculate wind direction from the u and v components of wind.  

This will show where the wind is coming from, measured in degrees (0° = north, 90° = east, etc.)  
Wind direction may influence temperature, depending on regional wind patterns (e.g., warm air from the south, cold air from the north)  

What will be calculated:  

**wind10_dir:** Wind direction at 10 metres  
**wind100_dir:** Wind direction at 100 metres  

It will also convert the direction into sine and cosine components to handle the circular nature of angles (so the model understands that 0° and 360° are essentially the same).

In [10]:
# Wind direction (in degrees), where 0 = north, 90 = east, etc.
train_df['wind10_dir'] = (np.arctan2(train_df['u10'], train_df['v10']) * (180 / np.pi)) % 360
train_df['wind100_dir'] = (np.arctan2(train_df['u100'], train_df['v100']) * (180 / np.pi)) % 360

# Convert to radians for sine/cosine encoding
train_df['wind10_dir_rad'] = np.radians(train_df['wind10_dir'])

# Sine and cosine components
train_df['wind10_dir_sin'] = np.sin(train_df['wind10_dir_rad'])
train_df['wind10_dir_cos'] = np.cos(train_df['wind10_dir_rad'])

# Repeat for 100m wind in case it's useful
train_df['wind100_dir_rad'] = np.radians(train_df['wind100_dir'])
train_df['wind100_dir_sin'] = np.sin(train_df['wind100_dir_rad'])
train_df['wind100_dir_cos'] = np.cos(train_df['wind100_dir_rad'])

# Check the dataset
train_df[['u10', 'v10', 'wind10_dir', 'wind10_dir_sin', 'wind10_dir_cos', 'wind100_dir']].sample(10)

Unnamed: 0,u10,v10,wind10_dir,wind10_dir_sin,wind10_dir_cos,wind100_dir
8583264,6.905609,-2.557724,110.323801,0.937745,-0.347325,110.461673
9628475,10.55835,-6.285034,120.763929,0.859282,-0.511502,121.452741
12754446,-0.259857,4.295807,356.538346,-0.060381,0.998175,4.286512
4819361,2.290787,-3.053665,143.123645,0.60009,-0.799932,142.424815
12446051,6.835205,-2.303009,108.620391,0.947655,-0.319297,110.007082
2047631,-11.27713,2.944397,284.632942,-0.967564,0.252626,285.267369
4695143,1.627121,11.498749,8.054119,0.140108,0.990136,9.893988
6086883,2.085648,1.019424,63.951506,0.898423,0.439132,72.828712
7492562,-0.178925,-1.637344,186.236382,-0.108631,-0.994082,170.243818
7397289,6.118118,4.060867,56.426001,0.833172,0.553014,61.660794


## 11. Scaling Numerical Features for Modelling  

It is planned to use linear regression and neural networks, so, it is important to scale our numerical features. These models are sensitive to differences in magnitude between features.  

For example:  

**sp:** (surface pressure) is around 100,000 Pa  
**tp:** (precipitation) is usually < 0.01
**tcc:**  (cloud cover) already ranges between 0 and 1  

If these features arenot scaled, models may unintentionally give more weight to features with larger values, even if they are not more important.  

Standard Scaling can be used, which transforms features to have:  
A mean of 0  
A standard deviation of 1  

The following will not scaled:

**tcc:** Already normalised between 0–1
One-hot encoded variables (e.g., precipitation type, day of week)
Wind direction sine/cosine: Already between -1 and 1  

These will be scaled:  
Surface pressure  
Wind speed  
Precipitation  
Time-based features (e.g., hour, month), optional, not sure but may be helpful

In [11]:
# List of features to scale
features_to_scale = [
    # Total precipitation
    'tp',
    # Surface pressure
    'sp',
    # Wind speed at 10m
    'wind10_speed',
    # Wind speed at 100m
    'wind100_speed',
    # Hour of day
    'hour',
    # Month of year
    'month',
    # Day of week
    'dayofweek',     
]

# Create a copy of the dataframe to preserve original features
train_scaled = train_df.copy()

# Initialise the scaler
scaler = StandardScaler()

# Apply scaling to selected features
train_scaled[features_to_scale] = scaler.fit_transform(train_scaled[features_to_scale])

# Display scaled values to confirm
train_scaled[features_to_scale].head()

Unnamed: 0,tp,sp,wind10_speed,wind100_speed,hour,month,dayofweek
0,-0.346954,-1.455549,-1.203479,-1.440209,-1.661325,-1.602745,-1.493348
1,-0.306935,-1.484714,-1.009811,-1.289457,-1.661325,-1.602745,-1.493348
2,-0.302488,-1.489575,-0.879087,-1.179535,-1.661325,-1.602745,-1.493348
3,-0.299524,-1.49079,-0.822491,-1.123118,-1.661325,-1.602745,-1.493348
4,-0.311381,-1.485929,-0.69259,-1.011368,-1.661325,-1.602745,-1.493348


## 12. Cyclical Time Feature Encoding  

Encoding Time Features as Cyclical Values  

Time-based features like hour, month, and dayofweek are cyclical, they repeat in a loop:  

Hour 23 is followed by hour 0 (midnight)  
December (12) is followed by January (1)  
Sunday (6) is followed by Monday (0)  

Treating these as plain numbers would mislead the model into thinking that the highest values are "further away" from the lowest, when in fact they are neighbours.  

To handle this, we transform each feature using sine and cosine. This places the values on a circle, helping the model understand the cyclical nature of time.  

In [12]:
# Encode 'hour' as cyclical
train_scaled['hour_sin'] = np.sin(2 * np.pi * train_scaled['hour'] / 24)
train_scaled['hour_cos'] = np.cos(2 * np.pi * train_scaled['hour'] / 24)

# Encode 'month' as cyclical
train_scaled['month_sin'] = np.sin(2 * np.pi * train_scaled['month'] / 12)
train_scaled['month_cos'] = np.cos(2 * np.pi * train_scaled['month'] / 12)

# Encode 'dayofweek' as cyclical (0=Monday, 6=Sunday)
train_scaled['dow_sin'] = np.sin(2 * np.pi * train_scaled['dayofweek'] / 7)
train_scaled['dow_cos'] = np.cos(2 * np.pi * train_scaled['dayofweek'] / 7)

# Check a few rows
train_scaled[['hour', 'hour_sin', 'hour_cos', 'month', 'month_sin', 'month_cos']].sample(10)

Unnamed: 0,hour,hour_sin,hour_cos,month,month_sin,month_cos
5123705,0.794547,0.206515,0.978443,-0.442602,-0.229677,0.973267
7125142,0.650084,0.169371,0.985552,0.137469,0.071916,0.997411
4851617,-0.794547,-0.206515,0.978443,-0.442602,-0.229677,0.973267
6113068,1.372399,0.351613,0.936146,-0.152567,-0.079799,0.996811
13197211,-0.072232,-0.018909,0.999821,1.587648,0.738802,0.673923
6526373,-0.794547,-0.206515,0.978443,-0.152567,-0.079799,0.996811
4325949,1.083473,0.279864,0.96004,-0.732638,-0.374269,0.92732
5680389,-1.661325,-0.42135,0.906898,-0.152567,-0.079799,0.996811
10732808,1.083473,0.279864,0.96004,1.007576,0.503431,0.864035
7849215,0.361158,0.09441,0.995533,0.427505,0.221976,0.975052


## 13. Adding a Lag Feature: Temperature 1 Hour Ago (Nearby times)  

Creating a Lag Feature: Temperature 1 Hour Ago  

In weather modelling, what has just happened can often help predict what happens next. For example:  

If the temperature was falling over the last hour, it may continue to drop  
If it was rising, it may keep rising  

A lag feature helps the model remember what the conditions were at a previous time step. For example:  

t2m_lag1: The temperature (in Celsius) 1 hour ago at the same latitude and longitude  

This is done by grouping the data by location (latitude, longitude) and shifting the temperature values by 1 time step.  

This new column could give a regression or neural network model a short-term memory effect.  
More lag features could be created if needed at a later time (e.g., 2-hour lag, 6-hour lag).  
**Warning:** The first row for each group will now have a NaN in the lag column, this is expected (since there’s no “previous” row to pull from).  

In [13]:
# Sort data to ensure it's in the correct time order per location
train_scaled.sort_values(by=['latitude', 'longitude', 'valid_time'], inplace=True)

# Create a 1-hour lag of temperature at each location
train_scaled['t2m_lag1'] = train_scaled.groupby(['latitude', 'longitude'])['t2m_C'].shift(1)

# Check a few rows to confirm
train_scaled[['valid_time', 'latitude', 'longitude', 't2m_C', 't2m_lag1']].sample(10)

Unnamed: 0,valid_time,latitude,longitude,t2m_C,t2m_lag1
4079223,2018-04-23 01:00:00,59.0,-5.5,8.37405,8.29873
4115502,2018-04-24 00:00:00,50.5,-7.0,10.9247,11.01687
2003662,2018-02-25 00:00:00,51.75,0.25,1.235,1.55752
4010518,2018-04-21 03:00:00,52.5,-2.75,6.3815,6.444
11487872,2018-11-12 12:00:00,52.0,-8.0,10.21023,10.24648
310064,2018-01-09 12:00:00,55.5,-2.5,2.63906,2.31948
4885767,2018-05-15 04:00:00,52.75,-7.5,9.38247,9.84292
11116819,2018-11-02 08:00:00,57.75,1.5,9.28384,9.1393
707152,2018-01-20 10:00:00,57.75,-1.75,3.08926,2.9938
10800371,2018-10-24 15:00:00,54.0,-1.0,13.3212,14.13906


## 14. Adding a Rolling Feature: Average Temperature Over 3 Hours  

Creating a Rolling Feature: 3-Hour Temperature Average  

A rolling feature calculates a summary statistic (such as a mean) over a moving time window. This helps the model recognise trends rather than focusing on just a single moment.  

Could create a column/feature named:  

t2m_roll3: The average temperature (in Celsius) over the past 3 hours at the same location  

This feature helps smooth out short-term fluctuations and provides a better sense of recent conditions.

In [14]:
# Create rolling 3-hour average of temperature, grouped by location
train_scaled['t2m_roll3'] = (
    train_scaled
    .groupby(['latitude', 'longitude'])['t2m_C']
    .rolling(window=3)
    .mean()
    .reset_index(level=[0,1], drop=True)
)

# Check a few rows
train_scaled[['valid_time', 'latitude', 'longitude', 't2m_C', 't2m_roll3']].sample(10)

Unnamed: 0,valid_time,latitude,longitude,t2m_C,t2m_roll3
11870132,2018-11-23 00:00:00,52.25,-3.75,2.5187,2.209783
2510842,2018-03-10 23:00:00,57.75,-7.5,4.49917,4.742007
12292567,2018-12-04 15:00:00,57.25,-0.75,6.91934,7.005113
1944183,2018-02-23 09:00:00,53.5,-7.0,3.41885,3.12165
9137277,2018-09-08 23:00:00,56.75,-3.75,8.00503,8.648663
967038,2018-01-27 13:00:00,54.75,-5.0,10.33303,10.234113
4207952,2018-04-26 13:00:00,51.25,2.0,11.08645,10.852523
1098037,2018-01-31 03:00:00,51.5,-4.0,8.6354,8.637433
9810959,2018-09-27 11:00:00,56.0,-1.0,14.01382,14.240793
5345437,2018-05-27 19:00:00,52.75,-2.75,18.0287,18.766417


# Note: Further development: If this is considered useful by the group we could use rolling or lag features for other variables such as wind.

## 15.  Removing Rows with Missing Values  

While the original dataset had no missing values, the lag and rolling features added, introduce NaNs at the start of each location group.  

These rows can't be used for modelling (since the model wouldn't see those previous values), so these should now be removed.

In [15]:
# Remove any rows with NaNs caused by lag/rolling features
train_model = train_scaled.dropna().copy()

print(f"Rows after dropping NaNs from lag/rolling: {train_model.shape}")

Rows after dropping NaNs from lag/rolling: (13285886, 39)


## 16. Feature Selection  

Selecting Input Features and Target Variable  

Define:

**y:** the target variable, temperature in Celsius (t2m_C)  
**x:** the input features we want the model to use to make predictions  

This includes weather features, time encodings, wind features, and one-hot precipitation types. 

In [16]:
# Define the target
y = train_model['t2m_C']

# Define input features for the model
feature_cols = [
    'tp', 'sp', 'wind10_speed', 'wind100_speed',
    'hour_sin', 'hour_cos',
    'month_sin', 'month_cos',
    'dow_sin', 'dow_cos',
    't2m_lag1', 't2m_roll3',
    'wind10_dir_sin', 'wind10_dir_cos',
    'ptype_none', 'ptype_rain', 'ptype_snow_mixed', 'ptype_freezing_or_ice',
]

# Create feature matrix
X = train_model[feature_cols]

print(f"Features and target selected. Feature matrix shape: {X.shape}")

Features and target selected. Feature matrix shape: (13285886, 18)


## What the dataset looks like now:

In [18]:
# Display 10 random rows from the processed dataset
train_model.sample(5, random_state=42)

Unnamed: 0,valid_time,latitude,longitude,tp,u10,v10,sp,u100,v100,tcc,...,wind100_dir_sin,wind100_dir_cos,hour_sin,hour_cos,month_sin,month_cos,dow_sin,dow_cos,t2m_lag1,t2m_roll3
11334069,2018-11-08 07:00:00,55.75,-0.75,-0.293595,1.39563,3.309387,0.124676,1.821365,3.6194,0.435364,...,0.449515,0.893273,-0.169371,0.985552,0.628348,0.777932,0.003682,0.999993,9.47305,9.524643
7344682,2018-07-21 17:00:00,53.75,-2.0,-0.358812,4.035996,0.091156,-1.126644,5.432907,0.052673,0.616669,...,0.999953,0.009695,0.206515,0.978443,0.071916,0.997411,0.783175,0.621802,18.30703,18.085433
1971525,2018-02-24 03:00:00,53.5,2.0,-0.358812,-9.48172,3.911667,1.147844,-10.737564,4.519897,0.128967,...,-0.921672,0.387971,-0.315964,0.948771,-0.634478,0.772941,0.783175,0.621802,2.66958,2.68325
3053469,2018-03-25 20:00:00,51.5,0.75,-0.358812,-0.764206,1.093109,0.357619,-1.225525,0.294479,0.959076,...,-0.972324,0.233638,0.315964,0.948771,-0.510246,0.860029,0.975237,0.221161,8.2106,7.977443
13274798,2018-12-31 14:00:00,52.75,-2.25,-0.358812,4.042435,0.04068,0.871233,6.029892,-0.146256,0.862,...,0.999706,-0.024248,0.09441,0.995533,0.738802,0.673923,-0.973582,0.228338,9.29604,9.216457
