In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Load dataset

df_train = pd.read_csv('data/Train.csv')
df_test = pd.read_csv('data/Test.csv')
df_sample_submission = pd.read_csv('data/SampleSubmission.csv')

In [3]:
# Creating subsets of locations
#location_groups = df_train.groupby('Place_ID')
#type("location_groups")
#for location, sub_df in location_groups:
    # Sort each subset DataFrame by the 'Date' column in ascending order
#   sub_df = sub_df.sort_values(by='Date', ascending=True)
#  print(f"{location}:{sub_df}")


# Convert 'Date' to datetime if not already done
df_train['Date'] = pd.to_datetime(df_train['Date'])

# Create empty lists to store train and test DataFrames
train_dfs = []
test_dfs = []

# Creating subsets of locations
location_groups = df_train.groupby('Place_ID')

# Filter out groups that have fewer than 13 unique dates
filtered_groups = {
    location: sub_df for location, sub_df in location_groups 
    if sub_df['Date'].nunique() >= 13
}

# Debug: Print number of filtered groups
print(f"Number of filtered groups (locations with >=13 unique dates): {len(filtered_groups)}")

# Iterate through each filtered group
for location, sub_df in filtered_groups.items():
    # Sort each subset DataFrame by the 'Date' column in ascending order
    sub_df = sub_df.sort_values(by='Date', ascending=True)

    # Debug: Check the number of rows in the sorted sub_df
    print(f"Location: {location}, Number of rows: {len(sub_df)}")
    
    # Select the last three rows for the test set
    test_df = sub_df.tail(3)
    # Select the remaining rows for the train set
    train_df = sub_df.iloc[:-3]

     # Create a lag-1 variable for 'target' only in the training set
    train_df['lag_1'] = train_df['target'].shift(1)

    # Drop rows with NaN values in 'lag_1' (due to the shift)
    train_df = train_df.dropna(subset=['lag_1'])

    # Debug: Verify the number of rows in train and test sets for this location
    print(f"Location: {location}, Train Rows: {len(train_df)}, Test Rows: {len(test_df)}")

    # Append the split dataframes to their respective lists
    train_dfs.append(train_df)
    test_dfs.append(test_df)

# Concatenate all the train and test DataFrames into separate DataFrames
df_train_split = pd.concat(train_dfs, ignore_index=True)
df_test_split = pd.concat(test_dfs, ignore_index=True)

# Display or use the resulting DataFrames
print("Training set:")
print(df_train_split.head())  # Display the first few rows for verification
print("\nTest set:")
print(df_test_split.head())   # Display the first few rows for verification

# Optional: Check the number of rows in the test set for each location to verify correctness
print("\nNumber of rows per Place_ID in the test set:")
print(df_test_split['Place_ID'].value_counts())




Number of filtered groups (locations with >=13 unique dates): 337
Location: 010Q650, Number of rows: 94
Location: 010Q650, Train Rows: 90, Test Rows: 3
Location: 05EC30X, Number of rows: 90
Location: 05EC30X, Train Rows: 86, Test Rows: 3
Location: 0DPWHX8, Number of rows: 94
Location: 0DPWHX8, Train Rows: 90, Test Rows: 3
Location: 0GBXTHY, Number of rows: 94
Location: 0GBXTHY, Train Rows: 90, Test Rows: 3
Location: 0HYPV1N, Number of rows: 94
Location: 0HYPV1N, Train Rows: 90, Test Rows: 3
Location: 0I2XREH, Number of rows: 65
Location: 0I2XREH, Train Rows: 61, Test Rows: 3
Location: 0KV6RJ1, Number of rows: 94
Location: 0KV6RJ1, Train Rows: 90, Test Rows: 3
Location: 0MGEY68, Number of rows: 94
Location: 0MGEY68, Train Rows: 90, Test Rows: 3
Location: 0O9R6AD, Number of rows: 92
Location: 0O9R6AD, Train Rows: 88, Test Rows: 3
Location: 0PP73FZ, Number of rows: 94
Location: 0PP73FZ, Train Rows: 90, Test Rows: 3
Location: 0RYZQUU, Number of rows: 91
Location: 0RYZQUU, Train Rows: 87, T

In [4]:
## MODEL 1 LIGHTGBM

from lightgbm import LGBMRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# Define the feature columns and target
feature_columns = [
    'Place_ID',
    'precipitable_water_entire_atmosphere',
    'relative_humidity_2m_above_ground',
    'specific_humidity_2m_above_ground', 
    'temperature_2m_above_ground',
    'u_component_of_wind_10m_above_ground',
    'v_component_of_wind_10m_above_ground',
    'L3_NO2_NO2_column_number_density',
    'L3_NO2_NO2_slant_column_number_density',
    'L3_NO2_absorbing_aerosol_index', 
    'L3_NO2_cloud_fraction',
    'L3_NO2_sensor_altitude', 
    'L3_NO2_sensor_azimuth_angle',
    'L3_NO2_sensor_zenith_angle', 
    'L3_NO2_solar_azimuth_angle',
    'L3_NO2_solar_zenith_angle',
    'L3_NO2_stratospheric_NO2_column_number_density',
    'L3_NO2_tropopause_pressure',
    'L3_NO2_tropospheric_NO2_column_number_density',
    'L3_O3_O3_column_number_density', 
    'L3_O3_O3_effective_temperature',
    'L3_O3_cloud_fraction', 
    'L3_O3_sensor_azimuth_angle',
    'L3_O3_sensor_zenith_angle', 
    'L3_O3_solar_azimuth_angle',
    'L3_O3_solar_zenith_angle',
    'L3_CO_CO_column_number_density',
    'L3_CO_H2O_column_number_density', 
    'L3_CO_cloud_height',
    'L3_CO_sensor_altitude', 
    'L3_CO_sensor_azimuth_angle',
    'L3_CO_sensor_zenith_angle', 
    'L3_CO_solar_azimuth_angle',
    'L3_CO_solar_zenith_angle', 
    'L3_HCHO_HCHO_slant_column_number_density',
    'L3_HCHO_cloud_fraction', 
    'L3_HCHO_sensor_azimuth_angle',
    'L3_HCHO_sensor_zenith_angle', 
    'L3_HCHO_solar_azimuth_angle',
    'L3_HCHO_solar_zenith_angle',
    'L3_HCHO_tropospheric_HCHO_column_number_density',
    'L3_HCHO_tropospheric_HCHO_column_number_density_amf',
    'L3_CLOUD_cloud_base_height', 
    'L3_CLOUD_cloud_base_pressure',
    'L3_CLOUD_cloud_fraction', 
    'L3_CLOUD_cloud_optical_depth',
    'L3_CLOUD_cloud_top_height', 
    'L3_CLOUD_cloud_top_pressure',
    'L3_CLOUD_sensor_azimuth_angle', 
    'L3_CLOUD_sensor_zenith_angle',
    'L3_CLOUD_solar_azimuth_angle', 
    'L3_CLOUD_solar_zenith_angle',
    'L3_CLOUD_surface_albedo', 
    'L3_AER_AI_absorbing_aerosol_index',
    'L3_AER_AI_sensor_altitude', 
    'L3_AER_AI_sensor_azimuth_angle',
    'L3_AER_AI_sensor_zenith_angle', 
    'L3_AER_AI_solar_azimuth_angle',
    'L3_AER_AI_solar_zenith_angle', 
    'L3_SO2_SO2_column_number_density',
    'L3_SO2_SO2_column_number_density_amf',
    'L3_SO2_SO2_slant_column_number_density',
    'L3_SO2_absorbing_aerosol_index', 
    'L3_SO2_cloud_fraction',
    'L3_SO2_sensor_azimuth_angle', 
    'L3_SO2_sensor_zenith_angle',
    'L3_SO2_solar_azimuth_angle', 
    'L3_SO2_solar_zenith_angle',
    'L3_CH4_CH4_column_volume_mixing_ratio_dry_air',
    'L3_CH4_aerosol_height', 
    'L3_CH4_aerosol_optical_depth',
    'L3_CH4_sensor_azimuth_angle', 
    'L3_CH4_sensor_zenith_angle',
    'L3_CH4_solar_azimuth_angle', 
    'L3_CH4_solar_zenith_angle'
]
# Prepare the training and testing data
X_train = df_train_split[feature_columns]
y_train = df_train_split['target']
X_test = df_test_split[feature_columns]
y_test = df_test_split['target']

# Encode the 'Place_ID' as it is a categorical feature
label_encoder = LabelEncoder()
X_train['Place_ID'] = label_encoder.fit_transform(X_train['Place_ID'])
X_test['Place_ID'] = label_encoder.transform(X_test['Place_ID'])

# Define the LightGBM model using the scikit-learn API
model = LGBMRegressor(
    objective='regression',
    learning_rate=0.05,
    n_estimators=1000,
    num_leaves=31,
    max_depth=-1,
    min_data_in_leaf=20,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42
)

# Train the model with early stopping
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)],
)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19113
[LightGBM] [Info] Number of data points in the train set: 29187, number of used features: 75
[LightGBM] [Info] Start training from score 61.263360
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 26.8532	valid_0's l2: 721.096
[200]	valid_0's rmse: 26.1322	valid_0's l2: 682.891
[300]	valid_0's rmse: 25.7081	valid_0's l2: 660.907
[400]	valid_0's rmse: 25.7006	valid_0's l2: 660.519
Early stopping, best iteration is:
[379]	valid_0's rmse: 25.4241	valid_0's l2: 646.384
Root Mean Squared Error (RMSE): 25.424079804978554


In [5]:
# MODEL 2 (incl. date variables)

from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Step 1: Sort data by Place_ID and Date
df_train_split = df_train_split.sort_values(by=['Place_ID', 'Date'])
df_test_split = df_test_split.sort_values(by=['Place_ID', 'Date'])

# Step 2: Create 'day_number' and 'is_weekend' features

# Convert 'Date' to datetime format if not already done
df_train_split['Date'] = pd.to_datetime(df_train_split['Date'])
df_test_split['Date'] = pd.to_datetime(df_test_split['Date'])

# Calculate 'day_number' (days since the first date per Place_ID)
df_train_split['day_number'] = df_train_split.groupby('Place_ID')['Date'].transform(lambda x: (x - x.min()).dt.days)
df_test_split['day_number'] = df_test_split.groupby('Place_ID')['Date'].transform(lambda x: (x - x.min()).dt.days)

# Create 'is_weekend' (1 if weekend, 0 if weekday)
df_train_split['is_weekend'] = df_train_split['Date'].dt.weekday.isin([5, 6]).astype(int)
df_test_split['is_weekend'] = df_test_split['Date'].dt.weekday.isin([5, 6]).astype(int)

# Step 3: Update the feature columns list to include the new features
feature_columns = [
    'Place_ID',
    'precipitable_water_entire_atmosphere',
    'relative_humidity_2m_above_ground',
    'specific_humidity_2m_above_ground', 
    'temperature_2m_above_ground',
    'u_component_of_wind_10m_above_ground',
    'v_component_of_wind_10m_above_ground',
    'L3_NO2_NO2_column_number_density',
    'L3_NO2_NO2_slant_column_number_density',
    'L3_NO2_absorbing_aerosol_index', 
    'L3_NO2_cloud_fraction',
    'L3_NO2_sensor_altitude', 
    'L3_NO2_sensor_azimuth_angle',
    'L3_NO2_sensor_zenith_angle', 
    'L3_NO2_solar_azimuth_angle',
    'L3_NO2_solar_zenith_angle',
    'L3_NO2_stratospheric_NO2_column_number_density',
    'L3_NO2_tropopause_pressure',
    'L3_NO2_tropospheric_NO2_column_number_density',
    'L3_O3_O3_column_number_density', 
    'L3_O3_O3_effective_temperature',
    'L3_O3_cloud_fraction', 
    'L3_O3_sensor_azimuth_angle',
    'L3_O3_sensor_zenith_angle', 
    'L3_O3_solar_azimuth_angle',
    'L3_O3_solar_zenith_angle',
    'L3_CO_CO_column_number_density',
    'L3_CO_H2O_column_number_density', 
    'L3_CO_cloud_height',
    'L3_CO_sensor_altitude', 
    'L3_CO_sensor_azimuth_angle',
    'L3_CO_sensor_zenith_angle', 
    'L3_CO_solar_azimuth_angle',
    'L3_CO_solar_zenith_angle', 
    'L3_HCHO_HCHO_slant_column_number_density',
    'L3_HCHO_cloud_fraction', 
    'L3_HCHO_sensor_azimuth_angle',
    'L3_HCHO_sensor_zenith_angle', 
    'L3_HCHO_solar_azimuth_angle',
    'L3_HCHO_solar_zenith_angle',
    'L3_HCHO_tropospheric_HCHO_column_number_density',
    'L3_HCHO_tropospheric_HCHO_column_number_density_amf',
    'L3_CLOUD_cloud_base_height', 
    'L3_CLOUD_cloud_base_pressure',
    'L3_CLOUD_cloud_fraction', 
    'L3_CLOUD_cloud_optical_depth',
    'L3_CLOUD_cloud_top_height', 
    'L3_CLOUD_cloud_top_pressure',
    'L3_CLOUD_sensor_azimuth_angle', 
    'L3_CLOUD_sensor_zenith_angle',
    'L3_CLOUD_solar_azimuth_angle', 
    'L3_CLOUD_solar_zenith_angle',
    'L3_CLOUD_surface_albedo', 
    'L3_AER_AI_absorbing_aerosol_index',
    'L3_AER_AI_sensor_altitude', 
    'L3_AER_AI_sensor_azimuth_angle',
    'L3_AER_AI_sensor_zenith_angle', 
    'L3_AER_AI_solar_azimuth_angle',
    'L3_AER_AI_solar_zenith_angle', 
    'L3_SO2_SO2_column_number_density',
    'L3_SO2_SO2_column_number_density_amf',
    'L3_SO2_SO2_slant_column_number_density',
    'L3_SO2_absorbing_aerosol_index', 
    'L3_SO2_cloud_fraction',
    'L3_SO2_sensor_azimuth_angle', 
    'L3_SO2_sensor_zenith_angle',
    'L3_SO2_solar_azimuth_angle', 
    'L3_SO2_solar_zenith_angle',
    'L3_CH4_CH4_column_volume_mixing_ratio_dry_air',
    'L3_CH4_aerosol_height', 
    'L3_CH4_aerosol_optical_depth',
    'L3_CH4_sensor_azimuth_angle', 
    'L3_CH4_sensor_zenith_angle',
    'L3_CH4_solar_azimuth_angle', 
    'L3_CH4_solar_zenith_angle',
    'day_number',  # Include day_number feature
    'is_weekend'   # Include is_weekend feature
]

# Prepare the training and testing data with the new features
X_train = df_train_split[feature_columns]
y_train = df_train_split['target']
X_test = df_test_split[feature_columns]
y_test = df_test_split['target']

# Encode the 'Place_ID' as it is a categorical feature
label_encoder = LabelEncoder()
X_train['Place_ID'] = label_encoder.fit_transform(X_train['Place_ID'])
X_test['Place_ID'] = label_encoder.transform(X_test['Place_ID'])

# Define the LightGBM model using the scikit-learn API
model = LGBMRegressor(
    objective='regression',
    learning_rate=0.05,
    n_estimators=1000,
    num_leaves=31,
    max_depth=-1,
    min_data_in_leaf=20,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42
)

# Train the model with early stopping
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)],
)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19205
[LightGBM] [Info] Number of data points in the train set: 29187, number of used features: 77
[LightGBM] [Info] Start training from score 61.263360
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 27.7057	valid_0's l2: 767.607
[200]	valid_0's rmse: 27.0971	valid_0's l2: 734.251
Early stopping, best iteration is:
[156]	valid_0's rmse: 27.0437	valid_0's l2: 731.361
Root Mean Squared Error (RMSE): 27.043692262683706


In [17]:
# Check if 'lag_1' exists in the training dataset and move it to the end
#if 'lag_1' in df_train_split.columns:
   # Get all columns except 'lag_1'
   # #  cols = [col for col in df_train_split.columns if col != 'lag_1']
    # Add 'lag_1' as the last column
  #  df_train_split = df_train_split[cols + ['lag_1']]
   # print("'lag_1' has been moved to the last position in the training dataset.")
#else:
 #   print("'lag_1' does not exist in the training dataset.")

In [None]:
# MODEL 3 (excl. some features)

# Step 1: Sort data by Place_ID and Date
df_train_split = df_train_split.sort_values(by=['Place_ID', 'Date'])
df_test_split = df_test_split.sort_values(by=['Place_ID', 'Date'])

# Step 2: Create 'day_number' and 'is_weekend' features

# Convert 'Date' to datetime format if not already done
df_train_split['Date'] = pd.to_datetime(df_train_split['Date'])
df_test_split['Date'] = pd.to_datetime(df_test_split['Date'])

# Calculate 'day_number' (days since the first date per Place_ID)
df_train_split['day_number'] = df_train_split.groupby('Place_ID')['Date'].transform(lambda x: (x - x.min()).dt.days)
df_test_split['day_number'] = df_test_split.groupby('Place_ID')['Date'].transform(lambda x: (x - x.min()).dt.days)

# Create 'is_weekend' (1 if weekend, 0 if weekday)
df_train_split['is_weekend'] = df_train_split['Date'].dt.weekday.isin([5, 6]).astype(int)
df_test_split['is_weekend'] = df_test_split['Date'].dt.weekday.isin([5, 6]).astype(int)

# Step 3: Update the feature columns list to include the new features
feature_columns = [
    'Place_ID',
    "L3_CO_CO_column_number_density",
    "L3_HCHO_tropospheric_HCHO_column_number_density",
    "L3_NO2_NO2_column_number_density",
    "L3_O3_O3_column_number_density",
    "u_component_of_wind_10m_above_ground",
    'day_number',  # Include day_number feature
    'is_weekend'   # Include is_weekend feature
]

# Prepare the training and testing data with the new features
X_train = df_train_split[feature_columns]
y_train = df_train_split['target']
X_test = df_test_split[feature_columns]
y_test = df_test_split['target']

# Encode the 'Place_ID' as it is a categorical feature
label_encoder = LabelEncoder()
X_train['Place_ID'] = label_encoder.fit_transform(X_train['Place_ID'])
X_test['Place_ID'] = label_encoder.transform(X_test['Place_ID'])

# Define the LightGBM model using the scikit-learn API
model = LGBMRegressor(
    objective='regression',
    learning_rate=0.05,
    n_estimators=1000,
    num_leaves=31,
    max_depth=-1,
    min_data_in_leaf=20,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42
)

# Train the model with early stopping
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)],
)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000618 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1610
[LightGBM] [Info] Number of data points in the train set: 29187, number of used features: 8
[LightGBM] [Info] Start training from score 61.263360
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[31]	valid_0's rmse: 30.963	valid_0's l2: 958.71
Root Mean Squared Error (RMSE): 30.963048795148396
