In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Train.csv")
df.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,GT_NO2
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,,0.230527,0.559117,2.4e-05,0.000117,,14440.82126,31.0
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,,-0.074006,0.869309,2.4e-05,0.000127,,14441.79815,42.0
2,ID_V3136Z,1/1/19,RO01,45.045825,12.060869,0.0,,0.02447,0.67416,2.4e-05,8.6e-05,,14437.38294,31.0
3,ID_KRVZDJ,1/1/19,RO02,45.104075,11.553241,1.200467,,-0.010442,0.920054,2.4e-05,0.000124,,14440.83831,30.0
4,ID_PR351A,1/1/19,RO03,45.038758,11.790152,1.274564,,-0.176178,0.747464,2.4e-05,0.000116,,14438.79037,58.0


In [3]:
print("\nMissing values in each column:")
print(df.isnull().sum())


Missing values in each column:
ID_Zindi                  0
Date                      0
ID                        0
LAT                       0
LON                       0
Precipitation             0
LST                   39786
AAI                   12875
CloudFraction         12875
NO2_strat             12875
NO2_total             12875
NO2_trop              35473
TropopausePressure    12875
GT_NO2                 4533
dtype: int64


In [4]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from scipy.interpolate import splrep, splev

features = ['LST', 'AAI', 'CloudFraction', 'NO2_strat', 'NO2_total', 'NO2_trop', 'TropopausePressure']


for feature in features:
    scaler = MinMaxScaler()
    df[f'{feature}_scaled'] = scaler.fit_transform(df[[feature]])

    mask_finite = np.isfinite(df[f'{feature}_scaled'].values)
    known_indices = df.index.values[mask_finite]
    known_values = df[f'{feature}_scaled'].values[mask_finite]
    missing_indices = df.index.values[~mask_finite]

    df[f'{feature}_bspline'] = df[f'{feature}_scaled']

    tck = splrep(known_indices, known_values)

    df.loc[~mask_finite, f'{feature}_bspline'] = splev(missing_indices, tck)

    df[feature] = scaler.inverse_transform(df[[f'{feature}_bspline']])

In [5]:
df.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,...,CloudFraction_scaled,CloudFraction_bspline,NO2_strat_scaled,NO2_strat_bspline,NO2_total_scaled,NO2_total_bspline,NO2_trop_scaled,NO2_trop_bspline,TropopausePressure_scaled,TropopausePressure_bspline
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,309.259072,0.230527,0.559117,2.4e-05,...,0.559117,0.559117,0.175084,0.175084,0.062593,0.062593,,-49.772037,0.367957,0.367957
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,300.225794,-0.074006,0.869309,2.4e-05,...,0.869309,0.869309,0.176768,0.176768,0.067448,0.067448,,-39.949398,0.368019,0.368019
2,ID_V3136Z,1/1/19,RO01,45.045825,12.060869,0.0,292.742096,0.02447,0.67416,2.4e-05,...,0.67416,0.67416,0.178451,0.178451,0.047636,0.047636,,-31.479893,0.36774,0.36774
3,ID_KRVZDJ,1/1/19,RO02,45.104075,11.553241,1.200467,286.687129,-0.010442,0.920054,2.4e-05,...,0.920053,0.920053,0.180135,0.180135,0.065992,0.065992,,-24.265082,0.367958,0.367958
4,ID_PR351A,1/1/19,RO03,45.038758,11.790152,1.274564,281.940047,-0.176178,0.747464,2.4e-05,...,0.747464,0.747464,0.180135,0.180135,0.062107,0.062107,,-18.206523,0.367829,0.367829


In [6]:
df.columns

Index(['ID_Zindi', 'Date', 'ID', 'LAT', 'LON', 'Precipitation', 'LST', 'AAI',
       'CloudFraction', 'NO2_strat', 'NO2_total', 'NO2_trop',
       'TropopausePressure', 'GT_NO2', 'LST_scaled', 'LST_bspline',
       'AAI_scaled', 'AAI_bspline', 'CloudFraction_scaled',
       'CloudFraction_bspline', 'NO2_strat_scaled', 'NO2_strat_bspline',
       'NO2_total_scaled', 'NO2_total_bspline', 'NO2_trop_scaled',
       'NO2_trop_bspline', 'TropopausePressure_scaled',
       'TropopausePressure_bspline'],
      dtype='object')

In [7]:
# moving avg/mean
window_size = 7
df['LST_ma_7'] = df['LST_bspline'].rolling(window=window_size).mean()
df['NO2_strat_ma_7'] = df['NO2_strat_bspline'].rolling(window=window_size).mean()
df['CloudFraction_ma_7'] = df['CloudFraction_bspline'].rolling(window=window_size).mean()
df['TropopausePressure_ma_7'] = df['TropopausePressure_bspline'].rolling(window=window_size).mean()

In [8]:
# moving SD
df['LST_std_7'] = df['LST_bspline'].rolling(window=window_size).std()
df['NO2_strat_std_7'] = df['NO2_strat_bspline'].rolling(window=window_size).std()
df['CloudFraction_std_7'] = df['CloudFraction_bspline'].rolling(window=window_size).std()
df['TropopausePressure_std_7'] = df['TropopausePressure_bspline'].rolling(window=window_size).std()

In [9]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
df['month'] = df['Date'].dt.month

monthly_avg_LST = df.groupby('month')['LST_bspline'].mean()
monthly_avg_precipitation = df.groupby('month')['Precipitation'].mean()
monthly_avg_NO2_strat = df.groupby('month')['NO2_strat_bspline'].mean()

# Map the monthly averages back to each row in the original DataFrame
df['monthly_avg_LST'] = df['month'].map(monthly_avg_LST)
df['monthly_avg_precipitation'] = df['month'].map(monthly_avg_precipitation)
df['monthly_avg_NO2_strat'] = df['month'].map(monthly_avg_NO2_strat)


  df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)


In [10]:
# LOESS
from statsmodels.tsa.seasonal import STL

stl = STL(df['LST_bspline'], period=365)
result = stl.fit()
df['LST_trend'] = result.trend
df['LST_seasonal'] = result.seasonal
df['LST_residual'] = result.resid


stl_NO2_strat = STL(df['NO2_strat_bspline'], period=182)
result_NO2_strat = stl_NO2_strat.fit()
df['NO2_strat_trend'] = result_NO2_strat.trend
df['NO2_strat_seasonal'] = result_NO2_strat.seasonal
df['NO2_strat_residual'] = result_NO2_strat.resid

stl_NO2_strat = STL(df['TropopausePressure_bspline'], period=182)
result_NO2_strat = stl_NO2_strat.fit()
df['NO2_strat_trend'] = result_NO2_strat.trend
df['NO2_strat_seasonal'] = result_NO2_strat.seasonal
df['NO2_strat_residual'] = result_NO2_strat.resid

stl_NO2_strat = STL(df['CloudFraction_bspline'], period=91)
result_NO2_strat = stl_NO2_strat.fit()
df['NO2_strat_trend'] = result_NO2_strat.trend
df['NO2_strat_seasonal'] = result_NO2_strat.seasonal
df['NO2_strat_residual'] = result_NO2_strat.resid

In [11]:
# Fourier Transformations
def fourier_series(df, period, n_harmonics, columns):
    for col in columns:
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            t = (df[col] - df[col].min()).dt.days.values
        else:
            t = df[col].values

        for k in range(1, n_harmonics + 1):
            df[f'{col}_sin_{k}'] = np.sin(2 * np.pi * k * t / period)
            df[f'{col}_cos_{k}'] = np.cos(2 * np.pi * k * t / period)

    return df

period = 365.25  # Annual seasonality
n_harmonics = 4

ft_data = fourier_series(df, period=period, n_harmonics=n_harmonics, columns=['LST_bspline'])

In [12]:
period = 182
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['NO2_strat_bspline'])

period = 182
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['TropopausePressure_bspline'])

period = 91
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['CloudFraction_bspline'])

In [13]:
ft_data.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,...,TropopausePressure_bspline_sin_2,TropopausePressure_bspline_cos_2,TropopausePressure_bspline_sin_3,TropopausePressure_bspline_cos_3,CloudFraction_bspline_sin_1,CloudFraction_bspline_cos_1,CloudFraction_bspline_sin_2,CloudFraction_bspline_cos_2,CloudFraction_bspline_sin_3,CloudFraction_bspline_cos_3
0,ID_ENTGC7,2019-01-01,PD01,45.601585,11.903551,0.0,309.259072,0.230527,0.559117,2.4e-05,...,0.025403,0.999677,0.0381,0.999274,0.038595,0.999255,0.077133,0.997021,0.115556,0.993301
1,ID_8JCCXC,2019-01-01,PD04,45.371005,11.84083,3.047342,300.225794,-0.074006,0.869309,2.4e-05,...,0.025407,0.999677,0.038106,0.999274,0.059986,0.998199,0.119756,0.992803,0.179095,0.983832
2,ID_V3136Z,2019-01-01,RO01,45.045825,12.060869,0.0,292.742096,0.02447,0.67416,2.4e-05,...,0.025388,0.999678,0.038077,0.999275,0.046531,0.998917,0.092962,0.99567,0.139191,0.990266
3,ID_KRVZDJ,2019-01-01,RO02,45.104075,11.553241,1.200467,286.687129,-0.010442,0.920054,2.4e-05,...,0.025403,0.999677,0.0381,0.999274,0.063483,0.997983,0.12671,0.99194,0.189426,0.981895
4,ID_PR351A,2019-01-01,RO03,45.038758,11.790152,1.274564,281.940047,-0.176178,0.747464,2.4e-05,...,0.025394,0.999678,0.038086,0.999274,0.051586,0.998669,0.103036,0.994678,0.15421,0.988038


In [14]:
ft_data.fillna(method='ffill', inplace=True)
ft_data.fillna(method='bfill', inplace=True)

  ft_data.fillna(method='ffill', inplace=True)
  ft_data.fillna(method='bfill', inplace=True)


In [15]:
ft_data = ft_data.drop(['ID', 'ID_Zindi'], axis = 1)

In [16]:
ft_data = ft_data.drop(['Date'], axis = 1)

In [17]:
split_index = int(0.8 * len(ft_data))
train_df = ft_data.iloc[:split_index].copy()
val_df = ft_data.iloc[split_index:].copy()

print(f"\nTraining data points: {train_df.shape[0]}")
print(f"Testing data points: {val_df.shape[0]}")


Training data points: 69267
Testing data points: 17317


In [165]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=150,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    gamma = 1.5, # min loss req to split node
    min_child_weight = 2, 
    # alpha = 1, 
)

In [166]:
xgb_model.fit(ft_data.drop(['GT_NO2'], axis = 1), ft_data['GT_NO2'])

In [167]:
ft_data.drop(['GT_NO2'], axis = 1).shape

(86584, 68)

In [168]:
pred = xgb_model.predict(val_df.drop(['GT_NO2'], axis = 1))

In [169]:
from sklearn.metrics import root_mean_squared_error
root_mean_squared_error(val_df['GT_NO2'], pred)

6.995758551359221

In [164]:
train_cols = list(ft_data.drop(['GT_NO2'], axis = 1).columns)

In [24]:
test_df = pd.read_csv('Test.csv')
features = ['LST', 'AAI', 'CloudFraction', 'NO2_strat', 'NO2_total', 'NO2_trop', 'TropopausePressure']

for feature in features:
    scaler = MinMaxScaler()
    test_df[f'{feature}_scaled'] = scaler.fit_transform(test_df[[feature]])

    # Mask to identify non-missing values
    mask_finite = np.isfinite(test_df[f'{feature}_scaled'].values)
    known_indices = test_df.index.values[mask_finite]
    known_values = test_df[f'{feature}_scaled'].values[mask_finite]
    missing_indices = test_df.index.values[~mask_finite]

    # Create a new column for the filled values
    test_df[f'{feature}_bspline'] = test_df[f'{feature}_scaled']

    # Fit the B-spline representation
    tck = splrep(known_indices, known_values)

    # Fill missing values using the B-spline
    test_df.loc[~mask_finite, f'{feature}_bspline'] = splev(missing_indices, tck)

    # Inverse transform to get the original scale
    test_df[feature] = scaler.inverse_transform(test_df[[f'{feature}_bspline']])

In [25]:
window_size = 7
for feature in ['LST', 'NO2_strat', 'CloudFraction', 'TropopausePressure']:
    test_df[f'{feature}_ma_{window_size}'] = test_df[f'{feature}_bspline'].rolling(window=window_size).mean()
    test_df[f'{feature}_std_{window_size}'] = test_df[f'{feature}_bspline'].rolling(window=window_size).std()

In [26]:
test_df['Date'] = pd.to_datetime(test_df['Date'], errors='coerce', dayfirst=True)
test_df['month'] = test_df['Date'].dt.month

  test_df['Date'] = pd.to_datetime(test_df['Date'], errors='coerce', dayfirst=True)


In [27]:
monthly_avg_LST = test_df.groupby('month')['LST_bspline'].mean()
monthly_avg_precipitation = test_df.groupby('month')['Precipitation'].mean()
monthly_avg_NO2_strat = test_df.groupby('month')['NO2_strat_bspline'].mean()

test_df['monthly_avg_LST'] = test_df['month'].map(monthly_avg_LST)
test_df['monthly_avg_precipitation'] = test_df['month'].map(monthly_avg_precipitation)
test_df['monthly_avg_NO2_strat'] = test_df['month'].map(monthly_avg_NO2_strat)

In [28]:
# LOESS
from statsmodels.tsa.seasonal import STL

stl = STL(test_df['LST_bspline'], period=365)
result = stl.fit()
test_df['LST_trend'] = result.trend
test_df['LST_seasonal'] = result.seasonal
test_df['LST_residual'] = result.resid


stl_NO2_strat = STL(test_df['NO2_strat_bspline'], period=182)
result_NO2_strat = stl_NO2_strat.fit()
test_df['NO2_strat_trend'] = result_NO2_strat.trend
test_df['NO2_strat_seasonal'] = result_NO2_strat.seasonal
test_df['NO2_strat_residual'] = result_NO2_strat.resid

stl_NO2_strat = STL(test_df['TropopausePressure_bspline'], period=182)
result_NO2_strat = stl_NO2_strat.fit()
test_df['NO2_strat_trend'] = result_NO2_strat.trend
test_df['NO2_strat_seasonal'] = result_NO2_strat.seasonal
test_df['NO2_strat_residual'] = result_NO2_strat.resid

stl_NO2_strat = STL(test_df['CloudFraction_bspline'], period=91)
result_NO2_strat = stl_NO2_strat.fit()
test_df['NO2_strat_trend'] = result_NO2_strat.trend
test_df['NO2_strat_seasonal'] = result_NO2_strat.seasonal
test_df['NO2_strat_residual'] = result_NO2_strat.resid

In [29]:
period = 365.25  # Annual seasonality
n_harmonics = 4
ft_data = fourier_series(test_df, period=period, n_harmonics=n_harmonics, columns=['LST_bspline'])

period = 182
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['NO2_strat_bspline'])

period = 182
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['TropopausePressure_bspline'])

period = 91
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['CloudFraction_bspline'])


In [30]:
ft_data

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,...,TropopausePressure_bspline_sin_2,TropopausePressure_bspline_cos_2,TropopausePressure_bspline_sin_3,TropopausePressure_bspline_cos_3,CloudFraction_bspline_sin_1,CloudFraction_bspline_cos_1,CloudFraction_bspline_sin_2,CloudFraction_bspline_cos_2,CloudFraction_bspline_sin_3,CloudFraction_bspline_cos_3
0,ID_2MYNQS,2019-01-01,PD03,45.289376,11.642394,3.277529,296.660949,-0.313361,0.771456,0.000024,...,0.025439,0.999676,0.038153,0.999272,0.053241,0.998582,0.106331,0.994331,0.159119,0.987259
1,ID_P4U5WU,2019-01-01,TV03,45.836941,12.510362,0.000000,288.252482,-0.229512,0.398208,0.000023,...,0.025413,0.999677,0.038114,0.999273,0.027491,0.999622,0.054962,0.998488,0.082391,0.996600
2,ID_U4KWPK,2019-01-01,X5561,45.582894,8.842165,0.000000,282.980000,-0.470822,0.153694,0.000023,...,0.025384,0.999678,0.038071,0.999275,0.010612,0.999944,0.021222,0.999775,0.031831,0.999493
3,ID_QGSNTZ,2019-01-01,X5953,45.131947,10.015742,1.928031,280.235007,0.132952,0.756917,0.000024,...,0.025452,0.999676,0.038173,0.999271,0.052238,0.998635,0.104334,0.994542,0.156145,0.987734
4,ID_GHSZ6K,2019-01-01,X6701,45.186329,9.146666,0.000000,279.409007,-0.198272,0.678858,0.000023,...,0.025443,0.999676,0.038159,0.999272,0.046855,0.998902,0.093608,0.995609,0.140154,0.990130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6571,ID_GUSXU9,2021-12-31,TV03,45.836941,12.510362,0.000000,282.580000,-0.013364,0.000000,0.000032,...,0.019416,0.999812,0.029121,0.999576,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
6572,ID_GMVEG1,2021-12-31,X5561,45.582894,8.842165,0.000000,285.120000,-0.412887,0.002098,0.000031,...,0.019397,0.999812,0.029093,0.999577,0.000145,1.000000,0.000290,1.000000,0.000435,1.000000
6573,ID_GD6HNP,2021-12-31,X5953,45.131947,10.015742,0.000000,285.818726,-0.876740,0.246678,0.000031,...,0.019406,0.999812,0.029106,0.999576,0.017031,0.999855,0.034058,0.999420,0.051074,0.998695
6574,ID_J7YW1Y,2021-12-31,X6701,45.186329,9.146666,0.000000,282.608089,-1.025128,0.476947,0.000031,...,0.019428,0.999811,0.029139,0.999575,0.032925,0.999458,0.065815,0.997832,0.098633,0.995124


In [31]:
test = ft_data.drop(['ID', 'ID_Zindi'], axis = 1)
test = test.drop(['Date'], axis = 1)

In [32]:
test

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,...,TropopausePressure_bspline_sin_2,TropopausePressure_bspline_cos_2,TropopausePressure_bspline_sin_3,TropopausePressure_bspline_cos_3,CloudFraction_bspline_sin_1,CloudFraction_bspline_cos_1,CloudFraction_bspline_sin_2,CloudFraction_bspline_cos_2,CloudFraction_bspline_sin_3,CloudFraction_bspline_cos_3
0,45.289376,11.642394,3.277529,296.660949,-0.313361,0.771456,0.000024,0.000075,-0.000142,14440.028190,...,0.025439,0.999676,0.038153,0.999272,0.053241,0.998582,0.106331,0.994331,0.159119,0.987259
1,45.836941,12.510362,0.000000,288.252482,-0.229512,0.398208,0.000023,0.000120,0.000051,14434.047900,...,0.025413,0.999677,0.038114,0.999273,0.027491,0.999622,0.054962,0.998488,0.082391,0.996600
2,45.582894,8.842165,0.000000,282.980000,-0.470822,0.153694,0.000023,0.000171,0.000148,14427.424780,...,0.025384,0.999678,0.038071,0.999275,0.010612,0.999944,0.021222,0.999775,0.031831,0.999493
3,45.131947,10.015742,1.928031,280.235007,0.132952,0.756917,0.000024,0.000266,0.000173,14443.090060,...,0.025452,0.999676,0.038173,0.999271,0.052238,0.998635,0.104334,0.994542,0.156145,0.987734
4,45.186329,9.146666,0.000000,279.409007,-0.198272,0.678858,0.000023,0.000149,0.000151,14440.858400,...,0.025443,0.999676,0.038159,0.999272,0.046855,0.998902,0.093608,0.995609,0.140154,0.990130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6571,45.836941,12.510362,0.000000,282.580000,-0.013364,0.000000,0.000032,0.000135,0.000103,13060.468600,...,0.019416,0.999812,0.029121,0.999576,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
6572,45.582894,8.842165,0.000000,285.120000,-0.412887,0.002098,0.000031,0.000201,0.000171,13056.117640,...,0.019397,0.999812,0.029093,0.999577,0.000145,1.000000,0.000290,1.000000,0.000435,1.000000
6573,45.131947,10.015742,0.000000,285.818726,-0.876740,0.246678,0.000031,0.000514,0.000523,13058.245939,...,0.019406,0.999812,0.029106,0.999576,0.017031,0.999855,0.034058,0.999420,0.051074,0.998695
6574,45.186329,9.146666,0.000000,282.608089,-1.025128,0.476947,0.000031,0.000751,0.001261,13063.222600,...,0.019428,0.999811,0.029139,0.999575,0.032925,0.999458,0.065815,0.997832,0.098633,0.995124


In [33]:
train_cols

['LAT',
 'LON',
 'Precipitation',
 'LST',
 'AAI',
 'CloudFraction',
 'NO2_strat',
 'NO2_total',
 'NO2_trop',
 'TropopausePressure',
 'LST_scaled',
 'LST_bspline',
 'AAI_scaled',
 'AAI_bspline',
 'CloudFraction_scaled',
 'CloudFraction_bspline',
 'NO2_strat_scaled',
 'NO2_strat_bspline',
 'NO2_total_scaled',
 'NO2_total_bspline',
 'NO2_trop_scaled',
 'NO2_trop_bspline',
 'TropopausePressure_scaled',
 'TropopausePressure_bspline',
 'LST_ma_7',
 'NO2_strat_ma_7',
 'CloudFraction_ma_7',
 'TropopausePressure_ma_7',
 'LST_std_7',
 'NO2_strat_std_7',
 'CloudFraction_std_7',
 'TropopausePressure_std_7',
 'month',
 'monthly_avg_LST',
 'monthly_avg_precipitation',
 'monthly_avg_NO2_strat',
 'LST_trend',
 'LST_seasonal',
 'LST_residual',
 'NO2_strat_trend',
 'NO2_strat_seasonal',
 'NO2_strat_residual',
 'LST_bspline_sin_1',
 'LST_bspline_cos_1',
 'LST_bspline_sin_2',
 'LST_bspline_cos_2',
 'LST_bspline_sin_3',
 'LST_bspline_cos_3',
 'LST_bspline_sin_4',
 'LST_bspline_cos_4',
 'NO2_strat_bspline_s

In [34]:
test_cols = list(test.columns)

In [35]:
len(test_cols)

68

In [36]:
test = test[train_cols]

In [37]:
pred = xgb_model.predict(test)

In [38]:
submission = pd.DataFrame({
    'ID_Zindi': ft_data['ID_Zindi'],
    'GT_NO2_Predicted': pred
})

In [39]:
submission.to_csv('predictions.csv', index=False)