In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# import data 
data = pd.read_csv("Train.csv")
data.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,GT_NO2
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,,0.230527,0.559117,2.4e-05,0.000117,,14440.82126,31.0
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,,-0.074006,0.869309,2.4e-05,0.000127,,14441.79815,42.0
2,ID_V3136Z,1/1/19,RO01,45.045825,12.060869,0.0,,0.02447,0.67416,2.4e-05,8.6e-05,,14437.38294,31.0
3,ID_KRVZDJ,1/1/19,RO02,45.104075,11.553241,1.200467,,-0.010442,0.920054,2.4e-05,0.000124,,14440.83831,30.0
4,ID_PR351A,1/1/19,RO03,45.038758,11.790152,1.274564,,-0.176178,0.747464,2.4e-05,0.000116,,14438.79037,58.0


In [4]:
data['Date'] = pd.to_datetime(data['Date'], errors='coerce', dayfirst=True)

  data['Date'] = pd.to_datetime(data['Date'], errors='coerce', dayfirst=True)


In [5]:
import numpy as np

data = data.replace([np.inf, -np.inf], np.nan)
data.fillna(method='ffill', inplace=True)
data.fillna(method='bfill', inplace=True)

  data.fillna(method='ffill', inplace=True)
  data.fillna(method='bfill', inplace=True)


In [6]:
# moving avg/mean
window_size = 7
data['LST_ma_7'] = data['LST'].rolling(window=window_size).mean()
data['NO2_strat_ma_7'] = data['NO2_strat'].rolling(window=window_size).mean()
data['CloudFraction_ma_7'] = data['CloudFraction'].rolling(window=window_size).mean()
data['TropopausePressure_ma_7'] = data['TropopausePressure'].rolling(window=window_size).mean()

In [7]:
# moving SD
data['LST_std_7'] = data['LST'].rolling(window=window_size).std()
data['NO2_strat_std_7'] = data['NO2_strat'].rolling(window=window_size).std()
data['CloudFraction_std_7'] = data['CloudFraction'].rolling(window=window_size).std()
data['TropopausePressure_std_7'] = data['TropopausePressure'].rolling(window=window_size).std()

In [8]:
data['month'] = data['Date'].dt.month

monthly_avg_LST = data.groupby('month')['LST'].mean()
monthly_avg_precipitation = data.groupby('month')['Precipitation'].mean()
monthly_avg_NO2_strat = data.groupby('month')['NO2_strat'].mean()

# Map the monthly averages back to each row in the original DataFrame
data['monthly_avg_LST'] = data['month'].map(monthly_avg_LST)
data['monthly_avg_precipitation'] = data['month'].map(monthly_avg_precipitation)
data['monthly_avg_NO2_strat'] = data['month'].map(monthly_avg_NO2_strat)


In [9]:
# LOESS
from statsmodels.tsa.seasonal import STL

stl = STL(data['LST'], period=365)
result = stl.fit()
data['LST_trend'] = result.trend
data['LST_seasonal'] = result.seasonal
data['LST_residual'] = result.resid


stl_NO2_strat = STL(data['NO2_strat'], period=182)
result_NO2_strat = stl_NO2_strat.fit()
data['NO2_strat_trend'] = result_NO2_strat.trend
data['NO2_strat_seasonal'] = result_NO2_strat.seasonal
data['NO2_strat_residual'] = result_NO2_strat.resid

stl_NO2_strat = STL(data['TropopausePressure'], period=182)
result_NO2_strat = stl_NO2_strat.fit()
data['NO2_strat_trend'] = result_NO2_strat.trend
data['NO2_strat_seasonal'] = result_NO2_strat.seasonal
data['NO2_strat_residual'] = result_NO2_strat.resid

stl_NO2_strat = STL(data['CloudFraction'], period=91)
result_NO2_strat = stl_NO2_strat.fit()
data['NO2_strat_trend'] = result_NO2_strat.trend
data['NO2_strat_seasonal'] = result_NO2_strat.seasonal
data['NO2_strat_residual'] = result_NO2_strat.resid

In [10]:
# Fourier Transformations
def fourier_series(df, period, n_harmonics, columns):
    for col in columns:
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            t = (df[col] - df[col].min()).dt.days.values
        else:
            t = df[col].values

        for k in range(1, n_harmonics + 1):
            df[f'{col}_sin_{k}'] = np.sin(2 * np.pi * k * t / period)
            df[f'{col}_cos_{k}'] = np.cos(2 * np.pi * k * t / period)

    return df

period = 365.25  # Annual seasonality
n_harmonics = 4

ft_data = fourier_series(data, period=period, n_harmonics=n_harmonics, columns=['LST'])

In [11]:
period = 182
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['NO2_strat'])

period = 182
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['TropopausePressure'])

period = 91
n_harmonics = 3
ft_data = fourier_series(ft_data, period=period, n_harmonics=n_harmonics, columns=['CloudFraction'])

In [12]:
ft_data.fillna(method='ffill', inplace=True)
ft_data.fillna(method='bfill', inplace=True)

  ft_data.fillna(method='ffill', inplace=True)
  ft_data.fillna(method='bfill', inplace=True)


In [13]:
ft_data.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,...,TropopausePressure_sin_2,TropopausePressure_cos_2,TropopausePressure_sin_3,TropopausePressure_cos_3,CloudFraction_sin_1,CloudFraction_cos_1,CloudFraction_sin_2,CloudFraction_cos_2,CloudFraction_sin_3,CloudFraction_cos_3
0,ID_ENTGC7,2019-01-01,PD01,45.601585,11.903551,0.0,278.38,0.230527,0.559117,2.4e-05,...,-0.930569,-0.366117,0.221302,0.975205,0.038595,0.999255,0.077133,0.997021,0.115556,0.993301
1,ID_8JCCXC,2019-01-01,PD04,45.371005,11.84083,3.047342,278.38,-0.074006,0.869309,2.4e-05,...,-0.953129,-0.302565,0.318669,0.947866,0.059986,0.998199,0.119756,0.992803,0.179095,0.983832
2,ID_V3136Z,2019-01-01,RO01,45.045825,12.060869,0.0,278.38,0.02447,0.67416,2.4e-05,...,-0.818366,-0.574698,-0.132563,0.991175,0.046531,0.998917,0.092962,0.99567,0.139191,0.990266
3,ID_KRVZDJ,2019-01-01,RO02,45.104075,11.553241,1.200467,278.38,-0.010442,0.920054,2.4e-05,...,-0.930999,-0.365021,0.223023,0.974813,0.063483,0.997983,0.12671,0.99194,0.189426,0.981895
4,ID_PR351A,2019-01-01,RO03,45.038758,11.790152,1.274564,278.38,-0.176178,0.747464,2.4e-05,...,-0.870264,-0.492585,0.012811,0.999918,0.051586,0.998669,0.103036,0.994678,0.15421,0.988038


In [16]:
ft_data['GT_NO2_lag2'] = ft_data['GT_NO2'].shift(-2)

In [17]:
ft_data.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,...,TropopausePressure_sin_3,TropopausePressure_cos_3,CloudFraction_sin_1,CloudFraction_cos_1,CloudFraction_sin_2,CloudFraction_cos_2,CloudFraction_sin_3,CloudFraction_cos_3,GT_NO2_lag1,GT_NO2_lag2
0,ID_ENTGC7,2019-01-01,PD01,45.601585,11.903551,0.0,278.38,0.230527,0.559117,2.4e-05,...,0.221302,0.975205,0.038595,0.999255,0.077133,0.997021,0.115556,0.993301,31.0,31.0
1,ID_8JCCXC,2019-01-01,PD04,45.371005,11.84083,3.047342,278.38,-0.074006,0.869309,2.4e-05,...,0.318669,0.947866,0.059986,0.998199,0.119756,0.992803,0.179095,0.983832,30.0,30.0
2,ID_V3136Z,2019-01-01,RO01,45.045825,12.060869,0.0,278.38,0.02447,0.67416,2.4e-05,...,-0.132563,0.991175,0.046531,0.998917,0.092962,0.99567,0.139191,0.990266,58.0,58.0
3,ID_KRVZDJ,2019-01-01,RO02,45.104075,11.553241,1.200467,278.38,-0.010442,0.920054,2.4e-05,...,0.223023,0.974813,0.063483,0.997983,0.12671,0.99194,0.189426,0.981895,26.0,26.0
4,ID_PR351A,2019-01-01,RO03,45.038758,11.790152,1.274564,278.38,-0.176178,0.747464,2.4e-05,...,0.012811,0.999918,0.051586,0.998669,0.103036,0.994678,0.15421,0.988038,38.0,38.0


In [18]:
ft_data['GT_NO2_lag1'] = ft_data['GT_NO2'].shift(-1)

In [19]:
ft_data.head(10)

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,...,TropopausePressure_sin_3,TropopausePressure_cos_3,CloudFraction_sin_1,CloudFraction_cos_1,CloudFraction_sin_2,CloudFraction_cos_2,CloudFraction_sin_3,CloudFraction_cos_3,GT_NO2_lag1,GT_NO2_lag2
0,ID_ENTGC7,2019-01-01,PD01,45.601585,11.903551,0.0,278.38,0.230527,0.559117,2.4e-05,...,0.221302,0.975205,0.038595,0.999255,0.077133,0.997021,0.115556,0.993301,42.0,31.0
1,ID_8JCCXC,2019-01-01,PD04,45.371005,11.84083,3.047342,278.38,-0.074006,0.869309,2.4e-05,...,0.318669,0.947866,0.059986,0.998199,0.119756,0.992803,0.179095,0.983832,31.0,30.0
2,ID_V3136Z,2019-01-01,RO01,45.045825,12.060869,0.0,278.38,0.02447,0.67416,2.4e-05,...,-0.132563,0.991175,0.046531,0.998917,0.092962,0.99567,0.139191,0.990266,30.0,58.0
3,ID_KRVZDJ,2019-01-01,RO02,45.104075,11.553241,1.200467,278.38,-0.010442,0.920054,2.4e-05,...,0.223023,0.974813,0.063483,0.997983,0.12671,0.99194,0.189426,0.981895,58.0,26.0
4,ID_PR351A,2019-01-01,RO03,45.038758,11.790152,1.274564,278.38,-0.176178,0.747464,2.4e-05,...,0.012811,0.999918,0.051586,0.998669,0.103036,0.994678,0.15421,0.988038,26.0,38.0
5,ID_4XN0K8,2019-01-01,TV01,45.889734,12.307124,0.0,278.38,-0.366831,0.324392,2.3e-05,...,-0.63239,0.77465,0.022396,0.999749,0.044781,0.998997,0.067143,0.997743,38.0,34.0
6,ID_O0RJKX,2019-01-01,TV02,45.671721,12.237807,0.0,278.38,0.188599,0.818422,2.4e-05,...,-0.202102,0.979365,0.056479,0.998404,0.112777,0.99362,0.168715,0.985665,34.0,41.0
7,ID_1APJEY,2019-01-01,VE01,45.629092,12.590682,0.0,278.38,0.507837,0.926018,2.4e-05,...,-0.368805,0.929507,0.063894,0.997957,0.127527,0.991835,0.19064,0.98166,41.0,35.0
8,ID_4B1H1U,2019-01-01,VE02,45.499618,12.261249,0.0,278.38,0.087363,0.835097,2.4e-05,...,-0.015785,0.999875,0.057628,0.998338,0.115065,0.993358,0.172119,0.985076,35.0,40.0
9,ID_3JD1GC,2019-01-01,VE03,45.428424,12.31293,0.650355,278.38,0.208678,0.812696,2.4e-05,...,-0.098904,0.995097,0.056084,0.998426,0.111991,0.993709,0.167546,0.985864,40.0,39.0


In [20]:
ft_data.to_csv('multi_output.csv')