# Layer.ai Air Quality Prediction Challenge
# By Mohamed Eltayeb & Azer Ksouri

# Import libraries

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', None)

## Define Functions

In [55]:
#Plot the Features Importances
def plotImp(model, X , num = 30, fig_size = (60, 30)):
    feature_imp = pd.DataFrame({'Value':model.feature_importances_,'Feature':X.columns})
    plt.figure(figsize=fig_size)
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", 
                                                        ascending=False)[0:num])
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()

In [56]:
#Label Encoder
def label_enc(train_df, test_df, features):
    lbl_enc = LabelEncoder()
    full_data = pd.concat([train_df[features], test_df[features]],axis=0)
    for col in (features):
        print(col)
        lbl_enc.fit(full_data[col].values)
        train_df[col] = lbl_enc.transform(train_df[col])
        test_df[col] = lbl_enc.transform(test_df[col])
    return train_df, test_df

In [57]:
#Group Time Series Split
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

class GroupTimeSeriesSplit(_BaseKFold):
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            yield [int(i) for i in train_array], [int(i) for i in test_array]

# Read the training and testing data


In [80]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Save The IDs

In [81]:
test_df = test_df.sort_values(['date','device']).reset_index(drop=True)
ID = test_df['ID']
test_df.drop('ID',inplace=True,axis=1)
train_df.drop('ID',inplace=True,axis=1)

# Add The Time Features

In [82]:
train_df = train_df.sort_values(['device','date']).reset_index(drop=True) 
test_df = test_df.sort_values(['device','date']).reset_index(drop=True)

for dataset in (train_df,test_df):
    dataset['date'] = pd.to_datetime(dataset['date'])
    dataset['Day'] = dataset.date.dt.day
    dataset['Month'] = dataset.date.dt.month
    dataset['Year'] = dataset.date.dt.year
    dataset['DayOfWeek'] = dataset.date.dt.dayofweek
    dataset['DayOfYear'] = dataset.date.dt.dayofyear
    dataset['Week'] = dataset.date.dt.weekofyear
    dataset.set_index('date', inplace=True)

# Exploratory data analysis

In [83]:
train_df.shape

(9923, 75)

In [84]:
test_df.shape

(4254, 74)

In [85]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9923 entries, 2019-09-27 to 2020-08-19
Data columns (total 75 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   device                                                    9923 non-null   object 
 1   site_latitude                                             9923 non-null   float64
 2   site_longitude                                            9923 non-null   float64
 3   humidity                                                  9923 non-null   float64
 4   temp_mean                                                 9903 non-null   float64
 5   SulphurDioxide_SO2_column_number_density                  4291 non-null   float64
 6   SulphurDioxide_SO2_column_number_density_amf              4291 non-null   float64
 7   SulphurDioxide_SO2_slant_column_number_density            4291 non-null   float64
 8   

In [86]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4254 entries, 2020-08-20 to 2020-10-19
Data columns (total 74 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   device                                                    4254 non-null   object 
 1   site_latitude                                             4254 non-null   float64
 2   site_longitude                                            4254 non-null   float64
 3   humidity                                                  4254 non-null   float64
 4   temp_mean                                                 4254 non-null   float64
 5   SulphurDioxide_SO2_column_number_density                  914 non-null    float64
 6   SulphurDioxide_SO2_column_number_density_amf              914 non-null    float64
 7   SulphurDioxide_SO2_slant_column_number_density            914 non-null    float64
 8   

In [87]:
train_df.head()

Unnamed: 0_level_0,device,site_latitude,site_longitude,humidity,temp_mean,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_CO_column_number_density,CarbonMonoxide_H2O_column_number_density,CarbonMonoxide_cloud_height,CarbonMonoxide_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,CarbonMonoxide_sensor_zenith_angle,CarbonMonoxide_solar_azimuth_angle,CarbonMonoxide_solar_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_tropopause_pressure,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_azimuth_angle,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_solar_azimuth_angle,NitrogenDioxide_solar_zenith_angle,Formaldehyde_tropospheric_HCHO_column_number_density,Formaldehyde_tropospheric_HCHO_column_number_density_amf,Formaldehyde_HCHO_slant_column_number_density,Formaldehyde_cloud_fraction,Formaldehyde_solar_zenith_angle,Formaldehyde_solar_azimuth_angle,Formaldehyde_sensor_zenith_angle,Formaldehyde_sensor_azimuth_angle,UvAerosolIndex_absorbing_aerosol_index,UvAerosolIndex_sensor_altitude,UvAerosolIndex_sensor_azimuth_angle,UvAerosolIndex_sensor_zenith_angle,UvAerosolIndex_solar_azimuth_angle,UvAerosolIndex_solar_zenith_angle,Ozone_O3_column_number_density,Ozone_O3_column_number_density_amf,Ozone_O3_slant_column_number_density,Ozone_O3_effective_temperature,Ozone_cloud_fraction,Ozone_sensor_azimuth_angle,Ozone_sensor_zenith_angle,Ozone_solar_azimuth_angle,Ozone_solar_zenith_angle,Cloud_cloud_fraction,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,pm2_5,Day,Month,Year,DayOfWeek,DayOfYear,Week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1
2019-09-27,A0WN66FH,0.286,32.578,0.877,20.76,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.262,829764.375,-93.763,58.262,-93.279,34.373,0.115,2.997,0.349,222.859,1.0,-93.754,58.36,-93.277,34.401,1.0,36288.625,8233.294,41519.246,7233.293,56.773,0.249,-93.773,58.146,-93.282,34.339,72.836,27,9,2019,4,270,39
2019-09-28,A0WN66FH,0.286,32.578,0.854,22.637,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.702,829788.938,-99.787,36.39,-94.576,29.739,0.115,2.376,0.276,227.165,0.451,-99.787,36.39,-94.576,29.739,0.451,57830.965,4675.087,65457.598,3675.087,7.49,0.278,-99.787,36.391,-94.576,29.739,39.781,28,9,2019,5,271,39
2019-09-29,A0WN66FH,0.286,32.578,0.835,22.703,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.649,829668.675,-103.629,0.745,-96.198,25.182,0.115,2.105,0.245,220.777,0.734,-103.629,0.745,-96.198,25.182,0.892,17882.896,12995.496,20972.16,11995.496,7.945,0.224,-103.768,0.684,-96.269,25.174,57.602,29,9,2019,6,272,39
2019-09-30,A0WN66FH,0.286,32.578,0.797,22.211,-0.0,0.955,-0.0,0.0,74.431,35.451,-98.696,20.667,-0.0,0.037,2289.174,1.0,829460.062,72.342,35.631,-98.641,20.661,0.0,0.0,0.0,0.0,9580.239,-1.71,0.033,829478.25,74.431,35.451,-98.696,20.667,0.0,1.113,0.0,0.0,20.667,-98.696,35.451,74.431,-1.71,829478.25,74.431,35.451,-98.696,20.667,0.116,2.289,0.272,227.476,0.0,74.431,35.451,-98.696,20.667,,,,,,,,,,,,25.907,30,9,2019,0,273,40
2019-10-01,A0WN66FH,0.286,32.578,0.817,22.16,-0.0,1.055,-0.0,0.278,72.166,57.408,-102.351,16.282,-0.0,0.04,2671.486,2951.745,829258.688,70.48,57.699,-102.36,16.245,,,,,,,,,,,,,0.0,1.112,0.0,0.278,16.282,-102.351,57.408,72.166,-1.614,829289.047,72.166,57.408,-102.351,16.282,0.117,2.839,0.338,228.514,0.278,72.166,57.408,-102.351,16.282,0.278,64740.239,3765.821,73150.386,2765.821,4.407,0.262,72.162,57.407,-102.423,16.283,48.14,1,10,2019,1,274,40


In [88]:
test_df.head()

Unnamed: 0_level_0,device,site_latitude,site_longitude,humidity,temp_mean,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_CO_column_number_density,CarbonMonoxide_H2O_column_number_density,CarbonMonoxide_cloud_height,CarbonMonoxide_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,CarbonMonoxide_sensor_zenith_angle,CarbonMonoxide_solar_azimuth_angle,CarbonMonoxide_solar_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_tropopause_pressure,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_azimuth_angle,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_solar_azimuth_angle,NitrogenDioxide_solar_zenith_angle,Formaldehyde_tropospheric_HCHO_column_number_density,Formaldehyde_tropospheric_HCHO_column_number_density_amf,Formaldehyde_HCHO_slant_column_number_density,Formaldehyde_cloud_fraction,Formaldehyde_solar_zenith_angle,Formaldehyde_solar_azimuth_angle,Formaldehyde_sensor_zenith_angle,Formaldehyde_sensor_azimuth_angle,UvAerosolIndex_absorbing_aerosol_index,UvAerosolIndex_sensor_altitude,UvAerosolIndex_sensor_azimuth_angle,UvAerosolIndex_sensor_zenith_angle,UvAerosolIndex_solar_azimuth_angle,UvAerosolIndex_solar_zenith_angle,Ozone_O3_column_number_density,Ozone_O3_column_number_density_amf,Ozone_O3_slant_column_number_density,Ozone_O3_effective_temperature,Ozone_cloud_fraction,Ozone_sensor_azimuth_angle,Ozone_sensor_zenith_angle,Ozone_solar_azimuth_angle,Ozone_solar_zenith_angle,Cloud_cloud_fraction,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,Day,Month,Year,DayOfWeek,DayOfYear,Week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
2020-08-20,A0WN66FH,0.286,32.578,0.792,23.349,0.0,0.858,0.0,0.21,75.368,24.837,-56.408,21.97,0.0,0.048,2494.005,1959.492,829581.75,72.628,25.045,-56.395,21.961,,,,,,,,,,,,,0.0,1.083,0.0,0.21,21.97,-56.408,24.837,75.368,-2.071,829601.562,75.368,24.837,-56.408,21.97,0.123,2.201,0.275,234.362,0.21,75.368,24.837,-56.408,21.97,0.211,66601.727,3553.709,75593.211,2553.709,3.097,0.226,75.368,24.837,-56.408,21.97,20,8,2020,3,233,34
2020-08-21,A0WN66FH,0.286,32.578,0.79,22.989,,,,,,,,,,0.029,3169.188,4631.772,829368.0,71.077,51.419,-49.853,18.125,,,,,,,,,,,,,0.0,0.896,0.0,0.465,18.107,-49.66,51.518,72.781,-2.773,829396.375,72.781,51.518,-49.66,18.107,0.12,2.639,0.322,224.725,0.466,72.781,51.518,-49.66,18.107,0.465,41635.555,7425.758,47216.414,6425.758,3.796,0.246,72.781,51.518,-49.66,18.107,21,8,2020,4,234,34
2020-08-22,A0WN66FH,0.286,32.578,0.827,22.844,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,22,8,2020,5,235,34
2020-08-23,A0WN66FH,0.286,32.578,0.822,22.601,0.0,0.438,0.0,0.174,-98.348,52.177,-68.687,31.503,0.0,0.037,2317.106,910.123,829847.875,-96.662,52.1,-68.696,31.472,0.0,0.0,0.0,0.0,8613.727,-1.824,0.081,829885.438,-98.348,52.177,-68.687,31.503,0.0,0.97,0.0,0.174,31.503,-68.687,52.177,-98.348,-1.824,829885.438,-98.348,52.177,-68.687,31.503,0.121,2.76,0.34,233.692,0.174,-98.348,52.177,-68.687,31.503,,,,,,,,,,,,23,8,2020,6,236,34
2020-08-24,A0WN66FH,0.286,32.578,0.894,20.56,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.926,829800.875,-100.719,26.056,-66.197,27.1,0.124,2.257,0.285,228.58,1.0,-100.719,26.056,-66.197,27.1,1.0,45908.11,6630.184,51949.321,5630.184,17.573,0.259,-100.719,26.056,-66.197,27.1,24,8,2020,0,237,35


In [89]:
train_df.describe()

Unnamed: 0,site_latitude,site_longitude,humidity,temp_mean,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_CO_column_number_density,CarbonMonoxide_H2O_column_number_density,CarbonMonoxide_cloud_height,CarbonMonoxide_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,CarbonMonoxide_sensor_zenith_angle,CarbonMonoxide_solar_azimuth_angle,CarbonMonoxide_solar_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_tropopause_pressure,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_azimuth_angle,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_solar_azimuth_angle,NitrogenDioxide_solar_zenith_angle,Formaldehyde_tropospheric_HCHO_column_number_density,Formaldehyde_tropospheric_HCHO_column_number_density_amf,Formaldehyde_HCHO_slant_column_number_density,Formaldehyde_cloud_fraction,Formaldehyde_solar_zenith_angle,Formaldehyde_solar_azimuth_angle,Formaldehyde_sensor_zenith_angle,Formaldehyde_sensor_azimuth_angle,UvAerosolIndex_absorbing_aerosol_index,UvAerosolIndex_sensor_altitude,UvAerosolIndex_sensor_azimuth_angle,UvAerosolIndex_sensor_zenith_angle,UvAerosolIndex_solar_azimuth_angle,UvAerosolIndex_solar_zenith_angle,Ozone_O3_column_number_density,Ozone_O3_column_number_density_amf,Ozone_O3_slant_column_number_density,Ozone_O3_effective_temperature,Ozone_cloud_fraction,Ozone_sensor_azimuth_angle,Ozone_sensor_zenith_angle,Ozone_solar_azimuth_angle,Ozone_solar_zenith_angle,Cloud_cloud_fraction,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,pm2_5,Day,Month,Year,DayOfWeek,DayOfYear,Week
count,9923.0,9923.0,9923.0,9903.0,4291.0,4291.0,4291.0,4291.0,4291.0,4291.0,4291.0,4291.0,4291.0,5463.0,5463.0,5463.0,5463.0,5463.0,5463.0,5463.0,5463.0,3005.0,3005.0,3005.0,3005.0,3005.0,3005.0,3005.0,3005.0,3005.0,3005.0,3005.0,3005.0,5277.0,5277.0,5277.0,5277.0,5277.0,5277.0,5277.0,5277.0,9588.0,9588.0,9588.0,9588.0,9588.0,9588.0,9387.0,9387.0,9387.0,9387.0,9387.0,9387.0,9387.0,9387.0,9387.0,8414.0,8414.0,8414.0,8414.0,8414.0,8414.0,8414.0,8414.0,8414.0,8414.0,8414.0,9923.0,9923.0,9923.0,9923.0,9923.0,9923.0,9923.0
mean,0.243,32.472,0.791,21.571,0.0,0.791,0.0,0.147,-17.585,37.886,-86.495,28.039,0.0,0.035,2704.027,2327.034,829629.139,-14.789,37.209,-85.284,28.004,0.0,0.0,0.0,0.0,8669.07,-1.651,0.088,829664.405,-20.506,36.694,-83.369,28.408,0.0,1.092,0.0,0.198,28.061,-86.659,39.514,-17.736,-1.407,829639.713,-15.62,40.118,-86.777,27.72,0.116,2.588,0.304,228.463,0.455,-17.495,39.555,-87.249,27.856,0.48,50048.337,6334.871,56017.988,5405.379,17.944,0.269,-17.784,40.69,-86.994,27.792,57.107,15.551,6.228,2019.692,2.984,174.242,25.334
std,0.332,0.546,0.158,4.061,0.0,0.198,0.0,0.086,86.752,19.323,39.627,6.474,0.0,0.006,1118.013,1392.642,203.984,83.05,18.494,39.856,6.37,0.0,0.0,0.0,0.0,828.532,0.501,0.049,201.689,86.661,19.328,40.297,6.216,0.0,0.294,0.0,0.125,6.628,39.749,19.145,86.588,0.7,229.328,86.605,19.412,40.278,6.855,0.004,0.398,0.047,3.997,0.339,86.581,19.23,39.759,6.842,0.333,18886.902,3172.279,19694.712,3073.16,33.709,0.052,86.454,19.074,40.092,6.906,27.483,8.762,3.381,0.462,1.994,102.926,14.632
min,-1.245,29.989,0.0,0.0,-0.001,0.346,-0.001,0.0,-105.714,0.101,-159.899,9.757,-0.0,0.019,790.617,-200.152,829145.5,-98.192,1.108,-157.293,9.831,-0.0,-0.0,0.0,0.0,7311.869,-3.369,0.0,829180.062,-105.714,0.101,-157.751,9.757,-0.001,0.372,-0.001,0.0,8.703,-160.285,0.101,-106.86,-6.439,829156.687,-106.86,0.101,-160.321,8.213,0.106,2.039,0.227,187.585,0.0,-106.86,0.101,-160.285,8.389,0.0,12711.389,1150.15,15003.535,1150.15,1.25,0.012,-106.727,0.101,-160.285,8.387,1.16,1.0,1.0,2019.0,0.0,1.0,1.0
25%,0.289,32.554,0.771,21.233,-0.0,0.659,-0.0,0.084,-99.641,22.521,-126.766,23.982,-0.0,0.031,2143.695,1225.931,829466.25,-97.111,22.743,-126.567,24.03,0.0,0.0,0.0,0.0,8613.726,-2.008,0.054,829523.25,-99.694,22.207,-125.381,24.876,0.0,0.919,-0.0,0.104,24.005,-126.987,23.388,-99.61,-1.887,829442.784,-99.625,24.257,-126.149,23.3,0.113,2.229,0.263,226.204,0.163,-99.636,23.967,-126.1,23.631,0.187,34430.057,3948.41,39480.238,2952.134,4.791,0.24,-99.613,26.535,-126.529,23.377,38.028,8.0,3.0,2019.0,1.0,87.0,13.0
50%,0.315,32.59,0.823,22.272,0.0,0.785,0.0,0.148,-97.302,42.489,-81.917,28.463,0.0,0.033,2468.993,2324.655,829671.431,-74.327,38.737,-80.819,28.298,0.0,0.0,0.0,0.0,8613.738,-1.665,0.083,829700.938,-97.34,38.214,-75.658,28.548,0.0,1.036,0.0,0.187,28.481,-82.303,43.115,-97.293,-1.421,829691.375,-97.284,43.742,-85.028,28.022,0.116,2.48,0.293,228.547,0.359,-97.296,43.116,-85.815,28.337,0.384,50417.75,5768.341,57283.977,4768.781,6.775,0.268,-97.295,45.017,-85.783,28.024,51.93,15.0,6.0,2020.0,3.0,170.0,25.0
75%,0.355,32.618,0.869,23.314,0.0,0.907,0.0,0.217,73.653,55.985,-49.346,32.45,0.0,0.038,2857.763,3442.47,829796.688,71.178,53.457,-47.585,32.549,0.0,0.0,0.0,0.0,8613.768,-1.302,0.115,829835.063,73.678,53.193,-46.966,32.821,0.0,1.189,0.0,0.292,32.517,-50.265,56.693,73.619,-0.944,829830.449,73.527,57.373,-49.749,32.732,0.118,2.871,0.342,230.909,0.763,73.583,56.693,-50.359,32.774,0.8,63413.425,8665.13,71602.27,7665.13,14.08,0.295,72.933,57.407,-50.007,32.795,72.033,23.0,9.0,2020.0,5.0,263.0,37.0
max,0.391,32.753,1.0,29.735,0.002,1.633,0.002,0.3,84.743,66.025,-20.703,41.397,0.001,0.07,12919.685,4989.29,830182.375,72.691,64.558,-23.272,41.66,0.0,0.0,0.0,0.0,13020.412,0.105,0.486,830204.733,84.743,66.236,-20.703,41.397,0.001,2.363,0.001,0.551,41.41,-20.703,66.025,84.743,1.012,830238.569,84.792,66.449,-20.657,41.993,0.126,3.548,0.429,245.794,1.0,84.792,66.239,-20.703,41.993,1.0,88330.359,15000.0,88513.484,14000.0,250.0,1.0,84.802,66.234,-20.703,41.993,440.92,31.0,12.0,2020.0,6.0,365.0,52.0


In [None]:
test_df.describe()

In [None]:
#The cardinality of each catgorical feature (Training)
cat_cols = train_df.columns
for col in cat_cols:
    print(col, train_df[col].nunique())

In [None]:
#The cardinality of each catgorical feature (Testing)
cat_cols = test_df.columns
for col in cat_cols:
    print(col, test_df[col].nunique())

# Data Preprocessing

# Missing Data 

In [None]:
#missing data percentage (Training)
total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(60)

In [None]:
#missing data percentage (Testing)
total = test_df.isnull().sum().sort_values(ascending=False)
percent_1 = test_df.isnull().sum()/test_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

In [90]:
train_df = train_df.bfill().ffill()
test_df = test_df.bfill().ffill()

# Feature Engineering

## - Lags Features

In [91]:
def LAG(data,LagFeature,shift=1,NewFeatures=[]) :
    data[NewFeatures[0]]   = data[LagFeature]  - data[LagFeature].shift(shift)
    data[NewFeatures[1]]   = data[LagFeature].shift(shift)

num_feats = train_df.columns
num_feats = num_feats.drop(['Week','DayOfYear','DayOfWeek','Year','Month','Day','pm2_5','temp_mean','humidity','site_longitude','site_latitude','device'])

for feature in num_feats:
    LAG(train_df,LagFeature=f'{feature}',shift=1,NewFeatures=[f'{feature}_diff_Lag1',f'{feature}_Lag1'])
    LAG(test_df,LagFeature=f'{feature}',shift=1,NewFeatures=[f'{feature}_diff_Lag1',f'{feature}_Lag1'])

## - Combination Between Time Features

In [92]:
for dataset in (train_df,test_df):
    dataset['Year_Month'] = dataset['Year'].astype(str) + '-' + dataset['Month'].astype(str)
    dataset['Year_Week'] = dataset['Year'].astype(str) + '-' + dataset['Week'].astype(str)
    dataset['Year_Month_Day'] = dataset['Year'].astype(str) + '-' + dataset['Month'].astype(str) + '-' + dataset['Day'].astype(str)
    
feats = ['Year_Month','Year_Week','Year_Month_Day']
train_df,test_df = label_enc(train_df,test_df,feats)

Year_Month
Year_Week
Year_Month_Day


## - Aggregations Features

In [93]:
DevicePM2_5Mean = dict(train_df.groupby('device')['pm2_5'].mean())
DevicePM2_5Std = dict(train_df.groupby('device')['pm2_5'].std())
DevicePM2_5Min = dict(train_df.groupby('device')['pm2_5'].min())
DevicePM2_5Max = dict(train_df.groupby('device')['pm2_5'].max())

for dataset in (train_df,test_df):
    dataset['DevicePM2_5Mean'] = dataset['device'].map(DevicePM2_5Mean)
    dataset['DevicePM2_5Std'] = dataset['device'].map(DevicePM2_5Std)
    dataset['DevicePM2_5Min'] = dataset['device'].map(DevicePM2_5Min)
    dataset['DevicePM2_5Max'] = dataset['device'].map(DevicePM2_5Max)

In [94]:
def Agg(Feature):
    for dataset in (train_df,test_df):
        dataset[f'{Feature}PerMonth'] = dataset['Month'].map(dict(dataset.groupby('Month')[Feature].mean()))
        dataset[f'{Feature}PerWeek'] = dataset['Year_Week'].map(dict(dataset.groupby('Year_Week')[Feature].mean()))
        dataset[f'{Feature}PerDay'] = dataset['Year_Month_Day'].map(dict(dataset.groupby('Year_Month_Day')[Feature].mean()))
        
        dataset[f'{Feature}Month_std'] = dataset['Month'].map(dict(dataset.groupby('Month')[Feature].std()))
        dataset[f'{Feature}Week_std'] = dataset['Year_Week'].map(dict(dataset.groupby('Year_Week')[Feature].std()))
        dataset[f'{Feature}Day_std'] = dataset['Year_Month_Day'].map(dict(dataset.groupby('Year_Month_Day')[Feature].std()))
        
        dataset[f'{Feature}Month_min'] = dataset['Month'].map(dict(dataset.groupby('Month')[Feature].min()))
        dataset[f'{Feature}Week_min'] = dataset['Year_Week'].map(dict(dataset.groupby('Year_Week')[Feature].min()))
        dataset[f'{Feature}Day_min'] = dataset['Year_Month_Day'].map(dict(dataset.groupby('Year_Month_Day')[Feature].min()))
       
        dataset[f'{Feature}Month_max'] = dataset['Month'].map(dict(dataset.groupby('Month')[Feature].max()))
        dataset[f'{Feature}Week_max'] = dataset['Year_Week'].map(dict(dataset.groupby('Year_Week')[Feature].max()))
        dataset[f'{Feature}Day_max'] = dataset['Year_Month_Day'].map(dict(dataset.groupby('Year_Month_Day')[Feature].max()))
        
Agg('temp_mean')
Agg('humidity')

In [95]:
train_df.drop(['Year_Month','Year_Week','Year_Month_Day'],inplace=True,axis=1)
test_df.drop(['Year_Month','Year_Week','Year_Month_Day'],inplace=True,axis=1)

## - Rolling Features

In [96]:
def Rolling(feature):
    for dataset in (train_df,test_df):
        dataset[f'{feature}_Rolling_3'] = dataset[feature].rolling(3).mean()
        dataset[f'{feature}_Rolling_5'] = dataset[feature].rolling(5).mean()

        dataset[f"{feature}_rolling_mean_60"] = dataset.rolling(60).mean()[feature]
        dataset[f"{feature}_rolling_max_60"] = dataset.rolling(60).max()[feature]
        dataset[f"{feature}_rolling_min_60"] = dataset.rolling(60).min()[feature]

        dataset[f"{feature}_rolling_mean_30"] = dataset.rolling(30).mean()[feature]
        dataset[f"{feature}_rolling_max_30"] = dataset.rolling(30).max()[feature]
        dataset[f"{feature}_rolling_min_30"] = dataset.rolling(30).min()[feature]

        dataset[f"{feature}_rolling_mean_10"] = dataset.rolling(10).mean()[feature]
        dataset[f"{feature}_rolling_max_10"] = dataset.rolling(10).max()[feature]
        dataset[f"{feature}_rolling_min_10"] = dataset.rolling(10).min()[feature]

Rolling('temp_mean')
Rolling('humidity')

## - Polar Coordinates

In [97]:
def Polar(X,y, a = 0, b = 0): # a and b represnt the center
    r = np.sqrt((X-a)**2 + (y-b)**2)
    phi = np.arctan2((y-a), (X-b))
    return r, phi

train_df['R'], train_df['Phi'] = Polar(train_df["site_latitude"],train_df["site_longitude"])
test_df['R'], test_df['Phi'] = Polar(test_df["site_latitude"],test_df["site_longitude"])

## - Foureier Frequnecies and Amplitudes For Features That Contain Seasonality

In [98]:
freq2_dict_no_log = dict()
freq3_dict_no_log = dict()

amp2_dict_no_log = dict()
amp3_dict_no_log = dict()

for feat_1 in ('Year','Month','Day'):
    for feat_2 in ('temp_mean', 'humidity'):
        for i in range(min(train_df[feat_1].unique()), max(train_df[feat_1].unique()) + 1):

            a = train_df.loc[train_df[feat_1]==i]
            a_sales = a[feat_2]

            Y = np.fft.fft(a_sales.values)
            Y = abs(Y)
            freq = np.fft.fftfreq(len(Y), 1)

            intercept_index = np.argmax(Y)
            Y = np.delete(Y, intercept_index)
            freq = np.delete(freq, intercept_index)

            amplitude_1_index = np.argmax(Y)
            amplitude_1 = Y[amplitude_1_index]
            Y = np.delete(Y, amplitude_1_index)
            freq_1 = freq[amplitude_1_index]
            freq = np.delete(freq, amplitude_1_index)

            amplitude_2_index = np.argmax(Y)
            amplitude_2 = Y[amplitude_2_index]
            Y = np.delete(Y, amplitude_2_index)
            freq_2 = freq[amplitude_2_index]
            freq = np.delete(freq, amplitude_2_index)

            amplitude_3_index = np.argmax(Y)
            amplitude_3 = Y[amplitude_3_index]
            Y = np.delete(Y, amplitude_3_index)
            freq_3 = freq[amplitude_3_index]
            freq = np.delete(freq, amplitude_3_index)

            #Freq_1 is not included because it seems as it is always 0
            a[f'Frequency_2_{feat_1}_{feat_2}'] = freq_2
            a[f'Frequency_3_{feat_1}_{feat_2}'] = freq_3

            a[f'Amplitude_2_{feat_1}_{feat_2}'] = amplitude_2
            a[f'Amplitude_3_{feat_1}_{feat_2}'] = amplitude_3


            freq2_dict_no_log[i] = freq_2
            freq3_dict_no_log[i] = freq_3

            amp2_dict_no_log[i] = amplitude_2
            amp3_dict_no_log[i] = amplitude_3


            if i == min(train_df[feat_1].unique()):
                k = a
            else:
                k = pd.concat([k,a])
                
        train_df = k
        
        test_df[f'Frequency_2_{feat_1}_{feat_2}'] = test_df[feat_1].map(freq2_dict_no_log)
        test_df[f'Frequency_3_{feat_1}_{feat_2}'] = test_df[feat_1].map(freq3_dict_no_log)
        test_df[f'Amplitude_2_{feat_1}_{feat_2}'] = test_df[feat_1].map(amp2_dict_no_log)
        test_df[f'Amplitude_3_{feat_1}_{feat_2}'] = test_df[feat_1].map(amp3_dict_no_log)
        
        freq2_dict_no_log = dict()
        freq3_dict_no_log = dict()
        amp2_dict_no_log = dict()
        amp3_dict_no_log = dict()

## - Percentage change in Temperature and Humidity 

In [99]:
periods = [1, 3, 7, 14]
for period in periods:
    train_df.loc[:, f"PctChangeTemp_{period}"] = train_df["temp_mean"].pct_change(period)
    train_df.loc[:, f"PctChangeHumi_{period}"] = train_df["humidity"].pct_change(period)
    test_df.loc[:, f"PctChangeTemp_{period}"] = test_df["temp_mean"].pct_change(period)
    test_df.loc[:, f"PctChangeHumi_{period}"] = test_df["humidity"].pct_change(period)

## - Historic Volatility

In [100]:
periods = [3, 7, 14]
for period in periods:
    train_df.loc[:, f"volatility_temp_mean_{period}"] = train_df["temp_mean"].diff().rolling(period).std()
    test_df.loc[:, f"volatility_temp_mean_{period}"] = test_df["temp_mean"].diff().rolling(period).std()
    train_df.loc[:, f"volatility_humidity_{period}"] = train_df["humidity"].diff().rolling(period).std()
    test_df.loc[:, f"volatility_humidity_{period}"] = test_df["humidity"].diff().rolling(period).std()

# Encoding

In [101]:
train_df, test_df = label_enc(train_df,test_df,['device'])

device


## Sort The Data For Correct Validation

In [102]:
temp = train_df.sort_values(['date','device']).copy()
temp['date'] = temp.index
temp = temp.reset_index(drop=True)

In [103]:
train_df = train_df.sort_values(['date','device']).reset_index(drop=True) 
test_df = test_df.sort_values(['date','device']).reset_index(drop=True)

# Modeling

## Validation

In [104]:
params = {'objective':'RMSE','n_estimators': 2064, 'learning_rate': 0.03196897706232692, 
          'depth': 5, 'reg_lambda': 12.680808984686983}
CB = CatBoostRegressor(**params,verbose=0, random_state=42, task_type = 'CPU')
LogCB = TransformedTargetRegressor(CB, func = np.log1p, inverse_func = np.expm1)

In [105]:
print('Validating...')

X = train_df.drop('pm2_5',axis=1).values
y = train_df['pm2_5'].values

scores = []
for train_index, test_index in GroupTimeSeriesSplit(n_splits=4).split(X, y, groups=temp['date'].values):
    print(f'====Train On {len(train_index)} Samples ====Validate On {len(test_index)} Sapmles====')
    print(f'Train On Months [{temp.reset_index(drop=True).loc[train_index].date.min()} - {temp.reset_index(drop=True).loc[train_index].date.max()}]')
    print(f'Validate On Months [{temp.reset_index(drop=True).loc[test_index].date.min()} - {temp.reset_index(drop=True).loc[test_index].date.max()}]')
    X_Train, X_Test = X[train_index], X[test_index]
    y_Train, y_Test = y[train_index], y[test_index]
    LogCB.fit(X_Train,y_Train)
    vali = temp.loc[(temp.index >= test_index[0]) & (temp.index <= test_index[-1])]
    y_pred = LogCB.predict(X_Test)
    scores.append(mean_absolute_error(y_pred,y_Test))
    print(scores[-1])
    
#RMSE as Loss, CPU, Used TransformedTargetRegressor
# Fold 1: 18.316379697884738
# Fold 2: 18.14936480700997
# Fold 3: 15.91820261813899
# Fold 4: 12.967791513084244

Validating...
====Train On 202 Samples ====Validate On 806 Sapmles====
Train On Months [2019-03-14 00:00:00 - 2019-06-27 00:00:00]
Validate On Months [2019-06-28 00:00:00 - 2019-10-10 00:00:00]
18.316379697884738
====Train On 1008 Samples ====Validate On 2658 Sapmles====
Train On Months [2019-03-14 00:00:00 - 2019-10-10 00:00:00]
Validate On Months [2019-10-11 00:00:00 - 2020-01-23 00:00:00]
18.14936480700997
====Train On 3666 Samples ====Validate On 2966 Sapmles====
Train On Months [2019-03-14 00:00:00 - 2020-01-23 00:00:00]
Validate On Months [2020-01-24 00:00:00 - 2020-05-07 00:00:00]
15.91820261813899
====Train On 6632 Samples ====Validate On 3291 Sapmles====
Train On Months [2019-03-14 00:00:00 - 2020-05-07 00:00:00]
Validate On Months [2020-05-08 00:00:00 - 2020-08-20 00:00:00]
12.967791513084244


In [78]:
plotImp(LogCB,train_df.drop('pm2_5',axis=1))

AttributeError: 'TransformedTargetRegressor' object has no attribute 'feature_importances_'

## Inference

In [40]:
#Averaging the predictions of the same model with different seeds to get more consistent results
X = train_df.drop('pm2_5',axis=1)
y = train_df['pm2_5']

Predictions = pd.DataFrame()

for seed in range(20,46):
    print(f'Seed: {seed}')
    params = {'objective':'MAE','n_estimators': 2064, 'learning_rate': 0.03196897706232692, 'depth': 5, 'reg_lambda': 12.680808984686983}
    CB = CatBoostRegressor(**params,verbose=0, random_state=seed, task_type = 'CPU')
    LogCB = TransformedTargetRegressor(CB, func = np.log1p, inverse_func = np.expm1)  
    LogCB.fit(X, y)

    Predictions[f'Target_{seed}'] = LogCB.predict(test_df)
    Predictions[f'Target_{seed}'] = Predictions[f'Target_{seed}'] * 0.975  #A Correction Factor of 0.975
    
#Averaging the Results
Predictions['Mean'] = Predictions.mean(axis=1)
Predictions['HMean'] = Predictions.apply(stats.hmean, axis=1)
Predictions['GMean'] = Predictions.apply(stats.gmean, axis=1)

#Averaging the Second Results
FinalPred = Predictions[['Mean','HMean','GMean']].apply(stats.hmean,axis=1)

#Making the submission file
submission = pd.DataFrame({"Id": ID ,"pm2_5": FinalPred.values})
submission.to_csv('AirQualityPrediction.csv',index=False)

Seed: 20
Seed: 21
Seed: 22
Seed: 23
Seed: 24
Seed: 25
Seed: 26
Seed: 27
Seed: 28
Seed: 29
Seed: 30
Seed: 31
Seed: 32
Seed: 33
Seed: 34
Seed: 35
Seed: 36
Seed: 37
Seed: 38
Seed: 39
Seed: 40
Seed: 41
Seed: 42
Seed: 43
Seed: 44
Seed: 45
