**EDA**

Here we clean the data:

1. remove unnecessary columns
2. change data types 
3. fill missing values 
4. round data 

In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
import sys
!{sys.executable} -m pip install xgboost



In [2]:
from xgboost import XGBRegressor

In [31]:
# read in the data
df = pd.read_csv("../data/Train.csv", sep = ',')

**Delete unwanted columns**

In [32]:
df.isna().sum()

Place_ID X Date                     0
Date                                0
Place_ID                            0
target                              0
target_min                          0
                                ...  
L3_CH4_aerosol_optical_depth    24765
L3_CH4_sensor_azimuth_angle     24765
L3_CH4_sensor_zenith_angle      24765
L3_CH4_solar_azimuth_angle      24765
L3_CH4_solar_zenith_angle       24765
Length: 82, dtype: int64

In [34]:
columns_to_delete = ['Place_ID X Date',
                     'Place_ID',
                     'Date',
                     'target_min', 
                     'target_max',
                     'target_variance', 
                     'target_count', 
                     #removing columns with 80% missing values
                     'L3_CH4_sensor_azimuth_angle', 
                     'L3_CH4_sensor_zenith_angle',
                     'L3_CH4_solar_azimuth_angle', 
                     'L3_CH4_solar_zenith_angle',
                     'L3_CH4_aerosol_optical_depth',
                     'L3_CH4_CH4_column_volume_mixing_ratio_dry_air',
                     'L3_CH4_aerosol_height'
                    ]

df_reduced = df.drop(columns = columns_to_delete, axis = 1)

**Split data into train and test sets 80/20 split**

In [36]:
RSEED = 9 

# Define x/y
X = df_reduced 
y = np.array(X.pop('target'))

# split train/test data
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    test_size = 0.2, 
                                                    random_state = RSEED)

**Check for remaining columns, data types and null values**

In [38]:
train_X.isna().sum()

precipitable_water_entire_atmosphere       0
relative_humidity_2m_above_ground          0
specific_humidity_2m_above_ground          0
temperature_2m_above_ground                0
u_component_of_wind_10m_above_ground       0
                                        ... 
L3_SO2_cloud_fraction                   5764
L3_SO2_sensor_azimuth_angle             5764
L3_SO2_sensor_zenith_angle              5764
L3_SO2_solar_azimuth_angle              5764
L3_SO2_solar_zenith_angle               5764
Length: 67, dtype: int64

In [39]:
df_reduced.corr()

Unnamed: 0,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,...,L3_AER_AI_solar_zenith_angle,L3_SO2_SO2_column_number_density,L3_SO2_SO2_column_number_density_amf,L3_SO2_SO2_slant_column_number_density,L3_SO2_absorbing_aerosol_index,L3_SO2_cloud_fraction,L3_SO2_sensor_azimuth_angle,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle
precipitable_water_entire_atmosphere,1.000000,0.197203,0.882341,0.733225,-0.195106,0.032502,-0.145308,-0.239584,0.037743,0.109698,...,-0.541639,-0.067858,0.330204,-0.104745,0.006507,0.035312,-0.000132,0.067864,0.032651,-0.529087
relative_humidity_2m_above_ground,0.197203,1.000000,0.219509,-0.260632,0.011186,0.118368,-0.134249,-0.118823,-0.055780,0.324565,...,0.285365,0.000194,0.123957,-0.013460,-0.167396,0.324243,-0.007036,-0.020469,0.049687,0.236697
specific_humidity_2m_above_ground,0.882341,0.219509,1.000000,0.816273,-0.191998,0.023091,-0.181148,-0.247529,-0.012826,-0.003219,...,-0.627122,-0.071769,0.386680,-0.101513,-0.003531,-0.055165,0.001297,0.076148,0.020177,-0.595630
temperature_2m_above_ground,0.733225,-0.260632,0.816273,1.000000,-0.143676,-0.010511,-0.153220,-0.238809,0.031876,-0.127252,...,-0.695437,-0.078338,0.244634,-0.106900,0.082904,-0.218556,0.003631,0.073909,0.007059,-0.666284
u_component_of_wind_10m_above_ground,-0.195106,0.011186,-0.191998,-0.143676,1.000000,0.145513,-0.043865,0.001337,-0.019238,0.043174,...,0.267196,0.006622,-0.123269,0.000426,-0.009792,0.070769,-0.007992,-0.008450,0.087557,0.229289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L3_SO2_cloud_fraction,0.035312,0.324243,-0.055165,-0.218556,0.070769,0.099912,0.021875,-0.008401,0.045455,0.743887,...,0.291531,-0.011579,0.141619,-0.026270,-0.105591,1.000000,-0.098965,0.204818,0.040373,0.324347
L3_SO2_sensor_azimuth_angle,-0.000132,-0.007036,0.001297,0.003631,-0.007992,0.001337,0.003261,-0.000554,-0.195558,-0.054617,...,-0.117427,0.004633,0.001911,0.011893,-0.184242,-0.098965,1.000000,-0.020885,0.014651,-0.129849
L3_SO2_sensor_zenith_angle,0.067864,-0.020469,0.076148,0.073909,-0.008450,-0.027155,-0.003311,0.118238,0.215107,0.086447,...,-0.054384,-0.008082,-0.009829,-0.012371,0.370667,0.204818,-0.020885,1.000000,0.230882,0.021546
L3_SO2_solar_azimuth_angle,0.032651,0.049687,0.020177,0.007059,0.087557,0.098119,-0.113574,-0.091441,0.211621,0.014935,...,0.013186,-0.002805,-0.112144,-0.006649,0.246763,0.040373,0.014651,0.230882,1.000000,-0.056467


**Interpolation class**

In [12]:
# Custom transformer for interpolation
class InterpolateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, method='linear', limit_direction='both'):
        self.method = method
        self.limit_direction = limit_direction

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.interpolate(method=self.method, limit_direction=self.limit_direction)

In [None]:
**Feature engineering pipeline**
1. Impute all missing values from all columns
2. Log transform all columns
3.  