## EDA ##

Here we clean the data:

1. remove unnecessary columns
2. change data types 
3. fill missing values 
4. round data 

In [6]:
# packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [7]:
# Load the raw training table; the file is comma-separated, which is the
# pandas default delimiter.
df = pd.read_csv("../data/Train.csv")

**Delete columns**

In [3]:
# Columns that are listed one by one.
individual_columns = [
    'Place_ID X Date',
    'target_min',
    'target_max',
    'target_variance',
    'target_count',
    'relative_humidity_2m_above_ground',
    'L3_NO2_NO2_slant_column_number_density',
    'L3_NO2_stratospheric_NO2_column_number_density',
    'L3_SO2_SO2_slant_column_number_density',
    'L3_CH4_CH4_column_volume_mixing_ratio_dry_air',
    'L3_CH4_aerosol_height',
    'L3_CH4_aerosol_optical_depth',
]

# removing all sensor data and zenith and azimuth angle data
# Every product below has the four sensor/solar x azimuth/zenith angle columns.
angle_columns = [
    f'L3_{product}_{source}_{angle}_angle'
    for product in ('NO2', 'O3', 'CO', 'HCHO', 'CLOUD', 'AER_AI', 'SO2', 'CH4')
    for source in ('sensor', 'solar')
    for angle in ('azimuth', 'zenith')
]
# Only these three products have a sensor altitude column in the drop list.
altitude_columns = [
    f'L3_{product}_sensor_altitude' for product in ('NO2', 'CO', 'AER_AI')
]

columns_to_delete = individual_columns + angle_columns + altitude_columns

# `columns=` already selects the axis, so no separate `axis` argument is needed.
df_reduced = df.drop(columns=columns_to_delete)

**Split the data into 80% train and 20% test data**

In [4]:
RSEED = 9

# Define x/y
# Build X and y as new objects instead of aliasing df_reduced and popping
# 'target' from it: the pop removed the column from df_reduced itself, so
# running this cell a second time failed with a KeyError.
X = df_reduced.drop(columns='target')
y = np.array(df_reduced['target'])

# split train/test data (20% held out, seeded for reproducibility)
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RSEED,
)

**Change data types**

In [5]:
# Parse 'Date' on the full feature frame and on both split frames.
# train_X / test_X were created by train_test_split before this cell runs, so
# converting only X would leave their 'Date' columns unparsed.
X['Date'] = pd.to_datetime(X['Date'])
train_X = train_X.assign(Date=pd.to_datetime(train_X['Date']))
test_X = test_X.assign(Date=pd.to_datetime(test_X['Date']))

In [6]:
# Disabled visual check of the missing-value pattern in X; uncomment to plot:
# sns.heatmap(X.isna())

**Baseline Model**

In [7]:
# impute missing values and transform
preprocessing = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy='mean')), 
    ('standard_scaler', StandardScaler())
])
preprocessing

In [8]:
# Apply the imputing/scaling pipeline to every column of X except 'Place_ID'
# and 'Date'; with ColumnTransformer's default remainder='drop' those two
# columns are not passed on to the model.
# NOTE(review): `preprocessing` is rebound here from the Pipeline to the
# ColumnTransformer that wraps it, so re-running only this cell nests the
# transformer inside itself. Re-run the previous cell first.
model_columns = X.columns.drop(['Place_ID', 'Date'])
preprocessing = ColumnTransformer(
    transformers=[('transformations', preprocessing, model_columns)]
)
preprocessing

In [9]:
# Baseline model: preprocessing followed by a depth-limited decision tree.
# random_state is fixed because DecisionTreeRegressor permutes the features at
# every split, so ties between equally good splits can otherwise be resolved
# differently from one fit to the next.
pipe_dectree = Pipeline([
    ('preprocessor', preprocessing),
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=RSEED))
])

In [10]:
# Fit the preprocessing steps and the decision tree on the training split only.
pipe_dectree.fit(train_X, train_y)

In [11]:
# RMSE on the training split.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
y_train_predicted = pipe_dectree.predict(train_X)
train_rmse = np.sqrt(mean_squared_error(train_y, y_train_predicted))
print("RMSE: {:.2f}".format(train_rmse))

RMSE: 31.21


In [12]:
# RMSE on the held-out test split.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
y_test_predicted = pipe_dectree.predict(test_X)
test_rmse = np.sqrt(mean_squared_error(test_y, y_test_predicted))
print("RMSE: {:.2f}".format(test_rmse))


RMSE: 38.42


In [196]:
y.max()  # largest value in the target array; the disabled "- y.min()" would give the range instead

815.0