In [29]:
import pandas as pd
import numpy as np
import mlflow

import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error, mean_absolute_error

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
train_df = pd.read_csv('data/Train.csv', parse_dates=["Date"])

In [3]:
train_df.shape

(30557, 82)

In [4]:
# For train
for col in train_df.columns:
    if train_df[col].isnull().any():
        empty_rows = train_df.loc[train_df[col].isnull()]
        for index, row in empty_rows.iterrows():
            mean = (row['target_min']+row['target_max'])/2
            train_df.at[index, col] = mean

In [5]:
train_df.head()

Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,L3_NO2_sensor_altitude,L3_NO2_sensor_azimuth_angle,L3_NO2_sensor_zenith_angle,L3_NO2_solar_azimuth_angle,L3_NO2_solar_zenith_angle,L3_NO2_stratospheric_NO2_column_number_density,L3_NO2_tropopause_pressure,L3_NO2_tropospheric_NO2_column_number_density,L3_O3_O3_column_number_density,L3_O3_O3_effective_temperature,L3_O3_cloud_fraction,L3_O3_sensor_azimuth_angle,L3_O3_sensor_zenith_angle,L3_O3_solar_azimuth_angle,L3_O3_solar_zenith_angle,L3_CO_CO_column_number_density,L3_CO_H2O_column_number_density,L3_CO_cloud_height,L3_CO_sensor_altitude,L3_CO_sensor_azimuth_angle,L3_CO_sensor_zenith_angle,L3_CO_solar_azimuth_angle,L3_CO_solar_zenith_angle,L3_HCHO_HCHO_slant_column_number_density,L3_HCHO_cloud_fraction,L3_HCHO_sensor_azimuth_angle,L3_HCHO_sensor_zenith_angle,L3_HCHO_solar_azimuth_angle,L3_HCHO_solar_zenith_angle,L3_HCHO_tropospheric_HCHO_column_number_density,L3_HCHO_tropospheric_HCHO_column_number_density_amf,L3_CLOUD_cloud_base_height,L3_CLOUD_cloud_base_pressure,L3_CLOUD_cloud_fraction,L3_CLOUD_cloud_optical_depth,L3_CLOUD_cloud_top_height,L3_CLOUD_cloud_top_pressure,L3_CLOUD_sensor_azimuth_angle,L3_CLOUD_sensor_zenith_angle,L3_CLOUD_solar_azimuth_angle,L3_CLOUD_solar_zenith_angle,L3_CLOUD_surface_albedo,L3_AER_AI_absorbing_aerosol_index,L3_AER_AI_sensor_altitude,L3_AER_AI_sensor_azimuth_angle,L3_AER_AI_sensor_zenith_angle,L3_AER_AI_solar_azimuth_angle,L3_AER_AI_solar_zenith_angle,L3_SO2_SO2_column_number_density,L3_SO2_SO2_column_number_density_amf,L3_SO2_SO2_slant_column_number_density,L3_SO2_absorbing_aerosol_index,L3_SO2_cloud_fraction,L3_SO2_sensor_azimuth_angle,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.5,92,11.0,60.200001,0.00804,18.51684,1.996377,-1.227395,7.4e-05,0.000156,-1.23133,0.006507,840209.874619,76.537512,38.634284,-61.736719,22.358167,5.7e-05,6156.074219,1.7e-05,0.119095,234.151102,0.0,76.536426,38.593017,-61.752587,22.363665,0.02108,883.332451,267.017184,840138.461052,74.543393,38.622451,-61.789016,22.379054,-1e-05,0.0,76.536426,38.593017,-61.752587,22.363665,6.4e-05,0.566828,38.0,38.0,0.0,38.0,38.0,38.0,76.536426,38.593017,-61.752587,22.363665,38.0,-1.23133,840209.874619,76.537512,38.634284,-61.736719,22.358167,-0.000127,0.312521,-4e-05,-1.861476,0.0,76.536426,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.6,48.799999,0.00839,22.546533,3.33043,-1.188108,7.6e-05,0.000197,-1.082553,0.01836,840772.941995,-14.708036,59.624912,-67.693509,28.614804,5.5e-05,6156.074219,2.1e-05,0.115179,233.313706,0.059433,-14.708036,59.624912,-67.693509,28.614804,0.022017,1148.985447,61.216687,841116.763051,-57.0152,61.402626,-74.457583,33.089495,0.000114,0.059433,-14.708036,59.624912,-67.693509,28.614804,0.000171,0.858446,175.019862,99354.162958,0.059358,5.958538,175.072145,99353.672374,-14.708036,59.624912,-67.693509,28.614804,0.315403,-1.082553,840772.941995,-14.708036,59.624912,-67.693509,28.614804,0.00015,0.433957,5e-05,-1.452612,0.059433,-14.708036,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
2,010Q650 X 2020-01-04,2020-01-04,010Q650,24.0,8.0,56.0,1181.96,96,16.4,33.400002,0.0075,27.03103,5.065727,3.500559,6.7e-05,0.00017,-1.001242,0.015904,841410.713456,-105.201338,49.839714,-78.342701,34.296977,5.9e-05,7311.869141,7e-06,0.115876,232.233484,0.082063,-105.201338,49.839714,-78.342701,34.296977,0.020677,1109.347101,134.700335,841319.860448,-103.494458,49.924556,-78.355069,34.308941,2.7e-05,0.082063,-105.201338,49.839714,-78.342701,34.296977,0.000124,0.910536,275.903991,98118.935248,0.082247,5.75576,508.977723,95671.383578,-105.201338,49.839714,-78.342701,34.296977,0.307463,-1.001241,841410.713456,-105.201338,49.839714,-78.342701,34.296977,0.00015,0.356925,5.3e-05,-1.57295,0.082063,-105.201338,49.839714,-78.342701,34.296977,32.0,32.0,32.0,32.0,32.0,32.0,32.0
3,010Q650 X 2020-01-05,2020-01-05,010Q650,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,0.00391,23.971857,3.004001,1.099468,8.3e-05,0.000175,-0.777019,0.055765,841103.242368,-104.334056,29.180977,-73.896572,30.545393,6e-05,11205.388333,2.3e-05,0.141557,230.936229,0.121261,-104.334066,29.181258,-73.896588,30.545446,0.021207,1061.570832,474.821444,841036.1929,-101.955931,29.21497,-73.914571,30.544513,2.3e-05,0.121261,-104.334066,29.181258,-73.896588,30.545446,8.1e-05,1.132571,383.692363,97258.462755,0.121555,6.246885,495.380407,96232.486418,-104.334066,29.181258,-73.896588,30.545446,0.279637,-0.777023,841103.242368,-104.334056,29.180977,-73.896572,30.545393,0.000227,0.584522,0.00011,-1.239317,0.121261,-104.334066,29.181258,-73.896588,30.545446,32.5,32.5,32.5,32.5,32.5,32.5,32.5
4,010Q650 X 2020-01-06,2020-01-06,010Q650,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,0.00535,16.816309,2.621787,2.670559,7e-05,0.000142,0.366323,0.02853,840763.055499,58.850179,0.797294,-68.61248,26.899694,6.2e-05,11205.372845,9e-06,0.126369,232.499132,0.037919,58.850179,0.797294,-68.61248,26.899694,0.037766,1044.247425,926.92631,840710.342206,15.499573,1.389085,-68.622851,26.906207,3.7e-05,0.037919,58.850179,0.797294,-68.61248,26.899694,0.00014,0.649359,4314.483127,59875.027957,0.037008,4.205691,5314.483207,52561.523079,58.850179,0.797294,-68.61248,26.899694,0.238241,0.366324,840763.055499,58.850179,0.797294,-68.61248,26.899694,0.00039,0.408047,0.000159,0.202489,0.037919,58.850179,0.797294,-68.61248,26.899694,30.5,30.5,30.5,30.5,30.5,30.5,30.5


In [21]:
def preprocess_train(train_df):
    train_df=train_df.drop(columns=['target_min', 'target_max', 'target_variance', 'target_count'], axis=1)
    
    train_df['saleyear'] = train_df.Date.dt.year
    train_df['salemonth'] = train_df.Date.dt.month
    train_df['saleday'] = train_df.Date.dt.day
    train_df['saledayofweek'] = train_df.Date.dt.day_of_week
    train_df['saledayofyear'] = train_df.Date.dt.day_of_year
    
    train_df['placeID_freq'] = train_df['Place_ID'].map(train_df['Place_ID'].value_counts())
    
    # Drop features that corrolate with other features
    train_df= train_df.drop(columns=['L3_SO2_SO2_column_number_density_amf','L3_NO2_stratospheric_NO2_column_number_density','L3_CLOUD_sensor_azimuth_angle','L3_HCHO_solar_azimuth_angle','L3_SO2_sensor_zenith_angle','L3_CH4_solar_zenith_angle','temperature_2m_above_ground','L3_NO2_absorbing_aerosol_index','L3_CO_sensor_azimuth_angle','L3_AER_AI_sensor_zenith_angle','L3_AER_AI_solar_azimuth_angle','L3_AER_AI_solar_zenith_angle','L3_CO_sensor_zenith_angle','L3_HCHO_sensor_zenith_angle','L3_HCHO_tropospheric_HCHO_column_number_density_amf','L3_SO2_sensor_azimuth_angle','L3_SO2_absorbing_aerosol_index','L3_CH4_sensor_zenith_angle','specific_humidity_2m_above_ground','L3_NO2_NO2_slant_column_number_density','L3_NO2_cloud_fraction','L3_NO2_tropopause_pressure','L3_O3_sensor_azimuth_angle','L3_O3_cloud_fraction','L3_CLOUD_sensor_zenith_angle','L3_CLOUD_solar_azimuth_angle','L3_CLOUD_solar_zenith_angle','L3_CO_sensor_altitude','L3_HCHO_sensor_azimuth_angle','L3_CO_solar_zenith_angle','L3_HCHO_cloud_fraction','L3_HCHO_solar_zenith_angle','L3_HCHO_tropospheric_HCHO_column_number_density','L3_SO2_SO2_column_number_density','L3_CLOUD_cloud_top_height','L3_CLOUD_cloud_top_pressure','L3_AER_AI_absorbing_aerosol_index','L3_AER_AI_sensor_azimuth_angle','L3_SO2_SO2_slant_column_number_density','L3_SO2_cloud_fraction','L3_SO2_solar_zenith_angle','L3_CH4_aerosol_height','L3_CH4_sensor_azimuth_angle','L3_CH4_solar_azimuth_angle','Place_ID X Date','Date'], axis=1)
    
    # by removing data above the 95th percentile
    removed_col = ['precipitable_water_entire_atmosphere','L3_NO2_sensor_zenith_angle', 'L3_NO2_solar_azimuth_angle','L3_NO2_solar_zenith_angle','L3_NO2_tropospheric_NO2_column_number_density','L3_O3_O3_column_number_density','L3_O3_O3_effective_temperature','L3_O3_solar_zenith_angle','L3_CO_CO_column_number_density','L3_CO_solar_azimuth_angle','L3_CLOUD_cloud_base_pressure','L3_CLOUD_surface_albedo','L3_SO2_solar_azimuth_angle','L3_CH4_aerosol_optical_depth']
    col = ['u_component_of_wind_10m_above_ground', 'v_component_of_wind_10m_above_ground', 'L3_NO2_sensor_azimuth_angle', 'L3_O3_sensor_zenith_angle', 'L3_O3_solar_azimuth_angle','L3_CO_H2O_column_number_density', 'L3_CO_cloud_height','L3_HCHO_HCHO_slant_column_number_density', 'L3_CLOUD_cloud_base_height','L3_CLOUD_cloud_fraction', 'L3_CLOUD_cloud_optical_depth', 'L3_CH4_CH4_column_volume_mixing_ratio_dry_air']
    for co in col:
        q_cutoff = train_df[co].quantile(0.95)
        mask = train_df[co] < q_cutoff

        train_df = train_df[mask]
        
    cat = ['Place_ID']
    train_df[cat] = train_df[cat].astype(str)
#     train_df=train_df.drop(columns=cat, axis=1)

    train = train_df
    
    trained_df = train_df.drop(columns='target', axis=1)
    train_numerical_features=trained_df.select_dtypes(include=['float']).columns
    train_numerical = []
    for num in train_numerical_features:
        train_numerical.append(num)

    train_categorical_features=train_df.select_dtypes(include=['object']).columns
    train_categorical= []
    for cat in train_categorical_features:
        train_categorical.append(cat)

    dicts = train_df[train_categorical + train_numerical].to_dict(orient='records')
#     return train_df
    return dicts, train

# train_dict=preprocess_train(train_df)
train_dict, train= preprocess_train(train_df)

In [22]:
dv = DictVectorizer()

X = dv.fit_transform(train_dict)
Y = train['target']

In [27]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.3, random_state=2)
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(11549, 370) (11549,) (4950, 370) (4950,)


In [30]:
# Change max samples in RandomForestRegressor
model_rf = RandomForestRegressor()

In [31]:
%%time
# Cutting down the max number of samples each tree can see improves training time
model_rf.fit(X_train, y_train)

CPU times: user 2min 27s, sys: 66.6 ms, total: 2min 27s
Wall time: 2min 29s


In [32]:
from sklearn.metrics import mean_squared_error
val_preds = model_rf.predict(X_valid)
mse = mean_squared_error(y_valid, val_preds)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))

MSE:  411.3511771234343
RMSE:  20.281794228406774


In [33]:
import pickle

In [34]:
with open('models/random_forest.bin', 'wb') as f_out:
    pickle.dump((dv, model_rf), f_out)

In [35]:
# Load the model from the pickle file
with open('models/random_forest.bin', 'rb') as file:
    dv, reg = pickle.load(file)

In [36]:
from sklearn.metrics import mean_squared_error
val_preds = reg.predict(X_valid)
mse = mean_squared_error(y_valid, val_preds)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))

MSE:  411.3511771234343
RMSE:  20.281794228406774
