In [1]:
import pandas as pd
import numpy as np
import mlflow

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
#from sklearn.naive_bayes import GaussianNB
# from sklearn import svm

import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# train_df = pd.read_csv('Trainv2.csv', parse_dates=["Date"])
train_df = pd.read_csv('data/Train.csv', parse_dates=["Date"])

In [3]:
# For test
def fill_dataframe(df):
    numerical_columns=df.select_dtypes(include=['float64','int']).columns
    for col in df.columns:
        if col in numerical_columns:
          # For automaticaly filling all columns with dtype(int or float) with mean
          median=df[col].median()
          df[col]=df[col].fillna(median)

# fill_dataframe(test_df)
fill_dataframe(train_df)

def preprocess_train(train_df):
    train_df=train_df.drop(columns=['target_min', 'target_max', 'target_variance', 'target_count'], axis=1)
    
    train_df['saleyear'] = train_df.Date.dt.year
    train_df['salemonth'] = train_df.Date.dt.month
    train_df['saleday'] = train_df.Date.dt.day
    train_df['saledayofweek'] = train_df.Date.dt.day_of_week
    train_df['saledayofyear'] = train_df.Date.dt.day_of_year
    
    train_df['placeID_freq'] = train_df['Place_ID'].map(train_df['Place_ID'].value_counts())
    
    # Drop features that corrolate with other features
    train_df= train_df.drop(columns=['L3_SO2_SO2_column_number_density_amf','L3_NO2_stratospheric_NO2_column_number_density','L3_CLOUD_sensor_azimuth_angle','L3_HCHO_solar_azimuth_angle','L3_SO2_sensor_zenith_angle','L3_CH4_solar_zenith_angle','temperature_2m_above_ground','L3_NO2_absorbing_aerosol_index','L3_CO_sensor_azimuth_angle','L3_AER_AI_sensor_zenith_angle','L3_AER_AI_solar_azimuth_angle','L3_AER_AI_solar_zenith_angle','L3_CO_sensor_zenith_angle','L3_HCHO_sensor_zenith_angle','L3_HCHO_tropospheric_HCHO_column_number_density_amf','L3_SO2_sensor_azimuth_angle','L3_SO2_absorbing_aerosol_index','L3_CH4_sensor_zenith_angle','specific_humidity_2m_above_ground','L3_NO2_NO2_slant_column_number_density','L3_NO2_cloud_fraction','L3_NO2_tropopause_pressure','L3_O3_sensor_azimuth_angle','L3_O3_cloud_fraction','L3_CLOUD_sensor_zenith_angle','L3_CLOUD_solar_azimuth_angle','L3_CLOUD_solar_zenith_angle','L3_CO_sensor_altitude','L3_HCHO_sensor_azimuth_angle','L3_CO_solar_zenith_angle','L3_HCHO_cloud_fraction','L3_HCHO_solar_zenith_angle','L3_HCHO_tropospheric_HCHO_column_number_density','L3_SO2_SO2_column_number_density','L3_CLOUD_cloud_top_height','L3_CLOUD_cloud_top_pressure','L3_AER_AI_absorbing_aerosol_index','L3_AER_AI_sensor_azimuth_angle','L3_SO2_SO2_slant_column_number_density','L3_SO2_cloud_fraction','L3_SO2_solar_zenith_angle','L3_CH4_aerosol_height','L3_CH4_sensor_azimuth_angle','L3_CH4_solar_azimuth_angle','Place_ID X Date','Date'], axis=1)
    
    # by removing data above the 95th percentile
    removed_col = ['precipitable_water_entire_atmosphere','L3_NO2_sensor_zenith_angle', 'L3_NO2_solar_azimuth_angle','L3_NO2_solar_zenith_angle','L3_NO2_tropospheric_NO2_column_number_density','L3_O3_O3_column_number_density','L3_O3_O3_effective_temperature','L3_O3_solar_zenith_angle','L3_CO_CO_column_number_density','L3_CO_solar_azimuth_angle','L3_CLOUD_cloud_base_pressure','L3_CLOUD_surface_albedo','L3_SO2_solar_azimuth_angle','L3_CH4_aerosol_optical_depth']
    col = ['u_component_of_wind_10m_above_ground', 'v_component_of_wind_10m_above_ground', 'L3_NO2_sensor_azimuth_angle', 'L3_O3_sensor_zenith_angle', 'L3_O3_solar_azimuth_angle','L3_CO_H2O_column_number_density', 'L3_CO_cloud_height','L3_HCHO_HCHO_slant_column_number_density', 'L3_CLOUD_cloud_base_height','L3_CLOUD_cloud_fraction', 'L3_CLOUD_cloud_optical_depth', 'L3_CH4_CH4_column_volume_mixing_ratio_dry_air']
    for co in col:
        q_cutoff = train_df[co].quantile(0.95)
        mask = train_df[co] < q_cutoff

        train_df = train_df[mask]
        
#     cat = ['Place_ID','salemonth','saleday', 'saledayofweek', 'saledayofyear','saleyear',]
    cat = ['Place_ID']
    train_df[cat] = train_df[cat].astype(str)
#     train_df=train_df.drop(columns=cat, axis=1)

    train = train_df
    
    trained_df = train_df.drop(columns='target', axis=1)
    train_numerical_features=trained_df.select_dtypes(include=['float']).columns
    train_numerical = []
    for num in train_numerical_features:
        train_numerical.append(num)

    train_categorical_features=train_df.select_dtypes(include=['object']).columns
    train_categorical= []
    for cat in train_categorical_features:
        train_categorical.append(cat)

    dicts = train_df[train_categorical + train_numerical].to_dict(orient='records')
#     return train_df
    return dicts, train

# train_dict=preprocess_train(train_df)
train_dict, train= preprocess_train(train_df)

In [4]:
# import re

# column_names = train.columns.tolist()

# filtered_columns = [col for col in column_names if re.search(r'L3_CLOUD', col)]

# print(filtered_columns)

In [5]:
dv = DictVectorizer()

X = dv.fit_transform(train_dict)
Y = train['target'].values

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.3, random_state=2)
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(11456, 370) (11456,) (4910, 370) (4910,)


In [6]:
# Change max samples in RandomForestRegressor
model_rf = RandomForestRegressor()

In [7]:
%%time
# Cutting down the max number of samples each tree can see improves training time
model_rf.fit(X_train, y_train)

CPU times: user 2min 32s, sys: 86.3 ms, total: 2min 33s
Wall time: 2min 33s


In [8]:
from sklearn.metrics import mean_squared_error
val_preds = model_rf.predict(X_valid)
mse = mean_squared_error(y_valid, val_preds)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))

MSE:  765.5693392613034
RMSE:  27.668923709846457


In [9]:
from sklearn.metrics import mean_squared_error
val_preds = model_rf.predict(X_valid)
mse = mean_squared_error(y_valid, val_preds)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))

MSE:  760.1708629415479
RMSE:  27.57119625517812


In [7]:
import lightgbm as lgb
from sklearn.decomposition import PCA, FastICA

In [9]:
feature=['precipitable_water_entire_atmosphere', 'relative_humidity_2m_above_ground', 'specific_humidity_2m_above_ground', 'temperature_2m_above_ground', 'u_component_of_wind_10m_above_ground', 'v_component_of_wind_10m_above_ground', 'L3_NO2_NO2_column_number_density', 'L3_SO2_SO2_column_number_density','L3_SO2_SO2_column_number_density_amf' ,'L3_SO2_SO2_slant_column_number_density', 'L3_NO2_tropospheric_NO2_column_number_density', 'L3_HCHO_tropospheric_HCHO_column_number_density', 'L3_CO_CO_column_number_density', 'L3_HCHO_HCHO_slant_column_number_density','L3_CO_H2O_column_number_density','L3_O3_O3_column_number_density','L3_NO2_NO2_slant_column_number_density']
train1=X_train.copy()
# test1=test.copy()
    
pca = PCA(random_state=42,n_components=1)
train_pca = pca.fit_transform(train1)
train['pca_feature'] = train_pca[:,0]
# test_pca = pca.transform(test1)
# test['pca_feature'] = test_pca[:,0]
     

TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.

In [8]:
param = {"random_state": 6,
          'metric' : 'rmse',
          'num_iterations': 1500}

In [9]:
trn_data = lgb.Dataset(X_train, y_train)

In [10]:
clf = lgb.train(param, trn_data)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8252
[LightGBM] [Info] Number of data points in the train set: 11456, number of used features: 331
[LightGBM] [Info] Start training from score 62.224302


In [11]:
oofs_df= clf.predict(X_valid)

In [12]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_valid, oofs_df)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))

MSE:  538.3197343930137
RMSE:  23.201718350006185


# Ensemble

In [11]:
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import StackingRegressor

In [12]:
rand_for_reg = RandomForestRegressor(n_estimators=10000,
                                     random_state=11)
lin_reg = LinearRegression()
bay_rig = BayesianRidge(n_iter=10000)
lass_reg = Lasso(random_state=11)
dec_tre_reg = DecisionTreeRegressor(random_state=11)
kn_reg = KNeighborsRegressor()
sgd_reg = SGDRegressor()


In [None]:
%%time
estimator_stack=([('model_1',rand_for_reg), ('model_2',lin_reg), ('model_3',bay_rig), ('model_4',lass_reg), ('model_5',dec_tre_reg), ('model_6',kn_reg), ('model_7', sgd_reg)])

reg_stack = StackingRegressor(estimators=estimator_stack,
                              final_estimator = RandomForestRegressor())
reg_stack.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
val_preds = reg_stack.predict(X_valid)
mse = mean_squared_error(y_valid, val_preds)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))

# Test

In [13]:
test_df = pd.read_csv('data/Test.csv',parse_dates=["Date"])

In [14]:
# For test
def fill_dataframe(df):
    numerical_columns=df.select_dtypes(include=['float64']).columns
    for col in df.columns:
        if col in numerical_columns:
          # For automaticaly filling all columns with dtype(int or float) with mean
          median=df[col].median()
          df[col]=df[col].fillna(median)

fill_dataframe(test_df)

In [15]:
test_df.head()

Unnamed: 0,Place_ID X Date,Date,Place_ID,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,L3_NO2_sensor_altitude,L3_NO2_sensor_azimuth_angle,L3_NO2_sensor_zenith_angle,L3_NO2_solar_azimuth_angle,L3_NO2_solar_zenith_angle,L3_NO2_stratospheric_NO2_column_number_density,L3_NO2_tropopause_pressure,L3_NO2_tropospheric_NO2_column_number_density,L3_O3_O3_column_number_density,L3_O3_O3_effective_temperature,L3_O3_cloud_fraction,L3_O3_sensor_azimuth_angle,L3_O3_sensor_zenith_angle,L3_O3_solar_azimuth_angle,L3_O3_solar_zenith_angle,L3_CO_CO_column_number_density,L3_CO_H2O_column_number_density,L3_CO_cloud_height,L3_CO_sensor_altitude,L3_CO_sensor_azimuth_angle,L3_CO_sensor_zenith_angle,L3_CO_solar_azimuth_angle,L3_CO_solar_zenith_angle,L3_HCHO_HCHO_slant_column_number_density,L3_HCHO_cloud_fraction,L3_HCHO_sensor_azimuth_angle,L3_HCHO_sensor_zenith_angle,L3_HCHO_solar_azimuth_angle,L3_HCHO_solar_zenith_angle,L3_HCHO_tropospheric_HCHO_column_number_density,L3_HCHO_tropospheric_HCHO_column_number_density_amf,L3_CLOUD_cloud_base_height,L3_CLOUD_cloud_base_pressure,L3_CLOUD_cloud_fraction,L3_CLOUD_cloud_optical_depth,L3_CLOUD_cloud_top_height,L3_CLOUD_cloud_top_pressure,L3_CLOUD_sensor_azimuth_angle,L3_CLOUD_sensor_zenith_angle,L3_CLOUD_solar_azimuth_angle,L3_CLOUD_solar_zenith_angle,L3_CLOUD_surface_albedo,L3_AER_AI_absorbing_aerosol_index,L3_AER_AI_sensor_altitude,L3_AER_AI_sensor_azimuth_angle,L3_AER_AI_sensor_zenith_angle,L3_AER_AI_solar_azimuth_angle,L3_AER_AI_solar_zenith_angle,L3_SO2_SO2_column_number_density,L3_SO2_SO2_column_number_density_amf,L3_SO2_SO2_slant_column_number_density,L3_SO2_absorbing_aerosol_index,L3_SO2_cloud_fraction,L3_SO2_sensor_azimuth_angle,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,0OS9LVX X 2020-01-02,2020-01-02,0OS9LVX,11.6,30.200001,0.00409,14.656824,3.956377,0.712605,5.3e-05,0.000108,0.466171,0.010752,835670.49274,68.099367,1.445658,-95.984984,22.942019,4.6e-05,6156.074219,7e-06,0.11331,227.467539,0.032071,68.099367,1.445658,-95.984984,22.942019,0.017953,841.142869,155.982981,835625.785337,32.709708,1.898112,-95.987015,22.95015,7.3e-05,0.032071,68.099367,1.445658,-95.984984,22.942019,8.8e-05,1.494039,2592.634167,74028.429228,0.032268,6.317523,2593.137433,74023.930722,68.099367,1.445658,-95.984984,22.942019,0.299053,0.466173,835670.49274,68.099367,1.445658,-95.984984,22.942019,0.000221,0.784436,0.000184,-0.140458,0.032071,68.099367,1.445658,-95.984984,22.942019,1771.898988,2943.639456,0.003386,0.0,1.538256,0.0,21.074167
1,0OS9LVX X 2020-01-03,2020-01-03,0OS9LVX,18.300001,42.900002,0.00595,15.026544,4.23043,0.661892,5e-05,0.000109,-0.213659,0.028307,835281.882757,75.936844,34.638933,-95.017976,18.539524,4.5e-05,7311.869141,5e-06,0.110397,227.788713,0.044784,75.936845,34.639165,-95.017596,18.539491,0.019576,1187.57032,922.385833,835225.278332,73.808143,34.720869,-95.017095,18.548551,4.8e-05,0.040803,75.936813,34.641758,-95.014908,18.539116,7.4e-05,1.534485,7334.601102,48466.6533,0.046211,8.308433,7902.937546,46345.613031,75.936845,34.639165,-95.017596,18.539491,0.290397,-0.213657,835281.882757,75.936844,34.638933,-95.017976,18.539524,3.4e-05,0.678988,1.4e-05,-0.842713,0.040803,75.936813,34.641758,-95.014908,18.539116,1771.898988,2943.639456,0.003386,0.0,1.538256,0.0,21.074167
2,0OS9LVX X 2020-01-04,2020-01-04,0OS9LVX,17.6,41.299999,0.0059,15.511041,5.245728,1.640559,5e-05,0.000134,-0.25425,0.010374,834839.050781,75.552445,55.872276,-94.015418,14.14082,4.4e-05,6156.074219,6e-06,0.112502,229.235631,0.007113,75.552445,55.872276,-94.015418,14.14082,0.018736,944.341413,1281.892115,834751.987268,73.86168,56.019838,-94.001436,14.143972,6e-06,0.007113,75.552445,55.872276,-94.015418,14.14082,4.2e-05,1.680458,5188.524088,61477.291929,0.007849,6.415458,5797.742161,57757.842218,75.552445,55.872276,-94.015418,14.14082,0.279277,-0.25425,834839.050781,75.552445,55.872276,-94.015418,14.14082,0.000184,0.667768,0.000122,-0.71677,0.007113,75.552445,55.872276,-94.015418,14.14082,1771.898988,2943.639456,0.003386,0.0,1.538256,0.0,21.074167
3,0OS9LVX X 2020-01-05,2020-01-05,0OS9LVX,15.011948,53.100002,0.00709,14.441858,5.454001,-0.190532,5.5e-05,0.000155,-0.26849,0.088795,836269.833912,-102.285091,59.174917,-97.248047,32.730747,4.3e-05,6156.074219,1.2e-05,0.113312,228.383705,0.062076,-102.285125,59.174188,-97.247602,32.730553,0.018304,873.850358,895.144001,836168.569714,-100.652057,59.246499,-97.249432,32.741289,-2.5e-05,0.062076,-102.285125,59.174188,-97.247602,32.730553,1.3e-05,1.653101,2601.282362,74707.128398,0.061411,6.446504,2629.112962,74535.286185,-102.285125,59.174188,-97.247602,32.730553,0.359061,-0.268491,836269.833912,-102.285091,59.174917,-97.248047,32.730747,0.000201,0.696772,0.000133,-0.730104,0.062076,-102.285125,59.174188,-97.247602,32.730553,1771.898988,2943.639456,0.003386,0.0,1.538256,0.0,21.074167
4,0OS9LVX X 2020-01-06,2020-01-06,0OS9LVX,9.7,71.599998,0.00808,11.896295,3.511787,-0.279441,5.5e-05,0.000131,0.46072,0.041197,836043.111009,-102.13396,40.925873,-96.057236,28.320528,4.7e-05,6156.074219,8e-06,0.114592,229.490218,0.042777,-102.133957,40.925873,-96.057265,28.320527,0.018666,666.809145,1.0,835978.237828,-100.252279,41.004036,-96.059909,28.33255,-9e-05,0.042055,-102.13393,40.925803,-96.057578,28.320496,-2.7e-05,1.426467,2557.196696,74589.276717,0.042168,6.874006,2571.293308,74483.68031,-102.133957,40.925873,-96.057265,28.320527,0.331468,0.46072,836043.111009,-102.13396,40.925873,-96.057236,28.320528,9.3e-05,0.677305,6.5e-05,-0.108353,0.042777,-102.133957,40.925873,-96.057265,28.320527,1831.261597,3229.118652,0.031068,-100.278343,41.84708,-95.910744,28.498789


In [16]:
def preprocess_test(train_df):
    
    train_df['saleyear'] = train_df.Date.dt.year
    train_df['salemonth'] = train_df.Date.dt.month
    train_df['saleday'] = train_df.Date.dt.day
    train_df['saledayofweek'] = train_df.Date.dt.day_of_week
    train_df['saledayofyear'] = train_df.Date.dt.day_of_year
    
    train_df['placeID_freq'] = train_df['Place_ID'].map(train_df['Place_ID'].value_counts())
    
    # Drop features that corrolate with other features
    train_df= train_df.drop(columns=['L3_SO2_SO2_column_number_density_amf','L3_NO2_stratospheric_NO2_column_number_density','L3_CLOUD_sensor_azimuth_angle','L3_HCHO_solar_azimuth_angle','L3_SO2_sensor_zenith_angle','L3_CH4_solar_zenith_angle','temperature_2m_above_ground','L3_NO2_absorbing_aerosol_index','L3_CO_sensor_azimuth_angle','L3_AER_AI_sensor_zenith_angle','L3_AER_AI_solar_azimuth_angle','L3_AER_AI_solar_zenith_angle','L3_CO_sensor_zenith_angle','L3_HCHO_sensor_zenith_angle','L3_HCHO_tropospheric_HCHO_column_number_density_amf','L3_SO2_sensor_azimuth_angle','L3_SO2_absorbing_aerosol_index','L3_CH4_sensor_zenith_angle','specific_humidity_2m_above_ground','L3_NO2_NO2_slant_column_number_density','L3_NO2_cloud_fraction','L3_NO2_tropopause_pressure','L3_O3_sensor_azimuth_angle','L3_O3_cloud_fraction','L3_CLOUD_sensor_zenith_angle','L3_CLOUD_solar_azimuth_angle','L3_CLOUD_solar_zenith_angle','L3_CO_sensor_altitude','L3_HCHO_sensor_azimuth_angle','L3_CO_solar_zenith_angle','L3_HCHO_cloud_fraction','L3_HCHO_solar_zenith_angle','L3_HCHO_tropospheric_HCHO_column_number_density','L3_SO2_SO2_column_number_density','L3_CLOUD_cloud_top_height','L3_CLOUD_cloud_top_pressure','L3_AER_AI_absorbing_aerosol_index','L3_AER_AI_sensor_azimuth_angle','L3_SO2_SO2_slant_column_number_density','L3_SO2_cloud_fraction','L3_SO2_solar_zenith_angle','L3_CH4_aerosol_height','L3_CH4_sensor_azimuth_angle','L3_CH4_solar_azimuth_angle','Place_ID X Date','Date'], axis=1)
    
    return train_df


test=preprocess_test(test_df)

In [17]:
def process_test(train):
    
    cat = ['Place_ID','salemonth','saleyear','saleday', 'saledayofweek', 'saledayofyear']
    train[cat] = train[cat].astype(str)
    
    test_numerical_features=train.select_dtypes(include=['float']).columns
    test_numerical = []
    for num in test_numerical_features:
        test_numerical.append(num)

    test_categorical_features=train.select_dtypes(include=['object']).columns
    test_categorical= []
    for cat in test_categorical_features:
        test_categorical.append(cat)

    dicts = train[test_categorical + test_numerical].to_dict(orient='records')
    
    return dicts

test_dict= process_test(test)

In [18]:
tested= dv.transform(test_dict)

In [19]:
tested.shape

(16136, 370)

In [20]:
test_pred = clf.predict(tested)

In [21]:
test_pred

array([31.60019149, 16.05000056, 40.08160953, ..., 46.50302318,
       39.42443114, 50.28608439])

In [22]:
test_df['target'] = np.round(test_pred, 2)

In [23]:
submit = test_df[['Place_ID X Date','target']]


In [24]:
submit.head()

Unnamed: 0,Place_ID X Date,target
0,0OS9LVX X 2020-01-02,31.6
1,0OS9LVX X 2020-01-03,16.05
2,0OS9LVX X 2020-01-04,40.08
3,0OS9LVX X 2020-01-05,35.91
4,0OS9LVX X 2020-01-06,25.91


In [25]:
submit.to_csv('submitv30.csv', index=False)