In [None]:
!pip install pycaret

In [None]:
# Analysis adapted from: https://www.kaggle.com/junhyeok99/automl-pycaret

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from pycaret.regression import setup, compare_models, blend_models, finalize_model, predict_model, plot_model

import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split and preview its first rows.
train_csv = '../input/tabular-playground-series-jul-2021/train.csv'
train = pd.read_csv(train_csv)
train.head()

In [None]:
# Load the test split and preview its first rows.
test_csv = '../input/tabular-playground-series-jul-2021/test.csv'
test = pd.read_csv(test_csv)
test.head()

In [None]:
# Stack the train rows on top of the test rows so the feature engineering below
# is applied to both at once. Each frame keeps its own row labels (no
# ignore_index); the later split back into train/test is purely positional.
df = pd.concat((train, test), axis=0)
df.sample(5)

In [None]:
# Summarise the combined frame (column dtypes and non-null counts).
df.info()

In [None]:
# Parse the timestamp column and expand it into separate calendar features.
df['date_time'] = pd.to_datetime(df['date_time'])
print('data timeframe: {} through {}'.format(df.date_time.dt.date.min(), df.date_time.dt.date.max()))
df['year'] = df.date_time.dt.year
df['month'] = df.date_time.dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0.
# isocalendar().week gives the same ISO week number; it is returned as a
# nullable UInt32, so cast to plain int64 to match the other date features.
df['week'] = df.date_time.dt.isocalendar().week.astype('int64')
df['day'] = df.date_time.dt.day
df['dayofweek'] = df.date_time.dt.dayofweek
df['hour'] = df.date_time.dt.hour
df.sample(5)

In [None]:
# 'time' = whole days elapsed since the first calendar day in the data.
# normalize() truncates every timestamp to midnight, so the subtraction stays a
# vectorised timedelta operation (no object-dtype date arithmetic, no apply).
day_start = df.date_time.dt.normalize()
df['time'] = (day_start - day_start.min()).dt.days
# The raw timestamp is no longer needed once all date features are extracted.
df = df.drop(columns = 'date_time')
df.sample(5)

In [None]:
# Split the combined frame back into train and test by position: the first
# len(train) rows came from `train`, the remaining rows from `test`.
# .copy() makes each split an independent frame, so adding or overwriting
# columns on it later does not raise SettingWithCopyWarning on a slice of df.
n_train = len(train)
train2 = df.iloc[:n_train].copy()
test2 = df.iloc[n_train:].copy()

In [None]:
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Log-scale (log1p) each target column of the training data.
# The values are recomputed from the untouched raw `train` frame (train2 holds
# the same rows in the same order), so re-running this cell never applies the
# transform twice. assign() returns a new frame instead of writing into a slice
# of df, which avoids SettingWithCopyWarning. to_numpy() makes the assignment
# positional rather than label-aligned.
train2 = train2.assign(**{col: np.log1p(train[col].to_numpy()) for col in target_cols})

In [None]:
# One row per target: raw distribution (left, from `train`) next to the
# log1p-scaled distribution (right, from `train2`).
fig, ax = plt.subplots(len(target_cols), 2, figsize=(12, 12))
for row, target in enumerate(target_cols):
    sns.histplot(train[target], ax=ax[row, 0])
    sns.histplot(train2[target], ax=ax[row, 1])

fig.tight_layout()
plt.show()

In [None]:
# Feature-only frames: all three target columns removed.
train_3, test_3 = (frame.drop(columns=target_cols) for frame in (train2, test2))

# For each target, the two *other* target columns that must be dropped so the
# resulting frame carries exactly one target.
co_others = ['target_benzene', 'target_nitrogen_oxides']
be_others = ['target_nitrogen_oxides', 'target_carbon_monoxide']
no_others = ['target_benzene', 'target_carbon_monoxide']

train_co, test_co = (frame.drop(columns=co_others) for frame in (train2, test2))
train_be, test_be = (frame.drop(columns=be_others) for frame in (train2, test2))
train_no, test_no = (frame.drop(columns=no_others) for frame in (train2, test2))

In [None]:
time_cols = ['year', 'month', 'time', 'hour']

# Grid of line plots: mean of each target (columns) grouped by each time
# feature (rows).
fig, ax = plt.subplots(len(time_cols), len(target_cols), figsize = (12,10))

for r, feature in enumerate(time_cols):
    for c, target in enumerate(target_cols):
        panel = ax[r, c]
        panel.plot(train2.groupby(feature)[target].mean())
        panel.set_title('{} - {}'.format(feature, target))

plt.tight_layout()
plt.show()

In [None]:
temp_cols = ['deg_C', 'relative_humidity', 'absolute_humidity']

# Grid of line plots: mean of each target (columns) grouped by each
# temperature/humidity feature (rows).
fig, ax = plt.subplots(len(temp_cols), len(target_cols), figsize = (12,10))

for r, feature in enumerate(temp_cols):
    for c, target in enumerate(target_cols):
        panel = ax[r, c]
        panel.plot(train2.groupby(feature)[target].mean())
        panel.set_title('{} - {}'.format(feature, target))

plt.tight_layout()
plt.show()

In [None]:
sensor_cols = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']

# Grid of line plots: mean of each target (columns) grouped by each sensor
# reading (rows).
fig, ax = plt.subplots(len(sensor_cols), len(target_cols), figsize = (12,10))

for r, feature in enumerate(sensor_cols):
    for c, target in enumerate(target_cols):
        panel = ax[r, c]
        panel.plot(train2.groupby(feature)[target].mean())
        panel.set_title('{} - {}'.format(feature, target))

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns of train2.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(train2.corr(), ax=ax);

In [None]:
def pycaret_model(train, target, test, n_select, fold, opt, session_id=None):
    """Fit a blended PyCaret regression ensemble for one target and predict on `test`.

    Steps: set up the experiment, rank candidate models, plot the feature
    importances of the top one, blend the selected models, refit the blend
    with finalize_model, and predict on `test`.

    Parameters
    ----------
    train : DataFrame
        Passed to ``setup`` as ``data``; must contain the ``target`` column.
    target : str
        Name of the column to predict.
    test : DataFrame
        Rows to predict with the finalized model.
    n_select : int
        Number of top-ranked models to request from ``compare_models``.
    fold : int
        Fold count forwarded to ``compare_models`` and ``blend_models``.
    opt : str
        Metric name used both to sort the comparison and to optimize the blend.
    session_id : int, optional
        Seed forwarded to ``setup`` for reproducible runs. The default (None)
        leaves PyCaret's own default behaviour unchanged.

    Returns
    -------
    The 'Label' column of the frame returned by ``predict_model`` for `test`.
    """
    print('Setup Your Data....')
    setup(data=train,
          target=target,
          numeric_imputation='mean',
          silent=True,
          session_id=session_id)

    print('Comparing Models....')
    best = compare_models(sort=opt, n_select=n_select, fold=fold, exclude=['xgboost'])
    # compare_models returns a bare estimator instead of a list when only one
    # model is selected; normalise so best[0] and blend_models work either way.
    if not isinstance(best, list):
        best = [best]

    print('Here is Best Model Feature Importances!')
    plot_model(estimator=best[0], plot='feature')
    # NOTE(review): the reason for this pause is not evident from the code;
    # presumably it gives the plot time to render before more output follows.
    time.sleep(5)

    print('Blending Models....')
    blended = blend_models(estimator_list=best, fold=fold, optimize=opt)
    # Return value intentionally unused.
    # NOTE(review): presumably called to display the hold-out scores - confirm.
    predict_model(blended)

    print('Finallizing Models....')
    final_model = finalize_model(blended)
    print('Done...!!!')

    predictions = predict_model(final_model, data=test)
    return predictions['Label']

In [None]:
sub = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')
# The model is trained on log1p(target), so map predictions back with expm1:
# it is the exact inverse of log1p and is more accurate than exp(x) - 1 for
# values close to zero.
sub['target_carbon_monoxide'] = np.expm1(pycaret_model(train_co, 'target_carbon_monoxide', test_co, 5, 3, 'RMSLE'))

In [None]:
# Predictions are on the log1p scale; expm1 is the exact inverse of log1p and
# is more accurate than exp(x) - 1 for values close to zero.
sub['target_benzene'] = np.expm1(pycaret_model(train_be, 'target_benzene', test_be, 5, 3, 'RMSLE'))

In [None]:
# Predictions are on the log1p scale; expm1 is the exact inverse of log1p and
# is more accurate than exp(x) - 1 for values close to zero.
sub['target_nitrogen_oxides'] = np.expm1(pycaret_model(train_no, 'target_nitrogen_oxides', test_no, 4, 3, 'RMSLE'))

In [None]:
# Write the filled-in submission frame to submission.csv, without the row index.
sub.to_csv('submission.csv', index=False)