In [None]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

In [None]:
# Read train.csv from the same Kaggle dataset directory that test.csv is read from.
# The bare relative path only resolves when a copy happens to sit in the working
# directory, so it is kept purely as a fallback for runs outside Kaggle.
train_path = "/kaggle/input/tabular-playground-series-jul-2021/train.csv"
if not os.path.exists(train_path):
    train_path = "train.csv"
train = pd.read_csv(train_path)
train.head()

In [None]:
# Read test.csv from the Kaggle input directory and preview its first rows.
test = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/test.csv")
test.head()

In [None]:
# Summary statistics (pandas `describe`) of the training frame.
train.describe()

In [None]:
def shift_feature(df):
    """Append lag/lead difference and ratio features, then drop the ``hour`` column.

    For each of the eight weather/sensor columns and for each shift of 1 and 3
    rows, four columns are added to ``df`` (in place):

    * ``<feature>-<shift>abs_shfit``: value minus the value ``shift`` rows earlier
    * ``<feature>+<shift>abs_shfit``: value minus the value ``shift`` rows later
    * ``<feature>-<shift>prc_shfit``: value / value ``shift`` rows earlier, minus 1
    * ``<feature>+<shift>prc_shfit``: value / value ``shift`` rows later, minus 1

    Rows shifted in from outside the frame are filled with 0. The "shfit"
    spelling in the generated names is kept so existing column names do not change.

    Args:
        df: DataFrame holding the eight feature columns listed below and an
            ``hour`` column. It is modified in place (new columns are added and
            infinities are replaced across the whole frame).
            NOTE(review): the shifts are row offsets; they only mean "hours" if
            the frame has one row per hour in time order -- confirm with caller.

    Returns:
        A new DataFrame equal to the augmented ``df`` without the ``hour`` column.

    Raises:
        KeyError: if one of the feature columns or ``hour`` is missing.
    """
    shift_features = ["deg_C", "absolute_humidity", "relative_humidity",
                      "sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"]
    # Row offsets used for both the lag (earlier) and lead (later) comparisons
    shifts = [1, 3]

    for feature in shift_features:
        current = df[feature]
        for shift in shifts:
            earlier = current.shift(periods=shift, fill_value=0)
            later = current.shift(periods=-shift, fill_value=0)
            df[feature+"-"+str(shift)+"abs_shfit"] = current - earlier
            df[feature+"+"+str(shift)+"abs_shfit"] = current - later
            df[feature+"-"+str(shift)+"prc_shfit"] = (current / earlier) - 1
            df[feature+"+"+str(shift)+"prc_shfit"] = (current / later) - 1

    # Division by the zero fill (or by a genuine zero) yields +inf for positive
    # numerators and -inf for negative ones (e.g. sub-zero temperatures); zero out both.
    # NOTE: 0 / 0 yields NaN, which is deliberately left in place so that it is not
    # confused with missing values already present in the input.
    df.replace(to_replace=[np.inf, -np.inf], value=0, inplace=True)
    return df.drop("hour", axis=1)

In [None]:
def month(x):
    """Return the integer held in characters 5-6 of ``x`` (the month of a
    ``YYYY-MM-DD HH:MM:SS`` timestamp string)."""
    month_digits = x[5:7]
    return int(month_digits)
def hour(x):
    """Return the integer held in characters 11-12 of ``x`` (the hour of a
    ``YYYY-MM-DD HH:MM:SS`` timestamp string)."""
    hour_digits = x[11:13]
    return int(hour_digits)
# Parse the timestamp once, up front, and derive every calendar feature from the
# vectorised .dt accessor. Parsing an already-converted column is a no-op, so this
# cell can be re-run safely (string slicing on converted timestamps would fail).
train["date_time"] = pd.to_datetime(train["date_time"], format="%Y-%m-%d %H:%M:%S")
train['month'] = train["date_time"].dt.month
train['hour'] = train["date_time"].dt.hour

# One-hot season flags derived from the month number.
train["is_winter"] = train["month"].isin([1, 2, 12]).astype("int")
train["is_spring"] = train["month"].isin([3, 4, 5]).astype("int")
train["is_summer"] = train["month"].isin([6, 7, 8]).astype("int")
train["is_autumn"] = train["month"].isin([9, 10, 11]).astype("int")

# 1 for hours 8..20 inclusive, 0 otherwise.
train["working_hours"] =  train["hour"].isin(np.arange(8, 21, 1)).astype("int")

# Ratio of absolute to relative humidity, scaled by 100.
train['SMC'] = (train['absolute_humidity'] * 100) / train['relative_humidity']

# Dew point via the Magnus formula (coefficients 17.62 and 243.12).
# NOTE(review): assumes relative_humidity is in percent and deg_C in Celsius -- confirm.
dew_gamma = (np.log(train['relative_humidity'] * 0.01)
             + (17.62 * train['deg_C']) / (243.12 + train['deg_C']))
train['Dew_Point'] = 243.12 * dew_gamma / (17.62 - dew_gamma)

# dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday and Sunday.
# This stays the last engineered column: the order columns are created in is the
# feature order the scaler and models are later fitted on.
train["is_weekend"] = (train["date_time"].dt.dayofweek >= 5).astype("int")

# Lag/lead features are currently disabled:
#train = shift_feature(train)

In [None]:
#train['week'] = train['date_time'].dt.weekofyear

In [None]:
# Correlate the numeric columns only. `date_time` is a datetime column by this point,
# and pandas >= 2.0 no longer drops non-numeric columns from corr() by default
# (numeric_only defaults to False), so the selection is made explicitly here.
sns.heatmap(train.select_dtypes(include="number").corr(),cmap = 'Accent_r')

In [None]:
# Distribution of the carbon-monoxide target, raw and after log1p.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale draws the equivalent histogram-plus-curve figure.
sns.histplot(train['target_carbon_monoxide'], kde=True, stat="density")
plt.show()
sns.histplot(np.log1p(train['target_carbon_monoxide']), kde=True, stat="density")
plt.show()

In [None]:
# Same feature engineering as for the training frame. The columns are created in
# exactly the training order (..., working_hours, SMC, Dew_Point, is_weekend): the
# fitted scaler and models consume features by position, so a different order here
# would feed them the wrong columns.
# Parsing the timestamp first (a no-op if already parsed) keeps the cell re-runnable.
test["date_time"] = pd.to_datetime(test["date_time"], format="%Y-%m-%d %H:%M:%S")
test['month'] = test["date_time"].dt.month
test['hour'] = test["date_time"].dt.hour

# One-hot season flags derived from the month number.
test["is_winter"] = test["month"].isin([1, 2, 12]).astype("int")
test["is_spring"] = test["month"].isin([3, 4, 5]).astype("int")
test["is_summer"] = test["month"].isin([6, 7, 8]).astype("int")
test["is_autumn"] = test["month"].isin([9, 10, 11]).astype("int")

# 1 for hours 8..20 inclusive, 0 otherwise.
test["working_hours"] =  test["hour"].isin(np.arange(8, 21, 1)).astype("int")

# Ratio of absolute to relative humidity, scaled by 100.
test['SMC'] = (test['absolute_humidity'] * 100) / test['relative_humidity']

# Dew point via the Magnus formula (coefficients 17.62 and 243.12).
# NOTE(review): assumes relative_humidity is in percent and deg_C in Celsius -- confirm.
dew_gamma = (np.log(test['relative_humidity'] * 0.01)
             + (17.62 * test['deg_C']) / (243.12 + test['deg_C']))
test['Dew_Point'] = 243.12 * dew_gamma / (17.62 - dew_gamma)

# dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday and Sunday.
test["is_weekend"] = (test["date_time"].dt.dayofweek >= 5).astype("int")

# Lag/lead features are currently disabled:
#test = shift_feature(test)

# Keep the timestamps for the final prediction table.
test_dt = test['date_time']
test.head()

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Feature matrix: every column of the training frame except the timestamp and the
# three target columns.
Xtrain = train.drop(columns=["date_time", 'target_carbon_monoxide', 'target_benzene',
                             'target_nitrogen_oxides'])

# Each target is modelled on the log1p scale.
y_co, y_be, y_no = (
    np.log1p(train[target_name])
    for target_name in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')
)
Xtrain

In [None]:
# Standardise every feature column; fit_transform learns the per-column statistics
# and applies them in one call. StandardScaler is imported in the notebook's import
# cell rather than mid-notebook.
# NOTE(review): the scaler is fitted on all training rows before the hold-out split
# is made, so the hold-out rows contribute to the scaling statistics.
ss = StandardScaler()
Xtrain1 = ss.fit_transform(Xtrain)

In [None]:
# Hold out 33% of the rows to validate the carbon-monoxide model.
X_train, X_test, y_train_co, y_test_co = train_test_split(Xtrain1,
                                                          y_co, test_size=0.33, random_state=44)

model_co = XGBRegressor(n_estimators=75,random_state=44)
model_co.fit(X_train,y_train_co)

pred = model_co.predict(X_test)
# y_co is already log1p(target), so the plain RMSE of these values is the RMSLE on the
# original scale. Passing them to mean_squared_log_error would take the log a second
# time (and would raise on any negative prediction).
np.sqrt(np.mean((y_test_co.to_numpy() - pred) ** 2))

#Carbon Monoxide

In [None]:
# Grid-search the number of boosting rounds for the carbon-monoxide model.
# GridSearchCV is imported in the notebook's import cell rather than mid-notebook.
# X_train / y_train_co must still be the carbon-monoxide split here: later cells
# reassign X_train for the other targets, so run this cell in notebook order.
params = {'n_estimators' : [50, 75, 100, 125, 150]}
grid = GridSearchCV(XGBRegressor(random_state=44),params, verbose = 1)
grid.fit(X_train, y_train_co)

In [None]:
# Only a cell's last expression is displayed, so print the winning setting explicitly.
print(grid.best_params_)
pred = grid.predict(X_test)
# Targets are already on the log1p scale: RMSE here equals RMSLE on the original
# scale, whereas mean_squared_log_error would apply the log a second time.
np.sqrt(np.mean((y_test_co.to_numpy() - pred) ** 2))

#Benzene

In [None]:
# Hold out 33% of the rows to validate the benzene model (same seed as the other splits).
X_train, X_test, y_train_be, y_test_be = train_test_split(Xtrain1,
                                                          y_be, test_size=0.33, random_state=44)

model_be = RandomForestRegressor(random_state=44)
model_be.fit(X_train,y_train_be)

pred = model_be.predict(X_test)
# y_be is already log1p(target), so the plain RMSE of these values is the RMSLE on the
# original scale; mean_squared_log_error would take the log a second time.
np.sqrt(np.mean((y_test_be.to_numpy() - pred) ** 2))

#Nitrogen Oxide

In [None]:
# Features for the nitrogen-oxides model: the base features, followed by the
# carbon-monoxide target (original scale) and the ISO week of the year -- in that order.
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week yields the same
# ISO week numbers. It returns a nullable UInt32 series, hence the cast to int64.
week_of_year = train['date_time'].dt.isocalendar().week.astype("int64").rename("week_of_year")
Xtrain2 = pd.concat([Xtrain, train['target_carbon_monoxide'], week_of_year],axis = 1)

# Separate scaler, because this matrix has two more columns than Xtrain.
ss2 = StandardScaler()
Xtrain2 = ss2.fit_transform(Xtrain2)

In [None]:
# Hold out 33% of the rows to validate the nitrogen-oxides model.
X_train, X_test, y_train_no, y_test_no = train_test_split(Xtrain2,
                                                          y_no, test_size=0.33, random_state=44)

model_no = GradientBoostingRegressor(random_state=44)
model_no.fit(X_train,y_train_no)

# Floor the log-scale predictions at 0, i.e. at 0 on the original scale as well.
pred = np.clip(model_no.predict(X_test), 0, None)
# y_no is already log1p(target), so the plain RMSE of these values is the RMSLE on the
# original scale; mean_squared_log_error would take the log a second time.
np.sqrt(np.mean((y_test_no.to_numpy() - pred) ** 2))

In [None]:
# Select the test features by the training frame's column labels so they reach the
# scaler and models in exactly the order those were fitted on. Merely dropping
# `date_time` keeps the test frame's own column order, which is not guaranteed to
# match; this selection also leaves `date_time` out.
test1 = ss.transform(test[Xtrain.columns])

# Both predictions are on the log1p scale the models were trained on.
pred_co = model_co.predict(test1)
pred_be = model_be.predict(test1)

In [None]:
# Rebuild the nitrogen-oxides feature matrix in the layout ss2 and model_no were
# fitted on: base features in training order, then carbon monoxide, then week of year.
# * pred_co is on the log1p scale, while the model was trained with the raw
#   carbon-monoxide target as a feature, so it is mapped back with expm1 first.
# * Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week yields the same
#   ISO week numbers (nullable UInt32, hence the cast to int64).
test2 = pd.concat(
    [
        test[Xtrain.columns],
        pd.Series(np.expm1(pred_co), index=test.index, name='target_carbon_monoxide'),
        test['date_time'].dt.isocalendar().week.astype("int64").rename("week_of_year"),
    ],
    axis = 1,
)
# The column positions now match the fitted layout. Adopt the labels the scaler
# recorded at fit time (when it recorded any) so its feature-name check passes.
# A column-count mismatch raises here.
test2.columns = getattr(ss2, "feature_names_in_", test2.columns)
test2 = ss2.transform(test2)
pred_no = model_no.predict(test2)

#Inverse Transform

In [None]:
# The models were trained on log1p(target). expm1 is the exact inverse of log1p and
# avoids the rounding loss of exp(x) - 1 for predictions close to zero.
pred_co_f = np.expm1(pred_co)
pred_be_f = np.expm1(pred_be)
pred_no_f = np.expm1(pred_no)

In [None]:
# Prediction table: the test timestamps followed by the three back-transformed targets.
df = test_dt.to_frame().assign(
    target_carbon_monoxide=pred_co_f,
    target_benzene=pred_be_f,
    target_nitrogen_oxides=pred_no_f,
)
df