In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')


from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, OneHotEncoder
# Shared preprocessing objects, created once when the import cell runs.
# NOTE(review): none of these three objects is referenced in the cells visible
# in this section -- confirm they are used further down before keeping them.
le = LabelEncoder()
oe = OneHotEncoder()
scaler = MinMaxScaler()
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor



# Folder that holds train.csv, test.csv and sample_submission.csv.
# Later cells build file names with `data_path + '<file>.csv'`, so the value
# must end with a path separator; os.path.join(..., '') guarantees that.
# Set the WIPRO_DATA_PATH environment variable to run the notebook on another
# machine; the default is the original author's local folder. (The previous
# literal combined a raw-string prefix with doubled backslashes, which put
# literal `\\` pairs into the path.)
data_path = os.path.join(
    os.environ.get('WIPRO_DATA_PATH', r'C:\Users\Sunil\Projects\Machine Hack\Wipro\Data'),
    '',
)

In [2]:
# Read the three competition files from `data_path` in one pass:
# the training set, the test set and the submission template.
train, test, sample_sub = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Collapse 'Fill Flag' values above 5 into 0 (rows already equal to 0 stay 0),
# applying the same rule to the training and the test frame.
for frame in (train, test):
    flag = frame['Fill Flag']
    frame.loc[flag.eq(0) | flag.gt(5), 'Fill Flag'] = 0

In [4]:
# Columns down-cast to 8-bit and 16-bit integers.
int8_cols = ['Month', 'Day', 'Hour', 'Minute', 'Cloud Type', 'Fill Flag']
int16_cols = ['Year', 'Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI', 'Pressure']
# The test frame is only cast on the non-target 16-bit columns.
test_int16_cols = ['Year', 'Pressure']

# Same two casts for each frame; only the 16-bit column list differs.
for frame, wide_cols in ((train, int16_cols), (test, test_int16_cols)):
    frame[int8_cols] = frame[int8_cols].astype('int8')
    frame[wide_cols] = frame[wide_cols].astype('int16')

In [5]:
def join(train_, test_):
    """Stack `train_` on top of `test_` and return the result with a fresh 0..n-1 index."""
    return pd.concat([train_, test_], axis=0, ignore_index=True)

def split(df_, n_train=None):
    """Split a frame built by `join` back into its train and test parts.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Row-wise concatenation with the training rows first.
    n_train : int, optional
        Number of leading rows that belong to the training set. When omitted,
        the row count of the notebook-level `train` frame is used, which is
        the original behaviour.

    Returns
    -------
    train_ : pandas.DataFrame
        The first `n_train` rows of `df_` (index left as is).
    test_ : pandas.DataFrame
        The remaining rows, re-indexed from 0.
    features_ : list of str
        Column names of `df_`, in column order, excluding the three
        'Clearsky ...' target columns and 'Date'.
    """
    if n_train is None:
        # Fallback relies on notebook state: `train` must still have the same
        # number of rows it had when `join` was called.
        n_train = train.shape[0]

    train_ = df_.iloc[:n_train]
    test_ = df_.iloc[n_train:].reset_index(drop=True)

    # Targets plus the helper 'Date' column are never fed to the model.
    excluded = {'Clearsky DHI', 'Clearsky GHI', 'Clearsky DNI', 'Date'}
    features_ = [col for col in df_.columns if col not in excluded]

    return train_, test_, features_

In [20]:
# Build all engineered features on the concatenated train+test frame, then split
# it back. Because every shift/rolling/diff below runs on the joined frame, the
# first test rows take their lagged values from the last training rows, and the
# first rows of the frame are NaN.
# The trailing "#NNN" markers are the original author's annotations, kept
# verbatim (presumably the validation score after adding that feature -- confirm).
df = join(train, test)

df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day', 'Hour', 'Minute']])
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` gives the same ISO week number. It comes back as
# UInt32, so cast to the int64 the old accessor produced.
df['Week'] = df['Date'].dt.isocalendar().week.astype('int64')
df['Quarter'] = df['Date'].dt.quarter #805
df['day_of_week'] = df['Date'].dt.dayofweek # 773

#######
# Per-month factor (index 0 -> month 1 ... index 11 -> month 12).
# NOTE(review): values look like a monthly Earth-Sun distance in AU -- confirm.
distance = [0.9840, 0.9888, 0.9962, 1.005, 1.0122, 1.0163, 1.0161, 1.0116, 1.0039, 0.9954, 0.9878, 0.9837]
month = list(range(1, 13))
md_dict = dict(zip(month, distance))
# Vectorised form of the former row-wise `df.apply(..., axis=1)`; same values.
# NOTE(review): np.cos expects radians. If 'Solar Zenith Angle' is stored in
# degrees this should be np.cos(np.deg2rad(...)) -- left unchanged, confirm units.
df['Vertical_distance'] = df['Month'].map(md_dict) * np.cos(df['Solar Zenith Angle']) # 771
#######

# Three equal-width bins over the observed range of 'Hour', as integer codes.
df['Hour Bins'] = pd.cut(df["Hour"], bins=3, labels=False) #770

#######
# Differences against the value 48 rows earlier or 1 row earlier.
# NOTE(review): 48 rows presumably equals one day at 30-minute sampling -- confirm.
df['b'] = df['Relative Humidity'] - df['Relative Humidity'].shift(48)
df['e'] = df['Solar Zenith Angle'] - df['Solar Zenith Angle'].shift(48)
df['Temperature Trnd'] = df['Temperature'] - df['Temperature'].shift(1)
df['Solar Zenith Angle Trnd'] = df['Solar Zenith Angle'] - df['Solar Zenith Angle'].shift(1)
df['Relative Humidity Trnd'] = df['Relative Humidity'] - df['Relative Humidity'].shift(1)
df['Precipitable Water Trnd'] = df['Precipitable Water'] - df['Precipitable Water'].shift(48)
df['Dew Point Trnd'] = df['Dew Point'] - df['Dew Point'].shift(48)
df['Wind Speed Trnd'] = df['Wind Speed'] - df['Wind Speed'].shift(1)
# Ratio of consecutive flags; the +1 keeps a zero flag from dividing by zero.
df['Fill Flag Trnd'] = (df['Fill Flag']+1) / (df['Fill Flag'].shift(1)+1)
# NOTE(review): these two ratios become +/-inf wherever 'Solar Zenith Angle' is 0 -- confirm that cannot occur.
df['Pressure / Solar Zenith Angle'] = df['Pressure'] / df['Solar Zenith Angle']
df['Temperature / Solar Zenith Angle'] = df['Temperature'] / df['Solar Zenith Angle']
df['Dew Point * Temperature'] = df['Dew Point'] * df['Temperature']

#762
#######
# Trailing 6-row means (the first 5 rows are NaN).
df['rolling_mean_sa'] = df['Solar Zenith Angle'].rolling(window=6).mean()
df['rolling_mean_tmp'] = df['Temperature'].rolling(window=6).mean()
df['rolling_mean_ws'] = df['Wind Speed'].rolling(window=6).mean()
df['rolling_mean_dp'] = df['Dew Point'].rolling(window=6).mean()
# NOTE(review): this is an exact duplicate of 'rolling_mean_dp' (same source
# column). It looks like a copy-paste slip, but the intended column cannot be
# determined from this cell, so the feature is left unchanged.
df['rolling_mean_ca'] = df['Dew Point'].rolling(window=6).mean()

#759
#######
# 2-, 3- and 4-row differences for three signals. Column creation order matches
# the original cell (all zenith-angle diffs, then temperature, then wind speed).
for diff_suffix, diff_source in (('', 'Solar Zenith Angle'), ('_tmp', 'Temperature'), ('_ws', 'Wind Speed')):
    for diff_periods in (2, 3, 4):
        df[f'{diff_periods}_diff{diff_suffix}'] = df[diff_source].diff(periods=diff_periods)

train_proc, test_proc, features = split(df)

In [21]:
%%time
total_error = 0
for i in ['Clearsky DNI']:
    target = i
    trn, val = train_test_split(train_proc, test_size = 0.2, random_state = 42)

    ##### Input for model
    X_trn, X_val = trn[features], val[features]

    ##### Target column
    y_trn, y_val = trn[target], val[target]

    ##### Features for test data that we will be predicting
    X_test = test_proc[features]
    
    
    model = CatBoostRegressor(random_state=1999, n_estimators=1000, task_type = 'GPU', eval_metric='RMSE')
    
    model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose = 200, early_stopping_rounds=100)
    preds = model.predict(X_val)

    error = mean_squared_error(y_val, preds)
    
    print(f'Target is {target} and mean_squared_error is : {error}')
    
    test_preds = model.predict(X_test)
    sample_sub[target] = test_preds
    test[target] = test_preds

Learning rate set to 0.105919
0:	learn: 332.0083610	test: 332.2481207	best: 332.2481207 (0)	total: 11.4ms	remaining: 11.4s
200:	learn: 32.7376230	test: 33.0598586	best: 33.0598586 (200)	total: 2.05s	remaining: 8.14s
400:	learn: 30.0253921	test: 30.7770649	best: 30.7770649 (400)	total: 4.07s	remaining: 6.08s
600:	learn: 28.1137077	test: 29.3789850	best: 29.3789850 (600)	total: 6.04s	remaining: 4.01s
800:	learn: 26.6763982	test: 28.3695451	best: 28.3695451 (800)	total: 8.01s	remaining: 1.99s
999:	learn: 25.5094665	test: 27.5789258	best: 27.5789258 (999)	total: 9.95s	remaining: 0us
bestTest = 27.57892579
bestIteration = 999
Target is Clearsky DNI and mean_squared_error is : 760.5972359697549
CPU times: total: 17.7 s
Wall time: 10.7 s
