# import packages

In [1]:
# data preprocess
import pandas as pd 
import numpy as np 
import os 
from itertools import chain, repeat
import math
import warnings 
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', 100)

# data EDA 
import matplotlib.pyplot as plt
import seaborn as sns

# data load

In [None]:
# file path = data directory
file_path = '../태양광예측/'

# train, submission data load 
train = pd.read_csv(file_path+'train/'+'train.csv')
submission = pd.read_csv(file_path+'sample_submission.csv')

# test load 
t_test = []

for i in range(81):
    file_path = '../태양광예측/test/' + str(i) + '.csv'
    temp = pd.read_csv(file_path)
    t_test.append(temp)
    
test = pd.concat(t_test)

# Data Preprocess

## Basic preprocess

In [95]:
def base_preprocess(data, is_train=True) : 
    
    if is_train != True :   

        pass
    
    else : 
        
        
        # 원래 방법대로 한다면, 마지막 48개행은 target2에 대해 미지의 값을 가진게 아닌가?
        # step1 : 하루 뒤, 이틀 뒤 target 값 가져오기 
        data['1day_after_target'] = data.shift(-48)['TARGET']
        data['2day_after_target'] = data.shift(-96)['TARGET']

        # step2 : 1~3일전의 컬럼 및 target 가져오기 
        
    
    #data = data.dropna(axis=0)

    return(data)

In [96]:
df_train = base_preprocess(train, is_train=True)
df_test = base_preprocess(test, is_train=False)

print('train shape :' ,train.shape)
print('df_train shape :', df_train.shape)
print('train - df_train shape :', train.shape[0]-df_train.shape[0])
print('df_test shape :', df_test.shape)
print('test shape :', test.shape)

train shape : (52560, 11)
df_train shape : (52560, 11)
train - df_train shape : 0
df_test shape : (27216, 9)
test shape : (27216, 9)


## Feature Engineering

### 천정각

In [97]:
test_day = df_test['Day'].copy() 

label_list = []

for i in range(1,568) : 
    label_list.append([i]*48)

label_list = [item for sublist in label_list for item in sublist]

In [98]:
def zenith_angel_def(data, merge_data, is_train=True, label_lists=None) : 
    
    if is_train != True : 
        
        data['Day'] = label_list
        merge_data['Day'] = label_list
    
    # 시간 및 분을 합쳐주는 함수
    def time_pil(data) : 

        texts = str(data['Hour']) + str(data['Minute'])[:1]

        return(texts)

    data['GHI_less'] = data['DHI'] + data['DNI']
    data['Time'] = data[['Hour','Minute']].apply(lambda x: time_pil(x), axis=1)
    data['GHI_less'] = data['GHI_less'].astype('int') 

    f = data[data['GHI_less']>0].groupby(['Day']).head(1)[['Day','Time','T','GHI_less']]
    f.columns = ['Day','First_time','First_T','First_GHI_less']

    l = data[data['GHI_less']>0].groupby(['Day']).tail(1)[['Day','Time','T','GHI_less']]
    l.columns = ['Day','Last_time','Last_T','Last_GHI_less']
    
    f_l = pd.merge(f, l, how='inner', on=['Day']) 
    
    def sun_hour(data) : 

        hours = (int(str(data['Last_time'])[:2]) - int(str(data['First_time'])[0]))*60

        return(hours)

    def sun_minute(data) : 

        minutes = (int(str(data['Last_time'][2])) - int(str(data['First_time'][1])))*10

        return(minutes)

    f_l['SUN_hour'] = f_l[['First_time','Last_time']].apply(lambda x:sun_hour(x), axis=1)
    f_l['SUN_minute'] = f_l[['First_time','Last_time']].apply(lambda x:sun_minute(x), axis=1)
    
    f_l['SUN_time'] = f_l['SUN_hour'] + f_l['SUN_minute']
    
    temp = pd.merge(data, f_l, how='inner', on='Day')
    temp['Time'] = temp[['Hour','Minute']].apply(lambda x: time_pil(x), axis=1)
    temp[['Time','First_time','Last_time']] = temp[['Time','First_time','Last_time']].astype('int')
    
    temp2 = temp[(temp['Time']>=temp['First_time']) & (temp['Time']<=temp['Last_time'])]  
    
    zenith_angle_lists = []

    for day in temp2.Day.unique() : 

        temp3 = temp2[temp2['Day']==day]
        zenith_angles = np.arange(0,181,180/(temp3['SUN_time'].iloc[0]/30))

        for num in range(len(zenith_angles)) : 

            if zenith_angles[num] < 90 : 
                zenith_angles[num] = 90 - zenith_angles[num]
                
            elif zenith_angles[num] == 0 :
                zenith_angles[num] = 1

            else : 
                zenith_angles[num] = zenith_angles[num] - 90

        zenith_angle_lists.append(zenith_angles)
        
    zenith_angle_lists = [item for sublist in zenith_angle_lists for item in sublist]
    temp2['zenith_angle'] = zenith_angle_lists
    
    final = pd.merge(merge_data, temp2[['Day','Hour','Minute','zenith_angle']],
                    how='left', on=['Day','Hour','Minute'])
    
    final = final.fillna(90) 
    
    final = pd.merge(final, temp2[['Day','SUN_time']].drop_duplicates(),
                    how='inner', on=['Day'])
    
    return(final)

In [99]:
df_train = zenith_angel_def(data = df_train, merge_data = df_train
                            , is_train=True, label_lists=None)

df_test = zenith_angel_def(data = df_test, merge_data = df_test
                            , is_train=False, label_lists=label_list)

In [100]:
def custom_ghi(data) : 
    
    answer = data['DHI'] + (data['DNI']*math.cos(data['zenith_angle']))
    
    return answer 

df_train['GHI'] = df_train[['DHI','DNI','zenith_angle']].apply(lambda x: custom_ghi(x), axis=1)
df_test['GHI'] = df_test[['DHI','DNI','zenith_angle']].apply(lambda x: custom_ghi(x), axis=1)

print('df_train shape :', df_train.shape)
print('df_test shape :', df_test.shape)

df_train shape : (52560, 16)
df_test shape : (27216, 14)


In [101]:
df_test['Day'] = test_day.values

### 6일간의 데이터 

In [102]:
label_list = []

for i in range(81) : 
    label_list.append([i]*336)

label_list = [item for sublist in label_list for item in sublist]

In [103]:
def day_of_six(dataset) : 
    
    data = dataset.copy()
    
    colnames = []
    temp = data.copy()
    
    
    # 6일전까지의 데이터 구축 
    for six_days in range(1,7,1) : 
        
        for colname in data.columns : 
            
            if colname in ('Day','Hour','Minute','1day_after_target', '2day_after_target',
                          'GHI_less','Time') :
                
                pass 
            
            else : 
                
                temp[f'{six_days}_preday_{colname}'] = temp.shift(six_days*48)[colname]
                
                if six_days == 1 :
                
                    colnames.append(colname)
    
    # 6일전까지의 데이터를 바탕으로 새로운 변수 구축 
    day_0_2 = ['6_preday_','5_preday_','4_preday_']
    day_2_4 = ['4_preday_','3_preday_','2_preday_']
    day_4_6 = ['2_preday_','1_preday_','']
    
    def dev_cv(data) : 
        
        means = np.mean(data)
        sds = np.std(data)
        
        if means == 0 : 
            cv = 0 
        else : 
            cv = sds/means
            
        return(cv)
    
    
    for colname in colnames : 
        
        # 1) 구간별 평균 (0~2일차, 2~4일차, 4~6일차)
        data[f'day_0to2_{colname}_mean'] = temp[[i+colname for i in day_0_2]].apply('mean',axis=1)
        data[f'day_2to4_{colname}_mean'] = temp[[i+colname for i in day_2_4]].apply('mean',axis=1)
        data[f'day_4to6_{colname}_mean'] = temp[[i+colname for i in day_4_6]].apply('mean',axis=1)
        
        # 2) 구간별 변동계수 (0~2일차, 2~4일차, 4~6일차)
        data[f'day_0to2_{colname}_cv'] = temp[[i+colname for i in day_0_2]].apply(lambda x: dev_cv(x),axis=1)
        data[f'day_2to4_{colname}_cv'] = temp[[i+colname for i in day_2_4]].apply(lambda x: dev_cv(x),axis=1)
        data[f'day_4to6_{colname}_cv'] = temp[[i+colname for i in day_4_6]].apply(lambda x: dev_cv(x),axis=1)
        
        
    return(data)

In [104]:
df_train = day_of_six(dataset=df_train)

In [105]:
df_test['label'] = label_list

In [106]:
df_testset = []

for i in range(81) : 
    
    test_data = df_test[df_test['label']==i].drop(['label'],axis=1)
    temp_test = day_of_six(dataset=test_data)
    
    df_testset.append(temp_test)
    print(f'{i}st dataset complete!!')

0st dataset complete!!
1st dataset complete!!
2st dataset complete!!
3st dataset complete!!
4st dataset complete!!
5st dataset complete!!
6st dataset complete!!
7st dataset complete!!
8st dataset complete!!
9st dataset complete!!
10st dataset complete!!
11st dataset complete!!
12st dataset complete!!
13st dataset complete!!
14st dataset complete!!
15st dataset complete!!
16st dataset complete!!
17st dataset complete!!
18st dataset complete!!
19st dataset complete!!
20st dataset complete!!
21st dataset complete!!
22st dataset complete!!
23st dataset complete!!
24st dataset complete!!
25st dataset complete!!
26st dataset complete!!
27st dataset complete!!
28st dataset complete!!
29st dataset complete!!
30st dataset complete!!
31st dataset complete!!
32st dataset complete!!
33st dataset complete!!
34st dataset complete!!
35st dataset complete!!
36st dataset complete!!
37st dataset complete!!
38st dataset complete!!
39st dataset complete!!
40st dataset complete!!
41st dataset complete!!
42

In [109]:
df_test = pd.concat(df_testset)

### N일간의 평균 데이터 (시간 및 분 기준의 N일전)

In [92]:
def train_2day_variable(df_train, rolling_num) : 
    
    hour_lists = []

    numbers = list(np.arange(0,24,1))
    n_list = [2]*24

    hours_range = list(chain.from_iterable((repeat(number, n) for (number, n) in zip(numbers, n_list))))

    for (hours,minutes) in zip(hours_range,list(np.arange(0,31,30))*30) : 

        hour_lists.append(df_train[(df_train['Hour']==hours) & (df_train['Minute']==minutes)])

    for num in range(len(hour_lists)) : 

        hour_lists[num][f'{rolling_num}days_mean_DHI'] = hour_lists[num]['DHI'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_DNI'] = hour_lists[num]['DNI'].rolling(rolling_num).mean()
        #hour_lists[num][f'{rolling_num}days_mean_T'] = hour_lists[num]['T'].rolling(rolling_num).mean()
        #hour_lists[num][f'{rolling_num}days_mean_RH'] = hour_lists[num]['RH'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_WS'] = hour_lists[num]['WS'].rolling(rolling_num).mean() 
        hour_lists[num][f'{rolling_num}days_mean_TARGET'] = hour_lists[num]['TARGET'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_SUN_time'] = hour_lists[num]['SUN_time'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_zenith_angle'] = hour_lists[num]['zenith_angle'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_GHI'] = hour_lists[num]['GHI'].rolling(rolling_num).mean()
        
    df_train = pd.concat(hour_lists).sort_index() 
    
    return(df_train)

In [93]:
df_train = train_2day_variable(df_train, rolling_num=3)

In [94]:
def test_2day_variable(df_test, rolling_num) : 

label_list = []

for i in range(81) : 
    label_list.append([i]*336)

label_list = [item for sublist in label_list for item in sublist]

df_test['label'] = label_list 

test_dataset = []

for label_num in df_test.label.unique() : 

    hour_lists = []
    numbers = list(np.arange(0,24,1))
    n_list = [2]*24
    df_test2 = df_test[df_test['label']==label_num]

    hours_range = list(chain.from_iterable((repeat(number, n) for (number, n) in zip(numbers, n_list))))

    for (hours,minutes) in zip(hours_range,list(np.arange(0,31,30))*30) : 

        hour_lists.append(df_test2[(df_test2['Hour']==hours) & (df_test2['Minute']==minutes)])

    for num in range(len(hour_lists)) : 

        hour_lists[num][f'{rolling_num}days_mean_DHI'] = hour_lists[num]['DHI'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_DNI'] = hour_lists[num]['DNI'].rolling(rolling_num).mean()
        #hour_lists[num][f'{rolling_num}days_mean_T'] = hour_lists[num]['T'].rolling(rolling_num).mean()
        #hour_lists[num][f'{rolling_num}days_mean_RH'] = hour_lists[num]['RH'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_WS'] = hour_lists[num]['WS'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_TARGET'] = hour_lists[num]['TARGET'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_SUN_time'] = hour_lists[num]['SUN_time'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_zenith_angle'] = hour_lists[num]['zenith_angle'].rolling(rolling_num).mean()
        hour_lists[num][f'{rolling_num}days_mean_GHI'] = hour_lists[num]['GHI'].rolling(rolling_num).mean()

    temp = pd.concat(hour_lists)
    test_dataset.append(temp)

df_test = pd.concat(test_dataset).reset_index().sort_values(['label','index'])
df_test = df_test.drop(['index'],axis=1).reset_index(drop=True) 

return(df_test)

In [95]:
[df_test = test_2day_variable(df_test, rolling_num=3)]

In [96]:
print('df_train shape :', df_train.shape)
print('df_test shape :', df_test.shape)

df_train shape : (52560, 23)
df_test shape : (27216, 22)


# Model

In [None]:
df_train = df_train[df_train['RH']!=100]

In [110]:
print('df_train shape :', df_train.shape)
print('df_test shape :', df_test.shape)

df_train shape : (52560, 70)
df_test shape : (27216, 68)


In [111]:
df_train = df_train.dropna()
df_train = df_train.drop(['Day','Minute','GHI_less','Time'], axis=1)
df_train[['Hour']] = df_train[['Hour']].astype('category')

df_test = df_test.dropna()
df_test = df_test[df_test['Day']==6]
df_test = df_test.drop(['Day','Minute','GHI_less','Time'], axis=1)
df_test[['Hour']] = df_test[['Hour']].astype('category')

print('df_train shape :', df_train.shape)
print('df_test shape :', df_test.shape)

df_train shape : (52368, 66)
df_test shape : (3888, 64)


In [112]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

drop_columns = ['1day_after_target','2day_after_target']
y1 = '1day_after_target'
y2 = '2day_after_target'

X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(df_train.drop(drop_columns,axis=1), df_train[y1], test_size=0.33, random_state=0)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(df_train.drop(drop_columns,axis=1), df_train[y2], test_size=0.33, random_state=0)

In [113]:
quantiles = [0.06, 0.17, 0.25, 0.36, 0.52, 0.63, 0.74, 0.85, 0.95]

from lightgbm import LGBMRegressor

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, df_test):
    
    # (a) Modeling  
    model = LGBMRegressor(objective='quantile', alpha=q, 
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)                   
                         
                         
    model.fit(X_train, Y_train, eval_metric = ['quantile'], 
          eval_set=[(X_valid, Y_valid)], early_stopping_rounds=300, verbose=500)
    

    # (b) Predictions
    pred = pd.Series(model.predict(df_test).round(2))
    return pred, model

# Target 예측

def train_data(X_train, Y_train, X_valid, Y_valid, df_test):

    LGBM_models=[]
    LGBM_actual_pred = pd.DataFrame()

    for q in quantiles:
        print(q)
        pred , model = LGBM(q, X_train, Y_train, X_valid, Y_valid, df_test)
        LGBM_models.append(model)
        LGBM_actual_pred = pd.concat([LGBM_actual_pred,pred],axis=1)

    LGBM_actual_pred.columns=quantiles
    
    return LGBM_models, LGBM_actual_pred

In [114]:
# Target1
models_1, results_1 = train_data(X_train_1, Y_train_1, X_valid_1, Y_valid_1, df_test)

# Target2
models_2, results_2 = train_data(X_train_2, Y_train_2, X_valid_2, Y_valid_2, df_test)

0.06
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 0.880712
[1000]	valid_0's quantile: 0.883013
Early stopping, best iteration is:
[715]	valid_0's quantile: 0.879751
0.17
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 1.86602
[1000]	valid_0's quantile: 1.81859
[1500]	valid_0's quantile: 1.80321
[2000]	valid_0's quantile: 1.79478
[2500]	valid_0's quantile: 1.7878
[3000]	valid_0's quantile: 1.78418
[3500]	valid_0's quantile: 1.78121
[4000]	valid_0's quantile: 1.77781
[4500]	valid_0's quantile: 1.77624
[5000]	valid_0's quantile: 1.77266
[5500]	valid_0's quantile: 1.77034
[6000]	valid_0's quantile: 1.76918
Early stopping, best iteration is:
[5875]	valid_0's quantile: 1.76748
0.25
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.26274
[1000]	valid_0's quantile: 2.18737
[1500]	valid_0's quantile: 2.15104
[2000]	valid_0's quantile: 2.12631
[2500]	valid_0's quantile: 2.10

[5000]	valid_0's quantile: 2.24161
[5500]	valid_0's quantile: 2.23547
[6000]	valid_0's quantile: 2.22891
[6500]	valid_0's quantile: 2.22519
[7000]	valid_0's quantile: 2.22304
[7500]	valid_0's quantile: 2.2207
[8000]	valid_0's quantile: 2.2172
[8500]	valid_0's quantile: 2.21545
[9000]	valid_0's quantile: 2.21305
[9500]	valid_0's quantile: 2.21162
[10000]	valid_0's quantile: 2.20714
Did not meet early stopping. Best iteration is:
[9997]	valid_0's quantile: 2.20714
0.52
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.4168
[1000]	valid_0's quantile: 2.32928
[1500]	valid_0's quantile: 2.29755
[2000]	valid_0's quantile: 2.27131
[2500]	valid_0's quantile: 2.24741
[3000]	valid_0's quantile: 2.2252
[3500]	valid_0's quantile: 2.21327
[4000]	valid_0's quantile: 2.20335
[4500]	valid_0's quantile: 2.19492
[5000]	valid_0's quantile: 2.18927
[5500]	valid_0's quantile: 2.18313
[6000]	valid_0's quantile: 2.18032
[6500]	valid_0's quantile: 2.17824
[7000]	valid_0

In [115]:
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values

In [116]:
submission.to_csv('../태양광예측/pilryoung_0111_v1.csv', index=False)