In [2]:
!pip install scikit-opt --user

Collecting scikit-opt
  Using cached https://files.pythonhosted.org/packages/b1/c4/35919cabffc2b5c76a792fed3c0de3548a9eed3ed1a86f6f2a4d25a24680/scikit_opt-0.5.0-py3-none-any.whl
Installing collected packages: scikit-opt
Successfully installed scikit-opt-0.5.0


In [1]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import preprocessing
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
path = '../../Resources/'
test_df = pd.read_csv(path + 'test.csv')
building_meta_df = pd.read_csv(path + 'building_metadata.csv')

In [4]:
leak_df = pd.read_feather('../../Large_output/leaking/leak.feather')
leak_df.fillna(0, inplace=True)
leak_df = leak_df[(leak_df.timestamp.dt.year > 2016) & (leak_df.timestamp.dt.year < 2019)]
leak_df.loc[leak_df.meter_reading < 0, 'meter_reading'] = 0 # remove large negative values
leak_df = leak_df[leak_df.building_id!=245]

In [5]:
sub_path = '../../Large_output/good_submission/'
sample_submission1 = pd.read_csv(sub_path + 'lgb3_site0_change.csv', index_col=0)
sample_submission2 = pd.read_csv(sub_path + 'lgb_half.csv', index_col=0)
sample_submission3 = pd.read_csv(sub_path + 'lgb3_cleaned.csv', index_col=0)
sample_submission4 = pd.read_csv(sub_path + 'cat_clean_no_bayes.csv', index_col=0)
sample_submission5 = pd.read_csv(sub_path + 'xgb_no_bayes_clean.csv', index_col=0)
sample_submission6 = pd.read_csv(sub_path + 'nn_clean.csv', index_col=0)

  mask |= (ar1 == a)


In [6]:
test_df['pred1'] = sample_submission1.meter_reading
test_df['pred2'] = sample_submission2.meter_reading
test_df['pred3'] = sample_submission3.meter_reading
test_df['pred4'] = sample_submission4.meter_reading
test_df['pred5'] = sample_submission5.meter_reading
test_df['pred6'] = sample_submission6.meter_reading

del  sample_submission1,  sample_submission2,  sample_submission3,sample_submission4,sample_submission5,sample_submission6
gc.collect()

0

In [7]:
test_df["timestamp"] = pd.to_datetime(test_df["timestamp"], format='%Y-%m-%d %H:%M:%S')
leak_df["timestamp"] = pd.to_datetime(leak_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

In [9]:
leak_df = leak_df.merge(test_df[['building_id', 'meter', 'timestamp', 'pred1',\
                                 'pred2', 'pred3','pred4','pred5','pred6',\
                                 'row_id']], on = ['building_id', 'meter', 'timestamp'], how = "left")
leak_df = leak_df.merge(building_meta_df[['building_id', 'site_id']], on='building_id', how='left')

In [14]:
leak_df['meter_reading_l1p'] = np.log1p(leak_df.meter_reading)

In [10]:
leak_df

Unnamed: 0,building_id,meter,meter_reading,timestamp,pred1,pred2,pred3,pred4,pred5,pred6,row_id,site_id
0,0,0.0,173.370293,2017-01-01 00:00:00,196.794798,82.730302,134.285627,121.220393,131.142820,204.162670,0,0
1,1,0.0,53.512720,2017-01-01 00:00:00,98.870458,62.356422,69.056423,55.516887,61.839855,99.557440,1,0
2,2,0.0,6.143042,2017-01-01 00:00:00,23.472831,11.211765,10.880237,10.712582,6.077408,13.165344,2,0
3,3,0.0,101.701470,2017-01-01 00:00:00,276.100799,154.241294,251.449750,353.429545,335.934840,302.159640,3,0
4,4,0.0,1141.240666,2017-01-01 00:00:00,1159.726712,860.610848,1112.319660,1332.812678,1362.286000,1109.127600,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11937496,1363,0.0,184.524994,2018-12-31 19:00:00,271.613198,247.946070,278.245812,195.938456,275.569240,198.379180,41497410,15
11937497,1363,0.0,183.600006,2018-12-31 20:00:00,269.931399,244.945510,275.208346,187.017012,268.388500,194.510350,41497660,15
11937498,1363,0.0,178.475006,2018-12-31 21:00:00,261.406075,227.145438,260.905062,179.807843,269.436900,184.452240,41497910,15
11937499,1363,0.0,179.725006,2018-12-31 22:00:00,238.664772,217.781872,245.831210,178.231907,254.699950,180.633790,41498160,15


In [11]:
from sko.PSO import PSO
from sko.GA import GA
# Define the genetic algorithm function
def optimization(p):
    w1,w2,w3,w4,w5,w6 = p
    v = w1 * leak_df['pred1'].values + w2 * leak_df['pred2'].values + w3 * leak_df['pred3'] +\
    w4 * leak_df['pred4'].values + w5 * leak_df['pred5'].values + w6 * leak_df['pred6']
    vl1p = np.log1p(v)
    print(np.sqrt(mean_squared_error(vl1p, leak_df.meter_reading_l1p)))
    return np.sqrt(mean_squared_error(vl1p, leak_df.meter_reading_l1p))

In [None]:
#Run this: PSO algorithm iter 100 test to test the score
ga = GA(func=optimization, n_dim=6, size_pop=100, max_iter=100, lb=[0,0,0,0,0,0], ub=[0.5,0.5,0.5,0.5,0.5,0.5], precision=1e-7)
ga.run()
print('best_x is ', pso.gbest_x, 'best_y is', pso.gbest_y)

1.109737976015069
1.071967825760083
1.009634291515726
1.0602919102496149
0.9950684836406741
1.1528982240925243
1.020739429068769
1.2836804753728865
0.9704735033863185
1.1416376890257836
1.0065571252861902
1.189731310079097
1.0797603426075733
1.1301383242364194
1.1686875150064888
0.9682509576139967
1.1957461385869983
1.1412918637774978
1.0886561175828984
1.1614582069958226
0.9827337167035173
1.2518159375433986
1.2650653212118772
1.2288951759484743
1.2121881049880472
1.044555622440699
0.9909368117998765
1.003520317984029
1.1196916185244783
1.1082087077841833
1.0663890810809376
1.1716893643061608
1.1573231566378412
1.1430672722410262
1.1252819988262788
0.9947758698792276
1.2154651555819007
1.078155950875734
1.0357618186686322
1.0275512184023252
1.0999413138079304
1.1342679687175607
0.9727911805687389
1.0646035246892527
1.181390346608529
1.0161485493232443
0.9943709833489309
1.3060904980027233
1.2094952243933599
1.121675883540353
1.1587971983839018
1.0007995606360416
1.119933874116876
1.02

In [13]:
print('best_x is ', pso.gbest_x, 'best_y is', pso.gbest_y)

best_x is  [0.5        0.         0.         0.09939658 0.20748924 0.08029802] best_y is 0.9634


In [16]:
w1 = 0.5 
w2 = 0.01895657
w3 = 0.00168529
w4 = 0.25909864
w5 = 0.12004669
w6 = 0

In [17]:
v = w1 * leak_df['pred1'].values + w2 * leak_df['pred2'].values + w3 * leak_df['pred3'] +\
w4 * leak_df['pred4'].values + w5 * leak_df['pred5'].values + w6 * leak_df['pred6']
vl1p = np.log1p(v)
np.sqrt(mean_squared_error(vl1p, leak_df.meter_reading_l1p))

0.963

In [18]:
sample_submission = pd.read_csv(path + 'sample_submission.csv')

In [19]:
sample_submission['meter_reading'] = w1 * test_df.pred1 +  w2 * test_df.pred3  + w3 * test_df.pred2+\
      w4 * test_df.pred4+w5 * test_df.pred5+w6 * test_df.pred6
sample_submission.loc[sample_submission.meter_reading < 0, 'meter_reading'] = 0

In [20]:
# Reindex the row_id, put the meter_reading into sample_submission
leak_df = leak_df[['meter_reading', 'row_id']].set_index('row_id').dropna()
sample_submission.loc[leak_df.index, 'meter_reading'] = leak_df['meter_reading']

In [22]:
# Save the submission
sample_submission.to_csv('../../Large_output/ensemble_pso_second_try.csv', index=False, float_format='%.4f')

In [None]:
sample_submission.head()