In [1]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import preprocessing
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [6]:
def put_together(df_leaking, path, name):
    df = reduce_mem_usage(pd.read_csv(path))
    df = df.rename(columns = {'ParcelId': 'parcelid'})
    df_leaking[name] = 0.0
    
    for i in df_leaking.index:
        temp = df.loc[df.parcelid == df_leaking['parcelid'][i],:]
        month = '2016' + str(df_leaking['month'][i])
        df_leaking[name][i] = temp[month]
    return df_leaking

In [9]:
import glob
all_file = glob.glob('../output/submission/*.csv')
csv_name = []
for i in all_file:
    csv_name.append(i.split('/')[-1].split('.')[0])
df_leaking = reduce_mem_usage(pd.read_csv('../output/leaking_data.csv').dropna().reset_index(drop=True))
df_leaking.month = df_leaking.month.astype('int')
for i in range(len(all_file)):
    df_leaking = put_together(df_leaking, all_file[i], csv_name[i])

Memory usage of dataframe is 0.20 MB
Memory usage after optimization is: 0.10 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%


In [51]:
from sko.PSO import PSO
from sko.GA import GA
# Define the genetic algorithm function
def optimization(p):
    v = 0
    for i in range(len(p)):
        v += df_leaking[csv_name[i]] * p[i]
    return np.sqrt(mean_squared_error(v, df_leaking.logerror))
pso = PSO(func=optimization, dim=8, pop=200, max_iter=300, lb=[0,0,0,0,0,0,0,0], 
          ub=[.5,.5,.5,.5,.5,.5,.5,.5], w=0.8, c1=0.5, c2=0.5)
pso.run()
print('best_x is ', pso.gbest_x, 'best_y is', pso.gbest_y)

best_x is  [0.45095925 0.         0.         0.06025154 0.39638982 0.5
 0.02443458 0.5       ] best_y is 0.14613875674835552


In [52]:
p = [0.45095925, 0,0,0.06025154,0.39638982,0.5,0.02443458,0.5]
sum(p)

1.9320351900000001

In [12]:
p = [0.0679783, 0, 0, 0, 0, 1, 0,1]
v = 0
for i in range(len(p)):
    v += df_leaking[csv_name[i]] * p[i]
np.sqrt(mean_squared_error(v, df_leaking.logerror))

0.1459036213177278

In [55]:
sample = pd.read_csv('../Resources/sample_submission.csv')

In [19]:
col_list = [x for x in sample.columns if x != 'ParcelId']

In [37]:
all_file

['../output/submission/lgb_en5_opt.csv',
 '../output/submission/lgb_3_outlier.csv',
 '../output/submission/cat5_opt.csv',
 '../output/submission/xgb5_opt.csv',
 '../output/submission/lgb_5_outlier.csv',
 '../output/submission/cat_en5_total.csv',
 '../output/submission/cat_en5_opt.csv',
 '../output/submission/lgb_3_66.csv']

In [67]:
p = [0, 0.2, 0, 0.1,0.1,0.3,0.3,0]
sum(p)

1.0

In [71]:
sample = pd.read_csv('../Resources/sample_submission.csv')
for i in range(len(p)):
    df = reduce_mem_usage(pd.read_csv(all_file[i]))
    df = df.rename(columns = {'ParcelId': 'parcelid'})
    for j in col_list:
        sample[j] += df[j] * p[i]

Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%
Memory usage of dataframe is 159.43 MB
Memory usage after optimization is: 79.71 MB
Decreased by 50.0%


In [72]:
sample.to_csv('../output/submission/ensemble.csv', index = False)

In [48]:
sample_set = sample.set_index('ParcelId')
sample_set

Unnamed: 0_level_0,201610,201611,201612,201710,201711,201712
ParcelId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10754147,-0.004084,0.006581,0.002249,0.004453,0.014758,0.009945
10759547,0.018843,0.018574,0.022269,0.027900,0.027954,0.031769
10843547,0.021962,0.031627,0.031954,0.021665,0.030265,0.031599
10859147,0.078831,0.081240,0.081257,0.075467,0.077521,0.078273
10879947,0.036754,0.032803,0.037066,0.032474,0.031184,0.032567
...,...,...,...,...,...,...
168176230,0.020938,0.011020,0.019575,0.045777,0.033245,0.042244
14273630,0.020352,0.010434,0.018989,0.037458,0.025226,0.034218
168040630,0.020938,0.011020,0.019575,0.019531,0.007378,0.016388
168040830,0.020612,0.010694,0.019249,0.154943,0.142496,0.151706


In [49]:
for i in df_leaking.index:
    month = '2016' + str(df_leaking['month'][i])
    leaking_id = df_leaking['parcelid'][i]
    sample_set[month][leaking_id] = df_leaking['logerror'][i]

In [50]:
sample_set.reset_index().to_csv('../output/submission/ensemble.csv', index = False)

In [69]:
sample.to_csv('../output/submission/ensemble.csv', index = False)

In [70]:
sample

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.039203,0.048380,0.047072,0.042681,0.050576,0.049181
1,10759547,0.028650,0.029875,0.031798,0.040130,0.040650,0.042169
2,10843547,0.044545,0.052099,0.051585,0.048365,0.053519,0.053565
3,10859147,0.086140,0.088113,0.087418,0.082915,0.083276,0.082939
4,10879947,0.047789,0.046208,0.049070,0.043565,0.043696,0.044311
...,...,...,...,...,...,...,...
2985212,168176230,0.057158,0.050584,0.056876,0.049886,0.043216,0.048576
2985213,14273630,0.055145,0.048522,0.054814,0.043824,0.036003,0.041955
2985214,168040630,0.057158,0.050584,0.056876,0.023818,0.017076,0.022982
2985215,168040830,0.057598,0.051025,0.057317,0.116153,0.108412,0.114295


In [59]:
sample_set.reset_index()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.004084,0.006581,0.002249,0.004453,0.014758,0.009945
1,10759547,0.018843,0.018574,0.022269,0.027900,0.027954,0.031769
2,10843547,0.021962,0.031627,0.031954,0.021665,0.030265,0.031599
3,10859147,0.078831,0.081240,0.081257,0.075467,0.077521,0.078273
4,10879947,0.036754,0.032803,0.037066,0.032474,0.031184,0.032567
...,...,...,...,...,...,...,...
2985212,168176230,0.020938,0.011020,0.019575,0.045777,0.033245,0.042244
2985213,14273630,0.020352,0.010434,0.018989,0.037458,0.025226,0.034218
2985214,168040630,0.020938,0.011020,0.019575,0.019531,0.007378,0.016388
2985215,168040830,0.020612,0.010694,0.019249,0.154943,0.142496,0.151706
