In [25]:
# installing catboost
# Catboost == 0.22 was the version of catboost at the start of this competition
!pip install catboost==0.22 --quiet

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [27]:
# Importing libraries
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from tqdm import tqdm, tqdm_notebook
from functools import reduce
from catboost import CatBoostRegressor, CatBoostClassifier
import joblib


import warnings
warnings.filterwarnings('ignore')


In [28]:
# Read the training and test tables from the ./input folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/Train.csv', './input/Test.csv')
)

In [29]:
# Feature interaction functions
# There are 4 types of interactions: product interactions, division interactions, sum interactions and difference interactions

def add_prod_interacts(df, inter_cols):
    """Append the pairwise products of the columns named in ``inter_cols``.

    For every pair ``(a, b)`` in which ``a`` appears before ``b`` in
    ``inter_cols``, one column named ``a + '_prod_' + b`` is added, holding
    ``df[a] * df[b]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``inter_cols``.
    inter_cols : sequence of str
        Names of the columns to combine.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` followed by the product columns.
        ``df`` itself is left unchanged.
    """
    inter_cols = list(inter_cols)
    # Build every new column first and concatenate once: the pairs are walked
    # by position, so no name lookup is needed and the frame is not grown
    # one column at a time.
    interactions = [
        (df[left] * df[right]).rename(left + '_prod_' + right)
        for pos, left in enumerate(inter_cols)
        for right in inter_cols[pos + 1:]
    ]
    return pd.concat([df] + interactions, axis=1)


def add_div_interacts(df, inter_cols):
    """Append the pairwise quotients of the columns named in ``inter_cols``.

    For every pair ``(a, b)`` in which ``a`` appears before ``b`` in
    ``inter_cols``, one column named ``a + '_div_' + b`` is added, holding
    ``df[a] / df[b]``. No guard against zero denominators is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``inter_cols``.
    inter_cols : sequence of str
        Names of the columns to combine.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` followed by the quotient columns.
        ``df`` itself is left unchanged.
    """
    inter_cols = list(inter_cols)
    # Build every new column first and concatenate once: the pairs are walked
    # by position, so no name lookup is needed and the frame is not grown
    # one column at a time.
    interactions = [
        (df[left] / df[right]).rename(left + '_div_' + right)
        for pos, left in enumerate(inter_cols)
        for right in inter_cols[pos + 1:]
    ]
    return pd.concat([df] + interactions, axis=1)

def add_sum_interacts(df, inter_cols):
    """Append the pairwise sums of the columns named in ``inter_cols``.

    For every pair ``(a, b)`` in which ``a`` appears before ``b`` in
    ``inter_cols``, one column named ``a + '_sum_' + b`` is added, holding
    ``df[a] + df[b]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``inter_cols``.
    inter_cols : sequence of str
        Names of the columns to combine.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` followed by the sum columns.
        ``df`` itself is left unchanged.
    """
    inter_cols = list(inter_cols)
    # Build every new column first and concatenate once: the pairs are walked
    # by position, so no name lookup is needed and the frame is not grown
    # one column at a time.
    interactions = [
        (df[left] + df[right]).rename(left + '_sum_' + right)
        for pos, left in enumerate(inter_cols)
        for right in inter_cols[pos + 1:]
    ]
    return pd.concat([df] + interactions, axis=1)

def add_diff_interacts(df, inter_cols):
    """Append the pairwise differences of the columns named in ``inter_cols``.

    For every pair ``(a, b)`` in which ``a`` appears before ``b`` in
    ``inter_cols``, one column named ``a + '_diff_' + b`` is added, holding
    ``df[a] - df[b]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``inter_cols``.
    inter_cols : sequence of str
        Names of the columns to combine.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` followed by the difference columns.
        ``df`` itself is left unchanged.
    """
    inter_cols = list(inter_cols)
    # Build every new column first and concatenate once: the pairs are walked
    # by position, so no name lookup is needed and the frame is not grown
    # one column at a time.
    interactions = [
        (df[left] - df[right]).rename(left + '_diff_' + right)
        for pos, left in enumerate(inter_cols)
        for right in inter_cols[pos + 1:]
    ]
    return pd.concat([df] + interactions, axis=1)

In [30]:
# Read ./input/Train.csv and ./input/Test.csv into `train` and `test`
# (the next cell rebinds both names, so it starts from freshly read frames)
train = pd.read_csv(filepath_or_buffer='./input/Train.csv')
test = pd.read_csv(filepath_or_buffer='./input/Test.csv')

In [31]:
# Keep the label column aside; the alignment below keeps only the columns
# that train and test have in common
#
target = train['target']

# Restrict both frames to their shared columns (inner join on the column axis)
#
train, test = train.align(test, axis=1, join='inner')

# Tag every row with its origin (0 = train, 1 = test) so the two sets can be
# told apart again after they have been processed together
#
train['separator'], test['separator'] = 0, 1

# Stack the train rows on top of the test rows
#
comb = pd.concat([train, test])

# Creating a function to replace all spaces in the dataframe with np.nan
#
def replace_nan(x):
    """Convert one raw reading to ``float``, mapping blank text to ``np.nan``.

    Parameters
    ----------
    x : str or number
        A single reading. A string that is empty or holds only whitespace
        (for example ``" "``) stands for a missing value.

    Returns
    -------
    float
        ``np.nan`` for blank strings, otherwise ``float(x)``.

    Raises
    ------
    ValueError
        If ``x`` is a non-blank string that ``float`` cannot parse.
    """
    # Treat "", " ", "   " ... alike: float() would raise on any of them.
    if isinstance(x, str) and not x.strip():
        return np.nan
    return float(x)

# Names of the raw weather columns; each cell is parsed below as one
# comma-separated string of readings
#
main_cols = ["temp","precip","rel_humidity","wind_dir","wind_spd","atmos_press"]

def _parse_readings(raw):
    """Split one comma-separated cell into a list of floats.

    The text "nan" is first swapped for a single space, which replace_nan
    turns into np.nan.
    """
    tokens = raw.replace("nan"," ").split(",")
    return [replace_nan(token) for token in tokens]

# Turning every cell of the main columns into a list of floats
#
for col in main_cols:
    comb[col] = comb[col].apply(_parse_readings)

def make_columns(feature, n_hours=121):
    """Build the numbered column names ``feature_1`` ... ``feature_{n_hours}``.

    Parameters
    ----------
    feature : str
        Prefix placed before the underscore.
    n_hours : int, optional
        How many names to generate; numbering starts at 1. Defaults to 121.

    Returns
    -------
    list of str
        The generated names, in ascending order.
    """
    return [f"{feature}_{hour}" for hour in range(1, n_hours + 1)]
    
# Generating dataframes of hours for each main column: every list of readings
# is spread over the numbered columns produced by make_columns
#
def _expand_hourly(feature):
    """Return the readings of `feature` as one column each, plus comb's 'ID' column."""
    wide = pd.DataFrame(list(comb[feature]), columns=make_columns(feature))
    wide['ID'] = list(comb.ID)
    return wide

comb_temp         = _expand_hourly('temp')
comb_precip       = _expand_hourly('precip')
comb_rel_humidity = _expand_hourly('rel_humidity')
comb_wind_dir     = _expand_hourly('wind_dir')
comb_wind_spd     = _expand_hourly('wind_spd')
comb_atmos_press  = _expand_hourly('atmos_press')

# Combining the generated dataframes together
#
# Every frame on the right carries exactly comb's own list of IDs, so a left
# join returns the same rows as an outer join. pandas documents how='left' as
# preserving the left key order, whereas how='outer' is documented to sort the
# keys lexicographically. The row order matters because later cells pair the
# train rows with `target` by position.
comb_dfs = [comb, comb_temp, comb_precip, comb_rel_humidity, comb_wind_dir, comb_wind_spd, comb_atmos_press]
comb = reduce(lambda left, right: pd.merge(left, right, on=['ID'], how='left'), comb_dfs)

# The raw list-valued columns are no longer needed once they are expanded
comb = comb.drop(columns=main_cols)

# Snapshot of the expanded frame
df = comb.copy()

In [32]:
# Rebuild `comb` from the snapshot `df` taken at the end of the previous cell
# (presumably so the feature cells below can be re-run without redoing the parsing)
comb = df.copy()

In [33]:
# Row-wise summary statistics over the columns of each weather variable.
# Each new column is named first letter + last letter of the variable,
# an underscore, then the statistic (for example 'tp_mean' for temp).
#
aggs = ['mean', 'std', 'var', 'kurt', 'skew', 'max', 'median', 'sum', 'mode', 'sem', 'min']
for col in tqdm_notebook(main_cols):
  prefix = col[0] + col[-1]
  for ag in tqdm_notebook(aggs, leave = False):
    readings = comb[[name for name in comb.columns if name.startswith(col)]]
    stat = readings.agg(ag, axis = 1)
    # For 'mode' only the entry labelled 0 of the aggregation result is kept
    comb[prefix + '_' + ag] = stat[0] if ag == 'mode' else stat

# Column names belonging to each weather variable (selected by name prefix)
#
temp_cols = [x for x in comb.columns if x.startswith('temp')]
precip_cols = [x for x in comb.columns if x.startswith('precip')]
humid_cols = [x for x in comb.columns if x.startswith('rel_humidity')]
wind_dir_cols = [x for x in comb.columns if x.startswith('wind_dir')]
wind_spd_cols = [x for x in comb.columns if x.startswith('wind_spd')]
atmp_cols = [x for x in comb.columns if x.startswith('atmos_press')]

# One dataframe per variable, holding just those columns
#
temp, precip, humid = comb[temp_cols], comb[precip_cols], comb[humid_cols]
wind_dir, wind_spd, atmp = comb[wind_dir_cols], comb[wind_spd_cols], comb[atmp_cols]

fill_cols = comb.columns

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [34]:
# Previewing the head of the generated dataframe
# (first five rows of comb, which now also holds the aggregate columns)
#
comb.head()

Unnamed: 0,ID,location,separator,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,...,as_std,as_var,as_kurt,as_skew,as_max,as_median,as_sum,as_mode,as_sem,as_min
0,ID_train_0,C,0,,,,,,,,...,0.072682,0.005283,-0.158696,-0.383144,87.871667,87.762083,1404.038939,87.614167,0.01817,87.614167
1,ID_train_1,D,0,22.533333,21.716667,20.833333,20.983333,20.875,20.141667,19.375,...,0.156648,0.024539,-0.44634,-0.173356,90.725,90.429167,10942.020833,90.219167,0.014241,90.056667
2,ID_train_10,A,0,28.975,27.95,29.6,26.425,22.091667,21.775,22.333333,...,0.180233,0.032484,-0.227481,-0.243561,88.813333,88.425,10610.511667,88.2875,0.016453,87.9825
3,ID_train_100,A,0,22.966667,24.266667,25.275,25.625,25.866667,25.091667,24.025,...,0.16243,0.026384,-0.462889,-0.34761,88.685,88.4,10693.606667,88.271667,0.014766,87.965
4,ID_train_1000,A,0,21.875,21.575,21.525,21.433333,20.508333,19.916667,18.991667,...,0.120393,0.014494,-0.062557,-0.705667,88.719167,88.5525,2656.143106,88.268333,0.021981,88.268333


In [35]:
# Collecting the mean / max / min columns into one list each.
# A column goes into the first list whose keyword appears in its name,
# checked in the order mean, max, min.
cols_mean = []
cols_max = []
cols_min = []
keyword_buckets = {'mean': cols_mean, 'max': cols_max, 'min': cols_min}
for column in tqdm_notebook(comb.columns):
  for keyword, bucket in keyword_buckets.items():
    if keyword in column:
      bucket.append(column)
      break

# Generating feature interactions between aggregates only:
# products, quotients and differences within each of the three lists
for num_cols in tqdm_notebook([cols_mean, cols_max, cols_min], leave = False):
    for add_interacts in (add_prod_interacts, add_div_interacts, add_diff_interacts):
        comb = add_interacts(comb, num_cols)

HBox(children=(FloatProgress(value=0.0, max=795.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

In [36]:
# Generating new features, by adding each variable per hour:
# temp + precip + rel_humidity + wind_spd + atmos_press (wind_dir is not included)
for x, y, z, a, b in zip(temp.columns, precip.columns, humid.columns, wind_spd.columns, atmp.columns):
  # Take the hour number from the text after the last underscore. A fixed-width
  # slice would pull letters of 'precip' into the name for one- and two-digit
  # hours, giving inconsistently named columns.
  hour = y.rsplit('_', 1)[-1]
  comb['add_tp_' + hour] = temp[x] + precip[y] + humid[z] + wind_spd[a] + atmp[b]

In [37]:
# Filling missing values using forward fill
# axis=1 fills along each row: a missing cell takes the last valid value found
# in the columns to its left.
# NOTE(review): a gap at the start of a variable's columns is therefore filled
# from whatever column precedes it (possibly a column of a different kind) - confirm this is intended.
comb = comb.ffill(axis = 1)

In [38]:
# Column-to-column change within each per-variable dataframe (wind_dir is not included)

dfs = [temp, precip, humid, wind_spd, atmp]

diff_dfs = []
for frame in dfs:
  # Label prefix = text before the first underscore of the frame's first column name
  name = str(frame.columns[0].split('_')[0])
  deltas = frame.diff(axis = 1).values
  labels = ['diff_' + name + '_' + str(step) for step in range(1, 122)]
  diff_dfs.append(pd.DataFrame(deltas, columns=labels))

# The diff frames come first, comb last; all are joined on the row index
diff_dfs.append(comb)
merged = diff_dfs[0]
for right in diff_dfs[1:]:
  merged = pd.merge(merged, right, left_index=True, right_index=True, how='outer')
comb = merged

In [39]:
# Preview the first five rows of comb after the diff_* columns were merged in
comb.head()

Unnamed: 0,diff_temp_1,diff_temp_2,diff_temp_3,diff_temp_4,diff_temp_5,diff_temp_6,diff_temp_7,diff_temp_8,diff_temp_9,diff_temp_10,...,add_tp_112,add_tp_113,add_tp_114,add_tp_115,add_tp_116,add_tp_117,add_tp_118,add_tp_119,add_tp_120,add_tp_121
0,,,,,,,,,,,...,111.009,110.396,109.991,110.031,110.085,109.618,109.161,108.627,108.839,109.081
1,,-0.816667,-0.883333,0.15,-0.108333,-0.733333,-0.766667,-0.583333,-0.016667,-0.133333,...,118.268,119.334,120.191,122.078,122.734,123.072,121.885,119.338,118.589,114.357
2,,-1.025,1.65,-3.175,-4.333333,-0.316667,0.558333,-0.383333,-1.508333,0.0,...,111.332,110.389,110.174,110.092,110.746,113.401,116.527,118.449,119.659,120.692
3,,1.3,1.008333,0.35,0.241667,-0.775,-1.066667,-1.6,-2.191667,-2.816667,...,109.088,108.214,108.176,107.477,107.332,108.305,110.405,112.255,114.158,115.68
4,,-0.3,-0.05,-0.091667,-0.925,-0.591667,-0.925,-0.4,-0.541667,-0.133333,...,110.649,110.649,110.649,110.649,110.649,110.649,110.649,110.649,110.649,110.649


In [40]:
def apply_qcut(feat, q=24, data=None):
    """Bin one column into quantile buckets and return the integer bucket codes.

    Parameters
    ----------
    feat : str
        Name of the column to bin.
    q : int, optional
        Number of quantile buckets requested. Defaults to 24.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the module-level ``comb``
        frame is used.

    Returns
    -------
    pandas.Series
        Bucket index of every row (``labels=False``). Duplicate bin edges are
        dropped, so fewer than ``q`` distinct codes may be produced.
    """
    source = comb if data is None else data
    return pd.qcut(source[feat], q, labels=False, duplicates='drop')

In [41]:
%%time
# Binning features: every column except the three identifier columns is
# replaced by its quantile bucket codes (see apply_qcut)
id_cols = ['separator', 'ID', 'location']
other_features = [x for x in comb.columns if x not in id_cols]

# Multiprocessing trick: 15 seconds instead of 7 minutes !
binned_data = joblib.Parallel(n_jobs=-1, backend='multiprocessing')(
    joblib.delayed(apply_qcut)(feat) for feat in tqdm_notebook(other_features))

comb_binned_data = pd.concat(binned_data, axis=1)
comb = pd.concat([comb[id_cols], comb_binned_data], axis=1)

# Separating train and test from the combined dataframe.
# drop() returns a new frame here, so nothing is mutated in place on a
# filtered slice of comb.
train = comb[comb.separator == 0].drop(columns='separator')
test = comb[comb.separator == 1].drop(columns='separator')

# Creating a list of test ids in the order that they will be trained
testA =  test[test.location == 'A']
testB =  test[test.location == 'B']
testC =  test[test.location == 'C']
testD =  test[test.location == 'D']
testE =  test[test.location == 'E']

tA, tD, tE = testA.ID, testD.ID, testE.ID
tBC = test[(test.location == 'B') | (test.location == 'C')].ID
test_id = pd.concat([tA, tD, tE, tBC])

# Adding back target to the train set (pandas aligns it on the row index)
train['target'] = target

HBox(children=(FloatProgress(value=0.0, max=1653.0), HTML(value='')))


CPU times: user 2.11 s, sys: 1.36 s, total: 3.48 s
Wall time: 18.5 s


In [42]:
%%time
# Feature matrix (identifier and label columns removed) and label vector
X = train.drop(columns=['ID', 'location', 'target'])
y = target.values

# Shuffling rows and labels together with a fixed seed
X, y = shuffle(X, y, random_state = 0)
tes = test.drop(columns=['ID', 'location'])

# Training one CatBoost model per seed (0 to 24) and keeping its test predictions
predictions = []
for seed in tqdm_notebook(range(25)):
  cat = CatBoostRegressor(verbose = False, random_seed=seed)
  cat.fit(X, y)

  preds = cat.predict(tes)
  predictions.append(preds)

# Averaging the predictions over the seeds
avg_preds = np.mean(predictions, axis = 0)

# Post processing of the predictions: six successive
# "subtract a shift, then multiply by a scale" steps, applied in this order
post_steps = (
    (0.85, 1.015),
    (0.85, 1.012),
    (0.75, 1.0095),
    (0.55, 1.0065),
    (0.8, 1.007),
    (0.85, 1.015),
)
post_proc = []
for value in avg_preds:
  for shift, scale in post_steps:
    value = (value - shift) * scale
  post_proc.append(value)
predzz = post_proc

# Creating a submission file
sub_df = pd.DataFrame({'ID': test.ID, 'target':post_proc})
sub_df.to_csv('model_2.csv', index = False)

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))


CPU times: user 2h 35min 4s, sys: 5min 2s, total: 2h 40min 7s
Wall time: 42min 26s


In [43]:
# Preview the first ten rows of the submission dataframe
sub_df.head(10)

Unnamed: 0,ID,target
15539,ID_test_0,154.512605
15540,ID_test_1,117.225585
15541,ID_test_10,26.247779
15542,ID_test_100,63.167372
15543,ID_test_1000,92.044408
15544,ID_test_1001,41.451966
15545,ID_test_1002,84.857269
15546,ID_test_1003,37.807807
15547,ID_test_1004,30.820292
15548,ID_test_1005,45.990773
