In [None]:
# installing catboost
# Catboost == 0.22 was the version of catboost at the start of this competition
!pip install catboost==0.22 --quiet

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import warnings
import joblib

from tqdm import tqdm, tqdm_notebook
from functools import reduce
from time import time

from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.utils import shuffle

pd.set_option('display.max_rows', 1000)           
warnings.filterwarnings('ignore')

In [None]:
# Loading data
train = pd.read_csv('./input/Train.csv')
test = pd.read_csv('./input/Test.csv')

In [None]:
start = time()

In [None]:
# Separating the target variable from the training dataframe
#
target = train.target

# Aligning the train and test dataframes
#
train, test = train.align(test, join='inner', axis=1)

# Creating a separator column to both train and test,
# This is to be used in separation
#
train['separator'] = 0
test['separator'] = 1

# Combing the train and test dataframes together
#
comb = pd.concat([train, test])

# Creating a function to replace all spaces in the dataframe with np.nan
#
def replace_nan(x):
    if x == " ":
        return np.nan
    else:
        return float(x)

# Creating a list of the main columns
#
main_cols = ["temp", "precip", "rel_humidity", "wind_dir", "wind_spd", "atmos_press"]

# Replacing spaces with np.nan
#
for col in main_cols: 
    comb[col] = comb[col].apply(lambda x: [replace_nan(X) for X in x.replace("nan", " ").split(",")])

def make_columns(feature):
    return [f"{feature}_{i}" for i in range(1, 122)]
    
# Generating dataframes of hours for each main column
#
comb_temp         = pd.DataFrame([x for x in comb.temp],         columns=make_columns('temp'))
comb_precip       = pd.DataFrame([x for x in comb.precip],       columns=make_columns('precip'))
comb_rel_humidity = pd.DataFrame([x for x in comb.rel_humidity], columns=make_columns('rel_humidity'))
comb_wind_dir     = pd.DataFrame([x for x in comb.wind_dir],     columns=make_columns('wind_dir'))
comb_wind_spd     = pd.DataFrame([x for x in comb.wind_spd],     columns=make_columns('wind_spd'))
comb_atmos_press  = pd.DataFrame([x for x in comb.atmos_press],  columns=make_columns('atmos_press'))

comb_temp['ID'], comb_precip['ID'], comb_rel_humidity['ID'], comb_wind_dir['ID'], comb_wind_spd['ID'], comb_atmos_press['ID'] = [list(comb.ID)] * 6

# Combining the generated dataframes together
#
comb_dfs = [comb, comb_temp, comb_precip, comb_rel_humidity, comb_wind_dir, comb_wind_spd, comb_atmos_press]
comb = reduce(lambda  left, right: pd.merge(left, right, on=['ID'], how='outer'), comb_dfs)
comb.drop(main_cols, axis=1, inplace=True)
df = comb.copy()

In [None]:
# Creating original series for each feature
orig_cols_dict = {}
weather_cols = ['temp', 'precip', 'rel_humidity', 'wind_dir','wind_spd', 'atmos_press']

for w in tqdm_notebook(weather_cols):
    selected_cols = [c for c in df.columns if w in c]
    orig_cols_dict[w] = pd.Series(selected_cols)

In [None]:
# Aggregating features per hour

for w in tqdm_notebook(weather_cols):
    tmp_df = pd.DataFrame()
    tmp_df['weather_col_orig'] = orig_cols_dict[w]
    tmp_df['hours_since_start'] = tmp_df['weather_col_orig'].apply(lambda x: x.split('_')[-1]).astype('int')
    tmp_df['hour_of_day'] = tmp_df['hours_since_start'] % 24

    for hour in range(1, 25):
        selected_cols = tmp_df[tmp_df['hour_of_day'] == hour]['weather_col_orig'].tolist()
        df_cols = df[selected_cols] # factorizing this part
        
        df[f'{w}_hour_{hour}_mean'] = df_cols.mean(axis=1)
        df[f'{w}_hour_{hour}_min'] = df_cols.min(axis=1)
        df[f'{w}_hour_{hour}_max'] = df_cols.max(axis=1)
        df[f'{w}_hour_{hour}_range'] = df[f'{w}_hour_{hour}_max'] - df[f'{w}_hour_{hour}_min']
        df[f'{w}_hour_{hour}_skew'] = df_cols.skew()
        df[f'{w}_hour_{hour}_kurt'] = df_cols.kurt()

        if hour - 3 > 0 and hour % 3 == 0:
            df[f'{w}_hour_{hour}_prev_hour_mean_diff'] = df[f'{w}_hour_{hour}_mean'] - df[f'{w}_hour_{hour - 3}_mean']
        if hour - 5 > 0 and hour % 3 == 0:
            df[f'{w}_hour_{hour}_prev_hour_mean_diff_5'] = df[f'{w}_hour_{hour}_mean'] - df[f'{w}_hour_{hour - 5}_mean']


In [None]:
comb = df.copy()

In [None]:
comb.head()

In [None]:
# Creating aggregation features for each variable
aggs = ['mean', 'std', 'var', 'kurt', 'skew', 'max', 'median', 'sum', 'mode', 'sem', 'min']

for col in tqdm_notebook(main_cols):
    for ag in tqdm(aggs):
        selected_cols = [x for x in comb.columns if x.startswith(col)]

        if ag == 'mode':
            aggregate = comb[selected_cols].agg(ag, axis=1)[0]
        else:
            aggregate = comb[selected_cols].agg(ag, axis=1)
        
        comb[col[0] + col[-1] + '_' + ag] = aggregate

# Creating separate dataframes for each variable
# Creating a list of columns for each separate dataframe
temp_cols = [x for x in comb.columns if x.startswith('temp')]
temp = comb[temp_cols]

precip_cols = [x for x in comb.columns if x.startswith('precip')]
precip = comb[precip_cols]

humid_cols = [x for x in comb.columns if x.startswith('rel_humidity')]
humid = comb[humid_cols]

wind_dir_cols = [x for x in comb.columns if x.startswith('wind_dir')]
wind_dir = comb[wind_dir_cols]

wind_spd_cols = [x for x in comb.columns if x.startswith('wind_spd')]
wind_spd  = comb[wind_spd_cols]

atmp_cols = [x for x in comb.columns if x.startswith('atmos_press')]
atmp = comb[atmp_cols]

fill_cols = comb.columns

In [None]:
# Generating new features, by adding each variable per hour
for x, y, z, a, b in zip(temp.columns, precip.columns, humid.columns, wind_spd.columns, atmp.columns):
    comb['add_tp' + y[-4:]] = temp[x] + precip[y] + humid[z] + wind_spd[a] + atmp[b]

In [None]:
# Filling missing values using forward fill
comb = comb.ffill(axis=1)

In [None]:
comb.head()

In [None]:
def apply_qcut(feat):
    return pd.qcut(comb[feat], 24, labels=False, duplicates='drop')

In [None]:
other_features = [x for x in comb.columns if x not in ['separator', 'ID', 'location']]

# Multiprocessing trick: 15 seconds instead of 7 minutes !
binned_data = joblib.Parallel(n_jobs=-1, backend='multiprocessing')(
    joblib.delayed(apply_qcut)(feat) for feat in tqdm_notebook(other_features))

comb_binned_data = pd.concat(binned_data, axis=1)
comb = pd.concat([comb[['separator', 'ID', 'location']], comb_binned_data], axis=1)

In [None]:
comb.head()

In [None]:
# Separating train and test from the combined dataframe
train = comb[comb.separator == 0]
test = comb[comb.separator == 1]
train.drop('separator', axis=1, inplace=True)
test.drop('separator', axis=1, inplace=True)

# Creating a list of test ids in the order that they will be trained
testA =  test[test.location == 'A']
testB =  test[test.location == 'B']
testC =  test[test.location == 'C']
testD =  test[test.location == 'D']
testE =  test[test.location == 'E']

tA, tD, tE, tBC = testA.ID, testD.ID, testE.ID, test[(test.location == 'B') | (test.location == 'C')].ID
test_id = pd.concat([tA, tD, tE, tBC])

# Adding back target to the train set
train['target'] = target

In [None]:
end = time()
print(f"Total preprocessing time = {end - start:.1f}")

In [None]:
%%time
# Creating X and y values
X = train.drop(['ID', 'location', 'target'], axis=1)
y = target.values

# Shuffling the X, y values
X, y = shuffle(X, y, random_state=0)
tes = test.drop(['ID', 'location'], axis=1)

# Traing the model across multiple seeds
predictions = []
for i in tqdm_notebook(range(25)):
    cat = CatBoostRegressor(verbose=False, random_seed=i)
    cat.fit(X, y)
    
    preds = cat.predict(tes)
    predictions.append(preds)

# Averaging the predictions
avg_preds = np.mean(predictions, axis=0)

# Post processing of the predictions
# This post processing was done with the help of a validation set.
# The validation set was adversarial, i.e. we chose the examples from the training set closest to the test set, and applied post processing to it.
post_proc = [((((((((((x-0.85)*1.015)-0.85)*1.012)-0.75)*1.0095)-0.55)*1.0065)-0.8)*1.007) for x in avg_preds]
post_proc = predzz = [((x-0.85)*1.015) for x in post_proc]

# Creating a submission file
sub_df = pd.DataFrame({'ID': test.ID, 'target': post_proc})
sub_df.to_csv('model_1_1.csv', index=False)

In [None]:
%%time
# Creating a list to hold predictions per seed
predzz = []
for i in tqdm_notebook(range(25), leave=False):
    # Creating a list to hold predictions per location
    # Training model per location per seed
    predictions = []
    for area in tqdm_notebook(['A', 'D', 'E'], leave=False):
        # Separating training data per location
        X = train[train.location == area]
        y = X.target
        X = X.drop(['ID', 'location', 'target'], axis=1)

        # Shuffling data
        X, y = shuffle(X, y, random_state=0)

        # Separating testing data per location
        tes = test[test.location == area]
        tes = tes.drop(['ID', 'location'], axis=1)

        # Training the model and making predictions per seed, per location
        preds = CatBoostRegressor(verbose=False, random_seed=i).fit(X, y).predict(tes)
        predictions.extend(preds)

    X = train[(train.location == 'B') | (train.location == 'C')]
    y = X.target
    X = X.drop(['ID', 'location', 'target'], axis=1)
    X, y = shuffle(X, y, random_state=0)

    tes = test[(test.location == 'B') | (test.location == 'C')]
    tes = tes.drop(['ID', 'location'], axis=1)
    preds = CatBoostRegressor(verbose=False, random_seed=i).fit(X, y).predict(tes)
    predictions.extend(preds)

    predzz.append(predictions)

# Averaging the predictions
preds_av = np.mean(predzz, axis=0)

# Post processing of the predictions
# This post processing was done with the help of a validation set.
# The validation set was adversarial, i.e. we chose the examples from the training set closest to the test set, and applied post processing to it.
predz = [((((((((((x-0.85)*1.015)-0.85)*1.012)-0.75)*1.0095)-0.55)*1.0065)-0.8)*1.007) for x in preds_av]
predzz = [((x-0.85)*1.015) for x in predz]

# Creating a submission file
sub_df = pd.DataFrame({'ID': test_id, 'target': predzz})
sub_df.to_csv('model_1_2.csv', index = False)

In [48]:
blend_df = pd.read_csv('model_1_1.csv')[['ID']]
blend_df['A'] = pd.read_csv('model_1_1.csv')['target']
blend_df = pd.merge(blend_df, pd.read_csv('model_1_2.csv').rename({'target': 'B'}, axis=1), on = 'ID', how = 'left')
blend_df.corr()

Unnamed: 0,A,B
A,1.0,0.979202
B,0.979202,1.0


In [53]:
blend_df['target'] = blend_df['A']*0.5 + blend_df['B']*0.5
blend_df[['ID', 'target']].to_csv('model_12_blend.csv', index=False)

In [54]:
blend_df.corr()

Unnamed: 0,A,B,target
A,1.0,0.979202,0.994761
B,0.979202,1.0,0.994813
target,0.994761,0.994813,1.0


In [55]:
SUB_FILE_NAME = 'model_12_blend.csv'

In [57]:
blend_df[['ID', 'target']].head(10)

Unnamed: 0,ID,target
0,ID_test_0,158.123774
1,ID_test_1,97.217908
2,ID_test_10,21.393733
3,ID_test_100,63.222891
4,ID_test_1000,92.0462
5,ID_test_1001,44.955298
6,ID_test_1002,83.270765
7,ID_test_1003,36.458014
8,ID_test_1004,34.101068
9,ID_test_1005,47.728921


In [56]:
from IPython.display import HTML
def create_download_link(title = "Download CSV file", filename = "data.csv"):  
    html = '<a href={filename}>{title}</a>'
    html = html.format(title=title,filename=filename)
    return HTML(html)
create_download_link(filename = SUB_FILE_NAME)