2nd Iteration for Tensorflow Competition

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder,QuantileTransformer,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score

In [16]:
train_df = pd.read_csv("../Dataset/train.csv", index_col=0)
test_df = pd.read_csv("../Dataset/test.csv", index_col=0)
train_df.shape, test_df.shape

((300000, 25), (200000, 24))

In [17]:
train_df.head()

Unnamed: 0_level_0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.877618,0.724417,0.895799,0.42165,0.281421,0.124454,0.923191,0.719903,0.701915,0.802461,...,A,B,A,A,I,E,D,A,B,6.994023
2,0.326679,0.613252,0.593413,0.34623,0.282354,0.357438,0.437627,0.808464,0.741289,0.546056,...,A,B,A,B,F,E,B,A,A,8.071256
3,0.869133,0.264104,0.86562,0.369602,0.293756,0.454644,0.732209,0.828352,0.695561,0.825251,...,A,B,A,A,N,B,D,C,A,5.760456
4,0.809799,0.494269,0.868099,0.57893,0.769785,0.153735,0.705142,0.614766,0.698125,0.794402,...,A,B,A,A,K,E,D,C,A,7.806457
6,0.343457,0.724447,0.440967,0.70594,0.279105,0.496212,0.486063,0.297743,0.683073,0.462146,...,A,B,A,A,F,E,B,A,B,6.868974


In [18]:
test_df.head()


Unnamed: 0_level_0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.653654,0.285147,0.202234,0.286912,0.701679,0.486284,0.331261,0.302448,0.264308,0.242654,...,E,A,B,A,A,G,E,D,C,B
5,0.318492,0.562065,0.857073,0.397436,0.27748,0.296852,0.402404,0.736251,0.730542,0.516393,...,C,A,B,A,A,L,E,D,C,B
15,0.333572,0.836193,0.586934,0.695284,0.279508,0.760865,0.530677,0.273905,0.759788,0.548555,...,C,A,B,A,A,F,E,D,C,B
16,0.58908,0.414131,0.442475,0.240049,0.479503,0.715786,0.439653,0.311625,0.255382,0.596746,...,E,A,B,B,A,F,E,D,A,A
17,0.28586,0.710961,0.170475,0.329851,0.757845,0.315269,0.389844,0.794931,0.224045,0.263235,...,E,A,B,A,A,I,E,B,A,B


In [22]:
X = train_df.loc[:, "cont0":"cat9"]
y = train_df["target"]
X.head().T

id,1,2,3,4,6
cont0,0.877618,0.326679,0.869133,0.809799,0.343457
cont1,0.724417,0.613252,0.264104,0.494269,0.724447
cont2,0.895799,0.593413,0.86562,0.868099,0.440967
cont3,0.42165,0.34623,0.369602,0.57893,0.70594
cont4,0.281421,0.282354,0.293756,0.769785,0.279105
cont5,0.124454,0.357438,0.454644,0.153735,0.496212
cont6,0.923191,0.437627,0.732209,0.705142,0.486063
cont7,0.719903,0.808464,0.828352,0.614766,0.297743
cont8,0.701915,0.741289,0.695561,0.698125,0.683073
cont9,0.802461,0.546056,0.825251,0.794402,0.462146


In [23]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y.values, test_size=0.2)

feature_transformation = ColumnTransformer([
    ("cat", Pipeline([("encoder", OrdinalEncoder()), ("scaler", StandardScaler())]),
     [f"cat{i}" for i in range(10)]),
    ("num", StandardScaler(),
     [f"cont{i}" for i in range(14)])

])

X_train_prepared = feature_transformation.fit_transform(X_train)
X_valid_prepared = feature_transformation.transform(X_valid)

In [24]:
X_train_prepared, X_valid_prepared, y_train, y_valid

(array([[ 1.7209471 , -0.14133734, -0.02283337, ...,  1.84072055,
          1.35967766, -1.68353232],
        [-0.54231232, -0.14133734, -0.02283337, ..., -0.22478545,
         -0.06653168, -1.65858686],
        [-0.54231232, -0.14133734, -0.02283337, ...,  1.6998605 ,
         -1.19928991,  1.03009869],
        ...,
        [ 0.58931739, -0.14133734, -0.02283337, ..., -0.58358994,
         -0.31120149,  0.91445388],
        [-0.54231232, -0.14133734, -0.02283337, ...,  0.29616504,
         -0.30331425, -0.08130836],
        [-1.67394203, -0.14133734, -0.02283337, ..., -0.20861256,
          0.56086576, -1.55982575]]),
 array([[-1.67394203, -0.14133734, -0.02283337, ...,  1.79658069,
          1.34738609,  0.46704159],
        [-0.54231232, -0.14133734, -0.02283337, ..., -0.4754945 ,
         -1.11299436, -1.96186313],
        [-0.54231232, -0.14133734, -0.02283337, ..., -0.71173512,
         -1.07744527,  0.59570969],
        ...,
        [-0.54231232, -0.14133734, -0.02283337, ..., -

In [25]:
model_params = {
    "task_type": "GPU",
    "devices": "0",
    "loss_function": "RMSE",
    "n_estimators": 10000,
    "learning_rate": 0.03628302216953097,
    "reg_lambda": 0.0008746338866473539,
    "subsample": 0.7875490025178415,
    "max_depth": 3,
    "bootstrap_type": "Bernoulli"
}
model = CatBoostRegressor(**model_params)
model.fit(X_train_prepared, y_train, verbose=100)

model.predict(X_train_prepared)

0:	learn: 0.8871093	total: 4.41ms	remaining: 44.1s
100:	learn: 0.8643954	total: 328ms	remaining: 32.1s
200:	learn: 0.8580469	total: 632ms	remaining: 30.8s
300:	learn: 0.8546505	total: 929ms	remaining: 29.9s
400:	learn: 0.8523592	total: 1.23s	remaining: 29.5s
500:	learn: 0.8506920	total: 1.53s	remaining: 29.1s
600:	learn: 0.8492886	total: 1.84s	remaining: 28.7s
700:	learn: 0.8482068	total: 2.14s	remaining: 28.4s
800:	learn: 0.8471955	total: 2.44s	remaining: 28s
900:	learn: 0.8464054	total: 2.75s	remaining: 27.8s
1000:	learn: 0.8456420	total: 3.05s	remaining: 27.4s
1100:	learn: 0.8449951	total: 3.35s	remaining: 27.1s
1200:	learn: 0.8444095	total: 3.65s	remaining: 26.7s
1300:	learn: 0.8438426	total: 3.95s	remaining: 26.4s
1400:	learn: 0.8433473	total: 4.25s	remaining: 26.1s
1500:	learn: 0.8429127	total: 4.58s	remaining: 26s
1600:	learn: 0.8424737	total: 5s	remaining: 26.2s
1700:	learn: 0.8420731	total: 5.39s	remaining: 26.3s
1800:	learn: 0.8416912	total: 5.74s	remaining: 26.1s
1900:	learn

array([7.42732928, 7.42591028, 7.39812534, ..., 7.39247794, 7.25004955,
       7.41675727])

In [26]:
y_pred = model.predict(X_valid_prepared)
y_pred

array([7.35006864, 7.40201455, 7.06164413, ..., 7.38502877, 7.32354603,
       7.72309742])

In [27]:
pd.DataFrame.from_dict({
    "MAE": mean_squared_error(y_valid, y_pred, squared=False),
    "MSE": mean_squared_error(y_valid, y_pred, squared=True),
    "R^2": r2_score(y_valid, y_pred)
}, orient="index", columns=["Errors"])

Unnamed: 0,Errors
MAE,0.842187
MSE,0.709279
R^2,0.093628


In [28]:
test_df_prepared = feature_transformation.transform(test_df)
target_pred = model.predict(test_df_prepared)

output = pd.DataFrame({'Id': test_df.index,
                       'target': target_pred})

output.to_csv('../Submission_output/submission1.csv', index=False)