In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import catboost as cat


In [2]:
data = pd.read_csv('data/train.csv')
data.shape

(300000, 26)

In [3]:
data.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [4]:
data = pd.get_dummies(data)
data.shape

(300000, 72)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 72 columns):
id        300000 non-null int64
cont0     300000 non-null float64
cont1     300000 non-null float64
cont2     300000 non-null float64
cont3     300000 non-null float64
cont4     300000 non-null float64
cont5     300000 non-null float64
cont6     300000 non-null float64
cont7     300000 non-null float64
cont8     300000 non-null float64
cont9     300000 non-null float64
cont10    300000 non-null float64
cont11    300000 non-null float64
cont12    300000 non-null float64
cont13    300000 non-null float64
target    300000 non-null float64
cat0_A    300000 non-null uint8
cat0_B    300000 non-null uint8
cat1_A    300000 non-null uint8
cat1_B    300000 non-null uint8
cat2_A    300000 non-null uint8
cat2_B    300000 non-null uint8
cat3_A    300000 non-null uint8
cat3_B    300000 non-null uint8
cat3_C    300000 non-null uint8
cat3_D    300000 non-null uint8
cat4_A    300000 non-null u

In [6]:
Y = data['target']
X = data.drop(['target'], axis=1)
print(Y.shape, X.shape)

(300000,) (300000, 71)


In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=2021, shuffle=True)
print(X_train.shape, X_test.shape)

(270000, 71) (30000, 71)


In [8]:
print(Y_train.shape, Y_test.shape)

(270000,) (30000,)


In [9]:
def rmse(y_pred, y_actual):
    return np.sqrt(1/len(y_pred) * np.sum(np.square(y_actual - y_pred)))

In [10]:
rmse(np.array([1,2,3,4]), np.array([1.5, 2.5, 3.5, 4.5]))

0.5

In [11]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()
# Don't cheat - fit only on training data
scaler.fit(X_train)  
X_train = scaler.transform(X_train)  
# apply same transformation to test data
X_test = scaler.transform(X_test)  

In [12]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 25,svd_solver = 'randomized')
pca_fit = pca.fit(X_train)
X_train = pca_fit.transform(X_train)
X_test = pca_fit.transform(X_test)

In [13]:
model = cat.CatBoostRegressor(random_state=2021)
model.fit(X_train, Y_train)
rmse(model.predict(X_train), Y_train)

Learning rate set to 0.105726
0:	learn: 0.8842714	total: 247ms	remaining: 4m 6s
1:	learn: 0.8820164	total: 302ms	remaining: 2m 30s
2:	learn: 0.8800967	total: 357ms	remaining: 1m 58s
3:	learn: 0.8784222	total: 412ms	remaining: 1m 42s
4:	learn: 0.8770797	total: 465ms	remaining: 1m 32s
5:	learn: 0.8759324	total: 522ms	remaining: 1m 26s
6:	learn: 0.8750072	total: 589ms	remaining: 1m 23s
7:	learn: 0.8741423	total: 664ms	remaining: 1m 22s
8:	learn: 0.8735058	total: 733ms	remaining: 1m 20s
9:	learn: 0.8728398	total: 817ms	remaining: 1m 20s
10:	learn: 0.8723071	total: 889ms	remaining: 1m 19s
11:	learn: 0.8718359	total: 959ms	remaining: 1m 18s
12:	learn: 0.8714114	total: 1.02s	remaining: 1m 17s
13:	learn: 0.8710328	total: 1.1s	remaining: 1m 17s
14:	learn: 0.8707388	total: 1.16s	remaining: 1m 16s
15:	learn: 0.8704640	total: 1.22s	remaining: 1m 15s
16:	learn: 0.8702015	total: 1.29s	remaining: 1m 14s
17:	learn: 0.8699562	total: 1.35s	remaining: 1m 13s
18:	learn: 0.8697526	total: 1.41s	remaining: 1

163:	learn: 0.8632573	total: 9.89s	remaining: 50.4s
164:	learn: 0.8632188	total: 9.95s	remaining: 50.4s
165:	learn: 0.8631822	total: 10s	remaining: 50.3s
166:	learn: 0.8631512	total: 10.1s	remaining: 50.2s
167:	learn: 0.8631178	total: 10.1s	remaining: 50.1s
168:	learn: 0.8630865	total: 10.2s	remaining: 50s
169:	learn: 0.8630605	total: 10.2s	remaining: 49.8s
170:	learn: 0.8630164	total: 10.3s	remaining: 49.8s
171:	learn: 0.8629727	total: 10.3s	remaining: 49.8s
172:	learn: 0.8629398	total: 10.4s	remaining: 49.7s
173:	learn: 0.8629050	total: 10.5s	remaining: 49.6s
174:	learn: 0.8628833	total: 10.5s	remaining: 49.5s
175:	learn: 0.8628609	total: 10.5s	remaining: 49.4s
176:	learn: 0.8628260	total: 10.6s	remaining: 49.4s
177:	learn: 0.8627901	total: 10.7s	remaining: 49.4s
178:	learn: 0.8627535	total: 10.7s	remaining: 49.3s
179:	learn: 0.8627124	total: 10.8s	remaining: 49.2s
180:	learn: 0.8626805	total: 10.9s	remaining: 49.1s
181:	learn: 0.8626489	total: 10.9s	remaining: 49s
182:	learn: 0.8626

325:	learn: 0.8583441	total: 19.3s	remaining: 39.8s
326:	learn: 0.8583160	total: 19.3s	remaining: 39.8s
327:	learn: 0.8582873	total: 19.4s	remaining: 39.7s
328:	learn: 0.8582566	total: 19.4s	remaining: 39.7s
329:	learn: 0.8582255	total: 19.5s	remaining: 39.6s
330:	learn: 0.8581992	total: 19.6s	remaining: 39.5s
331:	learn: 0.8581752	total: 19.6s	remaining: 39.5s
332:	learn: 0.8581485	total: 19.7s	remaining: 39.4s
333:	learn: 0.8581172	total: 19.7s	remaining: 39.4s
334:	learn: 0.8580883	total: 19.8s	remaining: 39.3s
335:	learn: 0.8580557	total: 19.9s	remaining: 39.3s
336:	learn: 0.8580271	total: 20s	remaining: 39.3s
337:	learn: 0.8580011	total: 20s	remaining: 39.2s
338:	learn: 0.8579748	total: 20.1s	remaining: 39.1s
339:	learn: 0.8579407	total: 20.1s	remaining: 39.1s
340:	learn: 0.8579215	total: 20.2s	remaining: 39s
341:	learn: 0.8578950	total: 20.2s	remaining: 38.9s
342:	learn: 0.8578678	total: 20.3s	remaining: 38.9s
343:	learn: 0.8578290	total: 20.4s	remaining: 38.8s
344:	learn: 0.8578

484:	learn: 0.8542168	total: 28.6s	remaining: 30.3s
485:	learn: 0.8541884	total: 28.6s	remaining: 30.3s
486:	learn: 0.8541620	total: 28.7s	remaining: 30.2s
487:	learn: 0.8541320	total: 28.7s	remaining: 30.1s
488:	learn: 0.8541101	total: 28.8s	remaining: 30.1s
489:	learn: 0.8540929	total: 28.8s	remaining: 30s
490:	learn: 0.8540675	total: 28.9s	remaining: 29.9s
491:	learn: 0.8540445	total: 28.9s	remaining: 29.9s
492:	learn: 0.8540197	total: 29s	remaining: 29.8s
493:	learn: 0.8539925	total: 29.1s	remaining: 29.8s
494:	learn: 0.8539730	total: 29.1s	remaining: 29.7s
495:	learn: 0.8539573	total: 29.1s	remaining: 29.6s
496:	learn: 0.8539352	total: 29.2s	remaining: 29.5s
497:	learn: 0.8539025	total: 29.3s	remaining: 29.5s
498:	learn: 0.8538727	total: 29.3s	remaining: 29.4s
499:	learn: 0.8538401	total: 29.4s	remaining: 29.4s
500:	learn: 0.8538185	total: 29.5s	remaining: 29.3s
501:	learn: 0.8537983	total: 29.5s	remaining: 29.3s
502:	learn: 0.8537707	total: 29.6s	remaining: 29.2s
503:	learn: 0.85

646:	learn: 0.8503321	total: 37.6s	remaining: 20.5s
647:	learn: 0.8503125	total: 37.6s	remaining: 20.4s
648:	learn: 0.8502887	total: 37.7s	remaining: 20.4s
649:	learn: 0.8502672	total: 37.8s	remaining: 20.3s
650:	learn: 0.8502434	total: 37.8s	remaining: 20.3s
651:	learn: 0.8502227	total: 37.9s	remaining: 20.2s
652:	learn: 0.8502027	total: 37.9s	remaining: 20.2s
653:	learn: 0.8501877	total: 38s	remaining: 20.1s
654:	learn: 0.8501627	total: 38.1s	remaining: 20.1s
655:	learn: 0.8501462	total: 38.1s	remaining: 20s
656:	learn: 0.8501249	total: 38.2s	remaining: 19.9s
657:	learn: 0.8501082	total: 38.2s	remaining: 19.9s
658:	learn: 0.8500873	total: 38.3s	remaining: 19.8s
659:	learn: 0.8500675	total: 38.4s	remaining: 19.8s
660:	learn: 0.8500460	total: 38.4s	remaining: 19.7s
661:	learn: 0.8500249	total: 38.5s	remaining: 19.6s
662:	learn: 0.8500050	total: 38.5s	remaining: 19.6s
663:	learn: 0.8499806	total: 38.6s	remaining: 19.5s
664:	learn: 0.8499550	total: 38.6s	remaining: 19.5s
665:	learn: 0.84

806:	learn: 0.8467234	total: 47.4s	remaining: 11.3s
807:	learn: 0.8466966	total: 47.5s	remaining: 11.3s
808:	learn: 0.8466796	total: 47.6s	remaining: 11.2s
809:	learn: 0.8466549	total: 47.7s	remaining: 11.2s
810:	learn: 0.8466286	total: 47.7s	remaining: 11.1s
811:	learn: 0.8466097	total: 47.8s	remaining: 11.1s
812:	learn: 0.8465908	total: 47.8s	remaining: 11s
813:	learn: 0.8465671	total: 47.9s	remaining: 10.9s
814:	learn: 0.8465437	total: 47.9s	remaining: 10.9s
815:	learn: 0.8465154	total: 48s	remaining: 10.8s
816:	learn: 0.8464896	total: 48.1s	remaining: 10.8s
817:	learn: 0.8464641	total: 48.1s	remaining: 10.7s
818:	learn: 0.8464415	total: 48.2s	remaining: 10.7s
819:	learn: 0.8464143	total: 48.3s	remaining: 10.6s
820:	learn: 0.8463999	total: 48.3s	remaining: 10.5s
821:	learn: 0.8463785	total: 48.4s	remaining: 10.5s
822:	learn: 0.8463613	total: 48.4s	remaining: 10.4s
823:	learn: 0.8463361	total: 48.5s	remaining: 10.4s
824:	learn: 0.8463111	total: 48.6s	remaining: 10.3s
825:	learn: 0.84

966:	learn: 0.8431926	total: 57.5s	remaining: 1.96s
967:	learn: 0.8431681	total: 57.5s	remaining: 1.9s
968:	learn: 0.8431459	total: 57.6s	remaining: 1.84s
969:	learn: 0.8431237	total: 57.6s	remaining: 1.78s
970:	learn: 0.8431077	total: 57.7s	remaining: 1.72s
971:	learn: 0.8430929	total: 57.8s	remaining: 1.66s
972:	learn: 0.8430723	total: 57.8s	remaining: 1.6s
973:	learn: 0.8430565	total: 57.9s	remaining: 1.54s
974:	learn: 0.8430352	total: 58s	remaining: 1.49s
975:	learn: 0.8430179	total: 58s	remaining: 1.43s
976:	learn: 0.8430005	total: 58.1s	remaining: 1.37s
977:	learn: 0.8429762	total: 58.1s	remaining: 1.31s
978:	learn: 0.8429584	total: 58.2s	remaining: 1.25s
979:	learn: 0.8429414	total: 58.2s	remaining: 1.19s
980:	learn: 0.8429222	total: 58.3s	remaining: 1.13s
981:	learn: 0.8428969	total: 58.3s	remaining: 1.07s
982:	learn: 0.8428762	total: 58.4s	remaining: 1.01s
983:	learn: 0.8428513	total: 58.5s	remaining: 951ms
984:	learn: 0.8428304	total: 58.5s	remaining: 891ms
985:	learn: 0.8428

0.8425326147468172

In [14]:
test_predict = model.predict(X_test)
rmse(test_predict, Y_test)

0.8686348911115489