In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
import seaborn as sns
from cycler import cycler
from IPython.display import display
import datetime

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif

plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print()
print('Train')
display(train.head(3))

print()
print('Test')
display(test.head(3))

print('Dataframe shapes:', train.shape, test.shape)
print()


Train


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,...,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1



Test


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30
0,900000,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,...,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,BAAABADLAC,99.478419,0,0
1,900001,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,...,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,AFABBAEGCB,-65.993825,1,0
2,900002,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3,3,...,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,BBACABBKEE,-87.405622,0,1


Dataframe shapes: (900000, 33) (700000, 32)



In [3]:
train_copied = train.copy()
test_copied = test.copy()
def load_data():
    train = train_copied.copy()
    test = test_copied.copy()
    return train,test

In [51]:
train,test = load_data()

In [52]:
for df in [train, test]:
    for i in range(10):
        df[f'ch{i}'] = df.f_27.str.get(i).apply(ord) - ord('A')
    df["unique_characters"] = df.f_27.apply(lambda s: len(set(s)))

In [53]:
for df in [train, test]:
    df['i_02_21'] = (df.f_21 + df.f_02 > 5.2).astype(int) - (df.f_21 + df.f_02 < -5.3).astype(int)
    df['i_05_22'] = (df.f_22 + df.f_05 > 5.1).astype(int) - (df.f_22 + df.f_05 < -5.4).astype(int)
    i_00_01_26 = df.f_00 + df.f_01 + df.f_26
    df['i_00_01_26'] = (i_00_01_26 > 5.0).astype(int) - (i_00_01_26 < -5.0).astype(int)

In [54]:
feature_drop = ['f_27']
for df in [train,test]:
    df.drop(feature_drop,axis = 1, inplace = True)

In [55]:
X = train.drop(['id','target'],axis = 1)
y = train['target']

In [56]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_test = test.drop(['id'],axis = 1)

In [58]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score
catboost = CatBoostClassifier(iterations=1500, learning_rate=0.5)
catboost.fit(X_train,y_train,eval_set=(X_val,y_val))
y_val_pred_cat = catboost.predict(X_val)
print(f"Accuracy validation: {accuracy_score(y_val,y_val_pred_cat)}")
print(f"Precision validation: {precision_score(y_val,y_val_pred_cat)}")
print(f"Recall validation: {recall_score(y_val,y_val_pred_cat)}")

0:	learn: 0.6061443	test: 0.6070476	best: 0.6070476 (0)	total: 104ms	remaining: 2m 35s
1:	learn: 0.5640418	test: 0.5651513	best: 0.5651513 (1)	total: 226ms	remaining: 2m 49s
2:	learn: 0.5134770	test: 0.5147625	best: 0.5147625 (2)	total: 353ms	remaining: 2m 56s
3:	learn: 0.4816077	test: 0.4833088	best: 0.4833088 (3)	total: 515ms	remaining: 3m 12s
4:	learn: 0.4566991	test: 0.4586164	best: 0.4586164 (4)	total: 686ms	remaining: 3m 25s
5:	learn: 0.4393251	test: 0.4411549	best: 0.4411549 (5)	total: 772ms	remaining: 3m 12s
6:	learn: 0.4190971	test: 0.4205013	best: 0.4205013 (6)	total: 874ms	remaining: 3m 6s
7:	learn: 0.4048963	test: 0.4066282	best: 0.4066282 (7)	total: 984ms	remaining: 3m 3s
8:	learn: 0.3780189	test: 0.3799570	best: 0.3799570 (8)	total: 1.1s	remaining: 3m 2s
9:	learn: 0.3572903	test: 0.3591961	best: 0.3591961 (9)	total: 1.21s	remaining: 3m
10:	learn: 0.3475242	test: 0.3492444	best: 0.3492444 (10)	total: 1.34s	remaining: 3m 1s
11:	learn: 0.3320769	test: 0.3344109	best: 0.33441

In [59]:
y_test = catboost.predict_proba(X_test)

In [29]:
y_test

array([[1.38829290e-06, 9.99998612e-01],
       [1.65670903e-05, 9.99983433e-01],
       [1.00000000e+00, 6.30934402e-12],
       ...,
       [9.90087110e-01, 9.91289049e-03],
       [9.99509640e-01, 4.90359727e-04],
       [1.00000000e+00, 3.54328132e-15]])

In [60]:
id = test['id']
df_submiss = pd.DataFrame({'id':id,'target':y_test[:,1]})
df_submiss.to_csv('submission.csv',index = False)