In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv("TRAIN.csv").drop(columns="id")
df = train_df.drop(columns="insomnia")
df.head()

Unnamed: 0,age,weight,height,sex,stress,doctor,sport,pernicious_1,pernicious_2,ubp,lbp
0,50.35729,62.0,168,2,1,1,1,0,0,110,80
1,55.381246,85.0,156,1,3,1,1,0,0,140,90
2,51.627652,64.0,165,1,3,1,0,0,0,130,70
3,48.249144,82.0,169,2,1,1,1,0,0,150,100
4,47.841205,56.0,156,1,1,1,0,0,0,100,60


In [3]:
df["sex"] -= 1
df["stress"] -= 1
df["doctor"] -= 1
df.head()

Unnamed: 0,age,weight,height,sex,stress,doctor,sport,pernicious_1,pernicious_2,ubp,lbp
0,50.35729,62.0,168,1,0,0,1,0,0,110,80
1,55.381246,85.0,156,0,2,0,1,0,0,140,90
2,51.627652,64.0,165,0,2,0,0,0,0,130,70
3,48.249144,82.0,169,1,0,0,1,0,0,150,100
4,47.841205,56.0,156,0,0,0,0,0,0,100,60


In [4]:
from sklearn.model_selection import train_test_split

## Check balance

In [6]:
train_df["insomnia"].value_counts(normalize=True)

0    0.5003
1    0.4997
Name: insomnia, dtype: float64

In [7]:
from sklearn.model_selection import cross_validate
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

## Training a model to predict `sport` column given the other ones for data imputing

In [8]:
params = {'n_estimators' : np.arange(300,400,20),
          'learning_rate' : [0.01, 0.05, 0.1]}

gs = GridSearchCV(CatBoostClassifier(verbose=False), params, cv=5, scoring='accuracy', verbose=True, n_jobs=-1)
gs.fit(
    df.drop(columns="sport"),
    df["sport"]
)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostClassifier object at 0x000002022D3E0B20>,
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.05, 0.1],
                         'n_estimators': array([300, 320, 340, 360, 380])},
             scoring='accuracy', verbose=True)

In [9]:
gs.best_score_, gs.best_params_

(0.8038142857142857, {'learning_rate': 0.05, 'n_estimators': 380})

In [10]:
cv_res_sport = cross_validate(
    CatBoostClassifier(
        n_estimators=380,
        verbose=False,
        learning_rate=0.05
    ),
    df.drop(columns="sport"),
    df["sport"],
    cv=5, 
    scoring='accuracy', 
    return_train_score=True,
    verbose=True,
    n_jobs=-1
)

print(f"train: {cv_res_sport['train_score'].mean()}")
print(f"test: {cv_res_sport['test_score'].mean()}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


train: 0.8044821428571428
test: 0.8038142857142857


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.3s finished


In [11]:
sport_model = CatBoostClassifier(
    n_estimators=380,
    verbose=False,
    learning_rate=0.05,
)
sport_model.fit(df.drop(columns="sport"), df["sport"],)

<catboost.core.CatBoostClassifier at 0x2022d3e0850>

## pernicious_1

In [12]:
params = {'n_estimators' : np.arange(300,400,20),
          'learning_rate' : [0.01, 0.05, 0.1]}

gs = GridSearchCV(CatBoostClassifier(verbose=False), params, cv=5, scoring='accuracy', verbose=True, n_jobs=-1)
gs.fit(
    df.drop(columns="pernicious_1"),
    df["pernicious_1"]
)

gs.best_score_, gs.best_params_

Fitting 5 folds for each of 15 candidates, totalling 75 fits


(0.9230857142857143, {'learning_rate': 0.01, 'n_estimators': 300})

In [13]:
cv_res_pernicious_1 = cross_validate(
    CatBoostClassifier(
        n_estimators=300,
        verbose=False,
        learning_rate=0.01
    ),
    df.drop(columns="pernicious_1"),
    df["pernicious_1"],
    cv=5, 
    scoring='accuracy', 
    return_train_score=True,
    verbose=True,
    n_jobs=-1
)

print(f"train: {cv_res_pernicious_1['train_score'].mean()}")
print(f"test: {cv_res_pernicious_1['test_score'].mean()}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


train: 0.9236464285714285
test: 0.9230857142857143


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


In [14]:
per_1_model = CatBoostClassifier(
    n_estimators=300,
    verbose=False,
    learning_rate=0.01,
)
per_1_model.fit(df.drop(columns="pernicious_1"), df["pernicious_1"],)

<catboost.core.CatBoostClassifier at 0x2022e9c7580>

## pernicious_2

In [15]:
params = {'n_estimators' : np.arange(300,400,20),
          'learning_rate' : [0.01, 0.05, 0.1]}

gs = GridSearchCV(CatBoostClassifier(verbose=False), params, cv=5, scoring='accuracy', verbose=True, n_jobs=-1)
gs.fit(
    df.drop(columns="pernicious_2"),
    df["pernicious_2"]
)

gs.best_score_, gs.best_params_

Fitting 5 folds for each of 15 candidates, totalling 75 fits


(0.9462428571428572, {'learning_rate': 0.01, 'n_estimators': 360})

In [16]:
cv_res_pernicious_2 = cross_validate(
    CatBoostClassifier(
        n_estimators=360,
        verbose=False,
        learning_rate=0.01
    ),
    df.drop(columns="pernicious_2"),
    df["pernicious_2"],
    cv=5, 
    scoring='accuracy', 
    return_train_score=True,
    verbose=True,
    n_jobs=-1
)

print(f"train: {cv_res_pernicious_2['train_score'].mean()}")
print(f"test: {cv_res_pernicious_2['test_score'].mean()}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


train: 0.9462678571428572
test: 0.9462428571428572


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.6s finished


In [17]:
per_2_model = CatBoostClassifier(
    n_estimators=360,
    verbose=False,
    learning_rate=0.01,
)
per_2_model.fit(df.drop(columns="pernicious_2"), df["pernicious_2"])

<catboost.core.CatBoostClassifier at 0x2022d543fa0>

## Training Prediction Model

In [18]:
params = {'n_estimators' : np.arange(300,400,20),
          'learning_rate' : [0.01, 0.05, 0.1]}

gs = GridSearchCV(CatBoostClassifier(verbose=False), params, cv=5, scoring='accuracy', verbose=True, n_jobs=-1)
gs.fit(
    train_df.drop(columns="insomnia"),
    train_df["insomnia"]
)

gs.best_score_, gs.best_params_

Fitting 5 folds for each of 15 candidates, totalling 75 fits


(0.7367142857142858, {'learning_rate': 0.1, 'n_estimators': 380})

In [19]:
cv_res_insomnia = cross_validate(
    CatBoostClassifier(
        n_estimators=380,
        verbose=False,
        learning_rate=0.1
    ),
    train_df.drop(columns="insomnia"),
    train_df["insomnia"],
    cv=5, 
    scoring='accuracy', 
    return_train_score=True,
    verbose=True,
    n_jobs=-1
)

print(f"train: {cv_res_insomnia['train_score'].mean()}")
print(f"test: {cv_res_insomnia['test_score'].mean()}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


train: 0.750357142857143
test: 0.7367142857142858


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.2s finished


In [20]:
insomnia_model = CatBoostClassifier(
    n_estimators=380,
    verbose=False,
    learning_rate=0.1,
)
insomnia_model.fit(train_df.drop(columns="insomnia"), train_df["insomnia"])

<catboost.core.CatBoostClassifier at 0x2022e9ae460>

## Predicting missing values on test set

In [28]:
test_df = pd.read_csv("TEST.csv")

In [29]:
test_df.loc[test_df['sport'].isna(), "sport"] = sport_model.predict(
    test_df[test_df['sport'].isna()].drop(columns=["id", "sport"])
)
test_df.loc[test_df['pernicious_1'].isna(), "pernicious_1"] = per_1_model.predict(
    test_df[test_df['pernicious_1'].isna()].drop(columns=["id", "pernicious_1"])
)
test_df.loc[test_df['pernicious_2'].isna(), "pernicious_2"] = per_2_model.predict(
    test_df[test_df['pernicious_2'].isna()].drop(columns=["id", "pernicious_2"])
)

In [30]:
feat_research = pd.DataFrame(test_df.isna().sum() / test_df.shape[0], columns=['train_null_share'])
feat_research['test_null_share'] = test_df.isna().sum() / test_df.shape[0]
feat_research['train_dtypes'] = test_df.dtypes
feat_research['train_mean'] = test_df.mean()
feat_research['test_mean'] = test_df.mean()

feat_research

Unnamed: 0,train_null_share,test_null_share,train_dtypes,train_mean,test_mean
id,0.0,0.0,int64,50062.6869,50062.6869
age,0.0,0.0,float64,53.266334,53.266334
weight,0.0,0.0,float64,74.120615,74.120615
height,0.0,0.0,int64,164.322267,164.322267
sex,0.0,0.0,int64,1.353733,1.353733
stress,0.0,0.0,int64,1.368033,1.368033
doctor,0.0,0.0,int64,1.2227,1.2227
sport,0.0,0.0,float64,0.824567,0.824567
pernicious_1,0.0,0.0,float64,0.0839,0.0839
pernicious_2,0.0,0.0,float64,0.048967,0.048967


In [52]:
preds = insomnia_model.predict(test_df, prediction_type="Probability")
# preds = insomnia_model.predict(test_df)

In [54]:
preds[:5]

array([[0.47723199, 0.52276801],
       [0.48397724, 0.51602276],
       [0.60412834, 0.39587166],
       [0.49197389, 0.50802611],
       [0.79136225, 0.20863775]])

In [55]:
submission = pd.read_csv("sample_submission.csv")
submission.head()

Unnamed: 0,id,insomnia
0,5,0.5
1,6,0.5
2,7,0.5
3,10,0.5
4,11,0.5


In [56]:
submission["insomnia"] = preds[:, 1]
submission.head()

Unnamed: 0,id,insomnia
0,5,0.522768
1,6,0.516023
2,7,0.395872
3,10,0.508026
4,11,0.208638


In [57]:
submission.to_csv("sub.csv", index=False)