# OHE, Target Encoding + Logistic Regression

Ideas:
* Replace missing values with a constant
* Add the number of missing values in each row as a feature
* Apply StandardScaler to the created feature
* Apply target encoding to features that have many unique values
* Apply one-hot encoding (OHE) to the other features
* Train logistic regression on the full training set

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## Load data

In [2]:
# Read the train and test CSV files, using the 'id' column as the row index.
train, test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Show the first three rows transposed: one row per feature, one column per sample.
train.head(3).T

id,0,1,2
bin_0,0,1,0
bin_1,0,1,1
bin_2,0,0,0
bin_3,F,F,F
bin_4,N,Y,N
nom_0,Red,Red,Red
nom_1,Trapezoid,Star,
nom_2,Hamster,Axolotl,Hamster
nom_3,Russia,,Canada
nom_4,Bassoon,Theremin,Bassoon


In [4]:
def summary(df):
    """Build a per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. Any index is accepted; sample values are taken by
        row position, not by index label.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Name``, ``dtypes``,
        ``Missing`` (number of nulls), ``Uniques`` (number of distinct
        non-null values) and ``First Value`` / ``Second Value`` /
        ``Third Value`` (the values found in the first three rows; NaN when
        ``df`` has fewer rows than that).
    """
    overview = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        if position < len(df):
            # iloc: "first/second/third" means row position, whatever the index labels are.
            overview[label] = df.iloc[position].values
        else:
            overview[label] = np.nan
    return overview


# Per-column overview of the training data: dtype, missing count, unique count and sample values.
summary(train)

Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value
0,bin_0,float64,17894,2,0,1,0
1,bin_1,float64,18003,2,0,1,1
2,bin_2,float64,17930,2,0,0,0
3,bin_3,object,18014,2,F,F,F
4,bin_4,object,18047,2,N,Y,N
5,nom_0,object,18252,3,Red,Red,Red
6,nom_1,object,18156,6,Trapezoid,Star,
7,nom_2,object,18035,6,Hamster,Axolotl,Hamster
8,nom_3,object,18121,6,Russia,,Canada
9,nom_4,object,18035,4,Bassoon,Theremin,Bassoon


## Handle missing values

Add the number of missing values in each row as a feature.

In [5]:
# Count the nulls in every row and store the count as a new
# 'missing_count' column on both frames.
for frame in (train, test):
    frame['missing_count'] = frame.isnull().sum(axis=1)

Replace missing values with constants

In [6]:
# Sentinel used to fill missing values in the numerical columns.
missing_number = -99999
# Placeholder category used to fill missing values in the string columns.
missing_string = 'MISSING_STRING'

In [7]:
# Columns that are filled with the numeric sentinel when missing.
numerical_features = [f'bin_{i}' for i in range(3)] + ['ord_0', 'day', 'month']

# Columns that are filled with the string placeholder when missing.
string_features = (
    ['bin_3', 'bin_4']
    + [f'ord_{i}' for i in range(1, 6)]
    + [f'nom_{i}' for i in range(10)]
)

In [8]:
def impute(train, test, columns, value):
    """Fill nulls in the given columns of both frames with a constant.

    Both ``train`` and ``test`` are modified in place; nothing is returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain every column named in ``columns``.
    columns : iterable of str
        Names of the columns to fill.
    value : scalar
        Replacement for every null in those columns.
    """
    for name in columns:
        for frame in (train, test):
            frame[name] = frame[name].fillna(value)

In [9]:
# Fill each feature group with the placeholder that matches its type.
for feature_group, fill_value in (
    (numerical_features, missing_number),
    (string_features, missing_string),
):
    impute(train, test, feature_group, fill_value)

## Feature engineering

Split `ord_5` into its first and second characters (`ord_5_1`, `ord_5_2`), keeping the missing-value placeholder in both new columns.

In [10]:
def split_ord_5(df, missing_value):
    """Return ``df`` with 'ord_5' replaced by 'ord_5_1' and 'ord_5_2'.

    'ord_5_1' holds the first character of 'ord_5' and 'ord_5_2' the second.
    Rows whose 'ord_5' equals ``missing_value`` keep ``missing_value`` in both
    new columns instead of characters of the placeholder itself.
    """
    source = df['ord_5']
    is_missing = source == missing_value
    return df.assign(
        ord_5_1=source.str[0].mask(is_missing, missing_value),
        ord_5_2=source.str[1].mask(is_missing, missing_value),
    ).drop('ord_5', axis=1)


train = split_ord_5(train, missing_string)
test = split_ord_5(test, missing_string)

In [11]:
# Features that are only standard-scaled.
simple_features = ['missing_count']

# Features that are one-hot encoded.
ohe_features = (
    [f'bin_{i}' for i in range(5)]
    + [f'nom_{i}' for i in range(5)]
    + [f'ord_{i}' for i in range(5)]
    + ['ord_5_1', 'ord_5_2', 'day', 'month']
)

# Features that go through the target-based encoder.
target_features = [f'nom_{i}' for i in range(5, 10)]

## Extract target variable

In [12]:
# Separate the features from the 'target' column, take an independent copy
# of the test features, then drop the original frame names.
x_train = train.drop(columns='target')
y_train = train['target'].copy()
x_test = test.copy()
del train, test

## Standard scaler

In [13]:
from sklearn.preprocessing import StandardScaler


# Fit the scaler on the training rows only, then apply the same scaling
# to both the training and the test rows.
scaler = StandardScaler().fit(x_train[simple_features])
simple_x_train = scaler.transform(x_train[simple_features])
simple_x_test = scaler.transform(x_test[simple_features])

## OHE

In [14]:
from sklearn.preprocessing import OneHotEncoder


# Learn the categories from the training rows only; categories that appear
# only in the test rows are handled by handle_unknown='ignore'.
ohe = OneHotEncoder(handle_unknown='ignore', dtype='uint16').fit(x_train[ohe_features])
ohe_x_train = ohe.transform(x_train[ohe_features])
ohe_x_test = ohe.transform(x_test[ohe_features])

## Target encoder

In [31]:
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.model_selection import StratifiedKFold

In [32]:
def transform(transformer, x_train, y_train, cv):
    """Encode ``x_train`` out-of-fold.

    For every split produced by ``cv`` the transformer is fitted on the
    training part only and then applied to the held-out part, so each row is
    encoded by a transformer that was not fitted on that row.

    Parameters
    ----------
    transformer : object
        Anything exposing ``fit(X, y)`` and ``transform(X)``. It is refitted
        once per fold and is left fitted on the last fold on return.
    x_train : pandas.DataFrame
        Features to encode.
    y_train : pandas.Series
        Target values, in the same row order as ``x_train``.
    cv : object
        Splitter whose ``split(X, y)`` yields ``(train, valid)`` arrays of
        row positions.

    Returns
    -------
    pandas.DataFrame
        Frame with the index and columns of ``x_train`` holding the
        out-of-fold encodings (object dtype; cast it as needed). Rows that
        never appear in a validation fold stay NaN.
    """
    oof = pd.DataFrame(index=x_train.index, columns=x_train.columns)
    for train_pos, valid_pos in cv.split(x_train, y_train):
        # The splitter returns row positions, not index labels, so select and
        # assign with iloc; loc is only correct when the index is 0..n-1.
        transformer.fit(x_train.iloc[train_pos], y_train.iloc[train_pos])
        encoded = transformer.transform(x_train.iloc[valid_pos])
        if isinstance(encoded, pd.DataFrame):
            # Keep name-based column alignment: columns the transformer did
            # not return stay NaN, unexpected extra columns are ignored.
            encoded = encoded.reindex(columns=oof.columns).to_numpy()
        oof.iloc[valid_pos] = encoded
    return oof

In [38]:
# Encoder applied to the high-cardinality features listed in target_features.
# NOTE(review): TargetEncoder is imported as well, but only CatBoostEncoder is used here.
target = CatBoostEncoder(drop_invariant=True, a=0.2,sigma=0.05)

# Training rows are encoded out-of-fold: for each of the 5 stratified folds the
# encoder is fitted on the other folds and applied to the held-out rows.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
target_x_train = transform(target, x_train[target_features], y_train, cv).astype('float')

# For the test rows the encoder is refitted on the complete training set first;
# this must happen after the out-of-fold pass, which leaves it fitted on the last fold.
target.fit(x_train[target_features], y_train)
target_x_test = target.transform(x_test[target_features]).astype('float')

## Merge all

In [39]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# make `scipy.sparse` available on every SciPy version (it only works when
# some other package has already imported it).
import scipy.sparse


# Stack the scaled, one-hot encoded and target-encoded blocks side by side
# and convert the result to CSR format.
final_x_train = scipy.sparse.hstack([simple_x_train, ohe_x_train, target_x_train]).tocsr()
final_x_test = scipy.sparse.hstack([simple_x_test, ohe_x_test, target_x_test]).tocsr()

## Logistic regression

In [40]:
from sklearn.linear_model import LogisticRegression


# Train on the complete training matrix (no hold-out split in this cell).
logit = LogisticRegression(
    C=0.54321,
    solver='lbfgs',
    max_iter=10000,
    verbose=1,
).fit(final_x_train, y_train)

# Keep the second probability column (index 1) as the prediction;
# presumably this is the probability of target == 1 — TODO confirm against logit.classes_.
y_pred = logit.predict_proba(final_x_test)[:, 1]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   45.6s finished


## Submit predictions

In [41]:
# Start from the sample submission (indexed by 'id'), set its 'target'
# column to the predictions and write the result to 'logit.csv'.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as the rows of test.csv — confirm.
submission = pd.read_csv('sample_submission.csv', index_col='id').assign(target=y_pred)
submission.to_csv('logit.csv')

In [28]:
# NOTE(review): this preview has a lower execution count than the cell above and
# its output differs from the last preview in the notebook — it looks like a
# leftover from an earlier run; consider deleting this cell.
submission.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
600000,0.141813
600001,0.255423
600002,0.154422
600003,0.09296
600004,0.136414


In [37]:
# NOTE(review): this preview has a lower execution count than the cell that
# writes the submission — it looks like a leftover from an earlier run;
# consider deleting this cell.
submission.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
600000,0.141223
600001,0.254925
600002,0.154099
600003,0.092716
600004,0.136065


In [42]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
600000,0.141223
600001,0.254925
600002,0.154099
600003,0.092716
600004,0.136065
