In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
from tqdm import tqdm
import catboost as ctb
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import f1_score, classification_report


from sklearn.model_selection import StratifiedKFold, cross_val_score
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.preprocessing import PolynomialFeatures

import scikitplot as skplt

import matplotlib.pyplot as plt

In [2]:
# Read the labelled training split and the test split from HDF5.
df_train = pd.read_hdf("../input/train_taiwan.h5")
df_test = pd.read_hdf("../input/test_taiwan.h5")

# Stack train on top of test under a fresh 0..n-1 index so that features can be
# engineered on both splits at once.
df_all = pd.concat([df_train, df_test], ignore_index=True)

# Normalise column names: spaces become underscores, everything lower-case.
df_all = df_all.rename(columns=lambda name: name.replace(" ", "_").lower())

# Shapes of the two splits and of the combined frame.
(df_train.shape, df_test.shape, df_all.shape)

((3409, 97), (3410, 96), (6819, 97))

In [17]:
def make_experiment(model, feats=None, threshold=0.5, n_splits=3, black_list=["target"], show_feats=False, show_cr=False, show_cm=False, show_pr=False, show_lc=False):
    """Cross-validate ``model`` on the labelled rows of ``df_all`` and report F1.

    The function reads the notebook-level ``df_all`` at call time, keeps only
    rows whose ``target`` is not null, and runs a shuffled stratified K-fold
    (fixed ``random_state=0``). On every fold the model is refit, the
    positive-class probability is thresholded, and the F1 score is recorded.
    The mean and standard deviation of the fold scores are printed.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
        Refit in place on every fold.
    feats : list of column names, optional
        Feature columns. Defaults to every numeric column of ``df_all`` that
        is not listed in ``black_list``.
    threshold : float
        A row is predicted positive when its class-1 probability is strictly
        greater than this value.
    n_splits : int
        Number of stratified folds.
    black_list : list of str
        Columns excluded when ``feats`` is derived automatically. The default
        list is only read, never mutated.
    show_feats : bool
        Print the feature list that is used.
    show_cr, show_cm, show_pr : bool
        Per fold: print the classification report / plot the confusion
        matrix / plot the precision-recall curve.
    show_lc : bool
        Plot a learning curve (F1 scoring, same CV splitter) after the folds.
        The flag used to be ignored and the curve was always drawn.

    Returns
    -------
    The ``eli5.show_weights`` explanation (top 50 features) of ``model`` as it
    was fit on the last fold.
    """
    if feats is None:
        num_feats = df_all.select_dtypes("number").columns
        feats = [x for x in num_feats if x not in black_list]

    if show_feats:
        print(feats)

    # Only labelled rows can be used for cross-validation.
    labelled = df_all[df_all["target"].notnull()]
    X_train = labelled[feats].values
    y_train = labelled["target"].values

    scores = []
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for train_idx, test_idx in cv.split(X_train, y_train):
        model.fit(X_train[train_idx], y_train[train_idx])

        y_probas = model.predict_proba(X_train[test_idx])
        # Built-in int: the np.int alias was deprecated in NumPy 1.20 and
        # removed in 1.24, where it raises AttributeError.
        y_pred = (y_probas[:, 1] > threshold).astype(int)

        if show_cr:
            print(classification_report(y_train[test_idx], y_pred))

        if show_cm:
            skplt.metrics.plot_confusion_matrix(y_train[test_idx], y_pred)

        if show_pr:
            skplt.metrics.plot_precision_recall(y_train[test_idx], y_probas)

        scores.append(f1_score(y_train[test_idx], y_pred))

    print("Score: ", np.mean(scores), np.std(scores))

    # Honour the flag: the learning curve is optional like the other plots.
    if show_lc:
        skplt.estimators.plot_learning_curve(model, X_train, y_train, cv=cv, scoring="f1", random_state=0)

    return eli5.show_weights(model, feature_names=feats, top=50)

In [None]:
# Baseline run: XGBoost with 100 trees of depth 5 and a fixed seed.
# A row counts as positive once its class-1 probability exceeds 0.1.
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)
make_experiment(model, threshold=0.1)

In [None]:
# Same setup with CatBoost (100 trees, depth 5, silent training).
# The seed is pinned explicitly, like in the XGBoost run, so the experiment does
# not depend on the library's default seed.
model = ctb.CatBoostClassifier(
    max_depth=5,
    n_estimators=100,
    random_seed=0,
    verbose=0,
)
make_experiment(model, threshold=0.1, show_cm=True)

In [None]:
# Same setup with LightGBM (100 trees, depth 5).
# The seed is pinned explicitly, like in the XGBoost run, to keep the three
# model comparisons reproducible under the same conditions.
model = lgb.LGBMClassifier(
    max_depth=5,
    n_estimators=100,
    random_state=0,
)
make_experiment(model, threshold=0.1, show_cm=True)

najlepsze cechy z eli5

In [4]:
# Columns that all feature-engineering cells below operate on.
# Names follow the normalised df_all column names (lower-case, underscores).
best_feats = [
    "borrowing_dependency",
    "interest-bearing_debt_interest_rate",
    "net_income_to_total_assets",
    "average_collection_days",
    "non-industry_revenue_and_expenditure/revenue",
    "continuous_profit_rate_(after_tax)",
    "net_worth_growth_rate",
    "permanent_net_profit_growth_rate",
]

tworzenie nowych cech z best_feats

logarytm i pierwiastek

In [6]:
# Add log1p- and sqrt-transformed copies of every selected feature.
# The NumPy ufuncs are applied to the whole column at once instead of being
# wrapped in a per-element lambda; the resulting values are the same.
# Values below -1 (log1p) or below 0 (sqrt) come out as NaN.
for feat in best_feats:
    df_all['log_' + feat] = np.log1p(df_all[feat])
    df_all['sqrt_' + feat] = np.sqrt(df_all[feat])

dzielenie wartości na 5 przedziałów i factorize

In [7]:
# Bucket every selected feature into 5 equal-width intervals and store the
# bucket number in "<feat>_range".
# The categorical codes follow the interval order (0 = lowest interval,
# 4 = highest, -1 = missing value). factorize() numbered the intervals by first
# appearance instead, so the integer depended on row order rather than on the
# magnitude of the feature.
for feat in best_feats:
    df_all[feat + "_range"] = pd.cut(df_all[feat], 5).cat.codes.astype("int64")

transformacja wielomianowa 2 stopnia

In [8]:
# Expand the selected features into all polynomial terms up to degree 2
# (PolynomialFeatures defaults otherwise) and append them to df_all.
trans = PolynomialFeatures(degree=2)
poly_values = trans.fit_transform(df_all[best_feats])

# fit_transform returns a positional array. Reusing df_all's index keeps every
# generated row attached to its source row; a fresh RangeIndex would silently
# misalign (and add NaN rows) whenever df_all is not indexed 0..n-1, e.g. when
# this cell is re-run after rows were filtered out.
df_temp = pd.DataFrame(poly_values, index=df_all.index)
df_all = pd.concat([df_all, df_temp], axis=1)

# The new columns are named by integer position; make every column name a string.
df_all.columns = df_all.columns.map(str)

wygenerowanie par i dodawanie/odejmowanie parami

In [9]:
# Every ordered pair (left, right) of distinct positions in best_feats.
# Order matches the former hand-unrolled loops: all pairs starting with
# best_feats[0], then all pairs starting with best_feats[1], and so on.
# Positions (not values) are compared, and the list length is no longer
# hard-coded to 8.
pairs = [
    (left, right)
    for left_pos, left in enumerate(best_feats)
    for right_pos, right in enumerate(best_feats)
    if left_pos != right_pos
]

In [11]:
# Column-wise sums for every pair; the pair list is reused unchanged.
# NOTE(review): pairs holds both (a, b) and (b, a), so each sum is stored twice
# under two names.
sum_pairs = pairs

for first_feat, second_feat in sum_pairs:
    summed = df_all[first_feat].add(df_all[second_feat])
    df_all['add_{0}_{1}'.format(first_feat, second_feat)] = summed


In [13]:
# Column-wise differences (first minus second) for every ordered pair.
minus_pairs = pairs

for minuend, subtrahend in minus_pairs:
    difference = df_all[minuend].sub(df_all[subtrahend])
    df_all['min_{0}_{1}'.format(minuend, subtrahend)] = difference


faktoryzacja (binaryzacja) wartości na podstawie średniej/mediany w kolumnie na zasadzie: x <= m -> 0, x > m -> 1

In [14]:
# Binary flags per selected feature: 1 when the value is strictly above the
# column median / mean, otherwise 0 (missing values compare as False -> 0).
# The comparison is done on the whole column instead of a per-element lambda.
for feat in best_feats:
    median_ = df_all[feat].median()
    mean_ = df_all[feat].mean()
    df_all[feat + "_med_cut"] = (df_all[feat] > median_).astype("int64")
    df_all[feat + "_mean_cut"] = (df_all[feat] > mean_).astype("int64")

obcięcie wierszy z wartościami odstającymi

In [15]:
# Drop outlier rows: for each selected feature in turn, keep only rows strictly
# between the 1st and 99th percentile. The percentiles are computed on the rows
# that survived the previous features, so the trimming is sequential.
# Rows exactly equal to a percentile value are dropped too (strict comparisons),
# as are rows with a missing value in the feature.
keep = np.ones(len(df_all), dtype=bool)
for feat_name in best_feats:
    column = df_all[feat_name]
    min_value = column[keep].quantile(0.01)
    max_value = column[keep].quantile(0.99)
    keep &= ((column > min_value) & (column < max_value)).values

# Unlabelled (test) rows are never removed: they still have to be scored, and
# deleting them would shrink the set of rows available for prediction.
# The labelled rows that remain are exactly those the plain sequential filter
# would have kept.
is_unlabelled = df_all["target"].isnull().values
df_all = df_all[keep | is_unlabelled]

In [None]:
# CatBoost on the engineered feature set (100 trees, depth 7, silent training).
# The seed is pinned explicitly so the run does not depend on the library's
# default seed.
model = ctb.CatBoostClassifier(
    max_depth=7,
    n_estimators=100,
    random_seed=0,
    verbose=0,
)
make_experiment(model, threshold=0.1, show_cm=True)