In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from catboost import CatBoostClassifier, CatBoostRegressor
import lightgbm as lgb

import matplotlib.pyplot as plt

from tqdm import tqdm
import random
import time

import numpy_indexed as npi
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,roc_auc_score, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

from rgf.sklearn import RGFClassifier
import xgboost

import json

# One-Hot Encoding Training and Test Data

In [2]:
Xs = pd.read_csv("train_me_features.csv").set_index('musteri')
ys = Xs["target"] .values
Xs.drop(["target"], axis=1, inplace=True)


categorical_features = ["egitim", "is_durumu", "meslek_grubu", "max_cat_last2", "max_cat_last3", "max_cat_last6"]
Xs_ohot = Xs[categorical_features]

for col in categorical_features:
    s = Xs_ohot[col].unique()

    # Create a One Hot Dataframe with 1 row for each unique value
    one_hot_df = pd.get_dummies(s, prefix='%s_' % col)
    one_hot_df[col] = s


    # Merge the one hot columns
    Xs_ohot = Xs_ohot.merge(one_hot_df, on=[col], how="left")
    Xs_ohot = Xs_ohot.drop([col], axis=1)
    Xs = Xs.drop([col], axis=1)
    
Xs = Xs.values

In [3]:
Xs2 = pd.read_csv("test_me_features.csv").set_index('musteri')

Xs2_ohot = Xs2[categorical_features]

for col in categorical_features:
    s = Xs2_ohot[col].unique()

    # Create a One Hot Dataframe with 1 row for each unique value
    one_hot_df = pd.get_dummies(s, prefix='%s_' % col)
    one_hot_df[col] = s


    # Merge the one hot columns
    Xs2_ohot = Xs2_ohot.merge(one_hot_df, on=[col], how="left")
    Xs2_ohot = Xs2_ohot.drop([col], axis=1)
    Xs2 = Xs2.drop([col], axis=1)
    
Xs_test = Xs2.values

# Setting a Stack-Ensemble of 3 Estimator

In [4]:
lgbm_model_params = {'bagging_fraction': 0.96,
 'boosting': 'goss',
 'feature_fraction': 0.61,
 'lambda_l1': 4.69368271984883,
 'lambda_l2': 3.6071327694899726,
 'learning_rate': 0.10343568519698648,
 'max_bin': 175,
 'max_depth': 3,
 'metric': 'auc',
 'min_data_in_bin': 238,
 'min_data_in_leaf': 2,
 'min_gain_to_split': 4.13,
 'num_leaves': 1372,
 'objective': 'binary',
 'other_rate': 0.024192209625214116,
 'scale_pos_weight': 3.239010106166049,
 'top_rate': 0.42298692298543883}
lgbm_model = lgb.LGBMClassifier(silent=True)
lgbm_model.set_params(**lgbm_model_params)


xgb_model = xgboost.XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.6, gamma=1.5, gpu_id=-1,
importance_type='gain', interaction_constraints=None,
learning_rate=0.02, max_delta_step=0, max_depth=4,
min_child_weight=10, monotone_constraints=None,
n_estimators=600, n_jobs=6, nthread=16, num_parallel_tree=1,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, silent=True, subsample=0.8,
tree_method=None, validate_parameters=False, verbosity=None)

rgf_model = RGFClassifier(verbose=False)

estimators = [
    ('lgbm', lgbm_model),
    ('xgb', xgb_model),
    ('rgf', rgf_model),
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(class_weight='balanced')
)




# Predicting on CV

In [5]:
cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

preds_for_subm = []

scores = []

for train_ix, test_ix in cv.split(Xs, y=ys):
    # get data
    train_X, test_X = Xs[train_ix], Xs[test_ix]
    train_y, test_y = ys[train_ix], ys[test_ix]
    # fit model
    clf.fit(train_X, train_y)
    # evaluate model
    yhat = clf.predict_proba(test_X)[:,1]
    # store score
    auc = roc_auc_score(test_y, yhat)
    scores.append(auc)
    
    yhat_subm = clf.predict_proba(Xs_test)[:,1]
    preds_for_subm.append(yhat_subm)
    
    print('> ', auc)
    
# summarize model performance
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean AUC: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

In [6]:
preds_for_subm_arr = np.array(preds_for_subm)
preds_for_subm_arr

array([[0.21317242, 0.3653087 , 0.45424363, ..., 0.20644555, 0.34896737,
        0.23897941],
       [0.21776334, 0.4170073 , 0.43022083, ..., 0.21069686, 0.38943236,
        0.26637529],
       [0.21178755, 0.46701847, 0.42087038, ..., 0.20943017, 0.38458902,
        0.24973319],
       [0.2091083 , 0.47387196, 0.47518719, ..., 0.20631223, 0.34029239,
        0.24228376],
       [0.2103869 , 0.41383997, 0.45782484, ..., 0.20946049, 0.35877139,
        0.24823477]])

In [7]:
preds_for_subm_arr = preds_for_subm_arr.mean(axis=0)
preds_for_subm_arr

array([0.2124437 , 0.42740928, 0.44766938, ..., 0.20846906, 0.36441051,
       0.24912129])

In [8]:
preds_for_subm_arr.mean()

0.4099407321063016

# Creating the Submission File

In [9]:
data_frame = pd.read_csv("test.csv")
sub_df = data_frame[["musteri"]]
sub_df["target"] = preds_for_subm_arr

sub_df.to_csv("submission_x1.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
