In [1]:
import pandas as pd
import numpy as np
from gc import collect
import time
import lightgbm as lgb

from sklearn import preprocessing
from collections import defaultdict
from sklearn.model_selection import cross_validate, StratifiedKFold, KFold
%load_ext autoreload
%autoreload 2

In [2]:
start = time.time()

PATH = './input/'
nrows = None
SEED = 999
RND = 111

In [3]:
df = pd.read_csv('{PATH}train.csv'.format(PATH = PATH), nrows = nrows, index_col='item_id')
print(df.shape)
df['subtarget'] = df['deal_probability'].astype(bool)

(1503424, 17)


In [4]:
for col in df.columns:
    if (df[col].dtypes=='object'):
        df[col].fillna('-999', inplace=True)
    elif (df[col].dtype.name=='category'):
        df[col] = df[col].astype(str)
        df[col].fillna(-999, inplace=True)
    else:
        df[col].fillna(-999, inplace=True)

In [5]:
LE =defaultdict(preprocessing.LabelEncoder)
cat_col = []
cat_name = []
for n, col in enumerate(df.columns):
    if df[col].dtype.name=='object':
        cat_col.append(n)
        cat_name.append(col)
df[cat_name] = df[cat_name].apply(lambda x: LE[x.name].fit_transform(x))
print(cat_col)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14]


In [6]:
for i in df.columns:
    if df[i].dtypes=='float64':
        df[i] = df[i].astype('float16')
    elif df[i].dtypes=='int64':
        df[i] = df[i].astype('int16')

In [7]:
def rmse(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power((y - y0), 2)))

In [8]:
model = lgb.LGBMRegressor(num_leaves=300, random_state=RND)
cv = StratifiedKFold(n_splits=10, random_state=RND)

In [9]:
y = df['deal_probability'].values
y_sub = df['subtarget'].values
df.drop(['subtarget', 'deal_probability'], axis=1, inplace=True)

In [10]:
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'max_depth': 15,
    'num_leaves': 100,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.016,
    'verbose': 0
} 

In [51]:
scor_train = []
scor_test = []
for train_index, test_index in cv.split(df, y_sub):
    X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    lgtrain = lgb.Dataset(X_train, y_train,
                     categorical_feature=cat_col)
    lgvalid = lgb.Dataset(X_test, y_test,
                     categorical_feature=cat_col)
    
    model = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=100,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=False)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    scor_train.append(rmse(y_train_pred, y_train))
    scor_test.append(rmse(y_test_pred, y_test))
print("train_score: %0.4f"%np.mean(scor_train))
print("test_score: %0.4f"%np.mean(scor_test))



train_score: 0.2315
test_score: 0.2325


In [11]:
lgbm_params_sub =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'rmse',
    # 'max_depth': 15,
    'num_leaves': 100,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.016,
    'verbose': 0
} 

In [16]:
scor_train = []
scor_test = []
model_class = lgb.LGBMClassifier(num_leaves=100)
for train_index, test_index in cv.split(df, y_sub):
    X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    y_sub_train, y_sub_test = y_sub[train_index], y_sub[test_index]
    
    
    
    lgtrain_sub = lgb.Dataset(X_train, y_sub_train,
                     categorical_feature=cat_col, free_raw_data=False)
    lgvalid_sub = lgb.Dataset(X_test, y_sub_test,
                     categorical_feature=cat_col, free_raw_data=False)
    
    
    
    model_class = lgb.train(
        lgbm_params_sub,
        lgtrain_sub,
        num_boost_round=100,
        valid_sets=[lgtrain_sub, lgvalid_sub],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=False)
    
    y_zero_train = model_class.predict(X_train)>=.05
    y_zero_test = model_class.predict(X_test)>=.05
    
    lgtrain = lgb.Dataset(X_train[y_zero_train], y_train[y_zero_train],
                     categorical_feature=cat_col, free_raw_data=False)
    lgvalid = lgb.Dataset(X_test[y_zero_test], y_test[y_zero_test],
                     categorical_feature=cat_col, free_raw_data=False)
    
    model = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=100,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=False)
    
    y_train_pred = model.predict(X_train)
    y_train_pred[y_zero_train==0] = 0
    
    y_test_pred = model.predict(X_test)
    y_test_pred[y_zero_test==0] = 0
    scor_train.append(rmse(y_train_pred, y_train))
    scor_test.append(rmse(y_test_pred, y_test))
print("train_score: %0.4f"%np.mean(scor_train))
print("test_score: %0.4f"%np.mean(scor_test))



train_score: 0.2315
test_score: 0.2325


In [13]:
scor_train = []
scor_test = []
model_class = lgb.LGBMClassifier(num_leaves=100)
for train_index, test_index in cv.split(df, y_sub):
    X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    y_sub_train, y_sub_test = y_sub[train_index], y_sub[test_index]
    
    
    
    lgtrain_sub = lgb.Dataset(X_train, y_sub_train, weight=5-y_train,
                     categorical_feature=cat_col, free_raw_data=False)
    lgvalid_sub = lgb.Dataset(X_test, y_sub_test, weight=5-y_test,
                     categorical_feature=cat_col, free_raw_data=False)
    
    
    
    model_class = lgb.train(
        lgbm_params_sub,
        lgtrain_sub,
        num_boost_round=100,
        valid_sets=[lgtrain_sub, lgvalid_sub],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=False)
    
    
    y_train_pred = model_class.predict(X_train)
    y_test_pred = model_class.predict(X_test)
    scor_train.append(rmse(y_train_pred, y_train))
    scor_test.append(rmse(y_test_pred, y_test))
print("train_score: %0.4f"%np.mean(scor_train))
print("test_score: %0.4f"%np.mean(scor_test))



train_score: 0.3258
test_score: 0.3265


In [None]:
lgtrain = lgb.Dataset(df, y,
                     categorical_feature=cat_col, free_raw_data=False)
    
lgtrain_sub = lgb.Dataset(df, y_sub,
                     categorical_feature=cat_col, free_raw_data=False)
    
    
    
model_class = lgb.train(
        lgbm_params_sub,
        lgtrain_sub,
        num_boost_round=100,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=False)

model = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=100,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=False)

In [None]:
    
y_zero_test = model_class.predict(X_test)>=.2
    
    
    
    y_train_pred = model.predict(X_train)
    y_train_pred[y_sub_train==0] = 0
    
    y_test_pred = model.predict(X_test)
    y_test_pred[y_sub_test==0] = 0
    scor_train.append(rmse(y_train_pred, y_train))
    scor_test.append(rmse(y_test_pred, y_test))
print("train_score: %0.4f"%np.mean(scor_train))
print("test_score: %0.4f"%np.mean(scor_test))

In [None]:
scor_train = []
scor_test = []
model_class = lgb.LGBMClassifier(num_leaves=100, )
for train_index, test_index in cv.split(df, y_sub):
    X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    y_sub_train, y_sub_test = y_sub[train_index], y_sub[test_index]
    model_class.fit(X_train, y_sub_train)
    y_zero_train = model_class.predict_proba(X_train)[:,1]>=.2
    y_zero_test = model_class.predict_proba(X_test)[:,1]>=.2
    
    model.fit(X_train[y_sub_train!=0], y_train[y_sub_train!=0])
    y_train_pred = model.predict(X_train)
    y_train_pred[y_sub_train==0] = 0
    
    y_test_pred = model.predict(X_test)
    y_test_pred[y_sub_test==0] = 0
    scor_train.append(rmse(y_train_pred, y_train))
    scor_test.append(rmse(y_test_pred, y_test))
print("train_score: %0.2f"%np.mean(scor_train))
print("test_score: %0.2f"%np.mean(scor_test)), 

In [11]:
model = ct.CatBoostRegressor(iterations=200)
cv = KFold(n_splits=10, random_state=RND)
cross = cross_validate(model, scoring='neg_mean_squared_error', cv = cv)

ValueError: DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields user_id, region, city, parent_category_name, category_name, param_1, param_2, param_3, title, description, activation_date, user_type, image