In [2]:
import pandas as pd
from tqdm.autonotebook import tqdm
import os
tqdm.pandas()



In [3]:
# Directory holding the three CSV inputs; all frames are read from here in one pass.
train_dir = './train-data'
train_x, train_y, test_x = (
    pd.read_csv(os.path.join(train_dir, csv_name))
    for csv_name in ('train_x.csv', 'train_y.csv', 'test_x.csv')
)

In [3]:
# Column names handed to LightGBM as `categorical_feature` when fitting.
categorical_features = ['gender', 'city', 'prodName', 'color', 'carrier']

In [4]:
# Impute missing values in both frames with the *training* column means.
# The means are computed once, before train_x is modified, and reused for test_x:
# this avoids a second full pass over train_x and guarantees both frames are filled
# from exactly the same statistics. numeric_only=True keeps the call working on
# pandas >= 2.0, where mean() raises on non-numeric columns instead of skipping them.
train_means = train_x.mean(numeric_only=True)
train_x.fillna(train_means, inplace=True)
test_x.fillna(train_means, inplace=True)

In [5]:
# train_x.fillna(train_x.min(), inplace=True)
# test_x.fillna(train_x.min(), inplace=True)

In [26]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import gc

def cv(clf, x, y, params=None, splits=3, fit_params=None, random_state=None):
    """Run shuffled, stratified k-fold cross-validation and return the raw scores.

    Args:
        clf: Estimator to evaluate; passed straight to ``cross_validate``.
        x: Feature matrix.
        y: Target labels (used for stratification and scoring).
        params: Accepted for backward compatibility only; it is not used here.
        splits: Number of stratified folds.
        fit_params: Optional dict of keyword arguments forwarded to ``clf.fit``
            for every fold. ``None`` is treated as an empty dict.
        random_state: Optional seed for the fold shuffling. The default ``None``
            keeps the previous behaviour (a different split on every call);
            pass an int to make the folds reproducible.

    Returns:
        The dict produced by ``sklearn.model_selection.cross_validate`` with
        accuracy as the only scorer and train scores included.
    """
    kfold = sklearn.model_selection.StratifiedKFold(
        splits, shuffle=True, random_state=random_state
    )
    # NOTE(review): newer scikit-learn releases rename cross_validate's
    # ``fit_params`` argument to ``params`` -- confirm against the installed version.
    return sklearn.model_selection.cross_validate(
        clf,
        x,
        y,
        cv=kfold,
        scoring={'accuracy': 'accuracy'},
        return_train_score=True,
        verbose=5,
        fit_params={} if fit_params is None else fit_params,
    )


In [8]:

# Keyword arguments unpacked into lgb.LGBMClassifier(**params).
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass,multi_error',
    'metric': 'multi_logloss',
    'n_jobs': -1,

    # "better" preset: larger and slower model
    'n_estimators': 1500,
    'num_leaves': 120,
    'subsample_for_bin': 90000,
    'max_bin': 1200,

    # "fast" preset (swap with the block above for quick experiments):
    # 'n_estimators': 100,
    # 'num_leaves': 36,
    # 'subsample_for_bin': 20000,
    # 'max_bin': 512,

    # settings aimed at limiting overfitting
    'feature_fraction': 0.4,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'reg_alpha': 2,
    'reg_lambda': 8,

    'max_depth': -1,
    'min_child_samples': 100,
    'learning_rate': 0.05,
}
def run_cross_validation(clf, x, y, fit_params={}):
    """Cross-validate ``clf`` on (x, y) with 3 folds and print the results.

    For every entry of the score dict returned by ``cv`` the raw per-fold values
    are printed, followed by their mean. ``fit_params`` is forwarded to ``cv``.
    """
    scores = cv(clf, x, y, params=params, splits=3, fit_params=fit_params)
    for name in scores:
        values = scores[name]
        print('{!s}: {!s}'.format(name, values))
        print('Average {!s}: {:f}'.format(name, values.mean()))

In [8]:
# Release unreferenced memory before the long cross-validation run.
gc.collect()
clf = lgb.LGBMClassifier(**params)
run_cross_validation(
    clf,
    train_x,
    train_y.values.ravel(),
    fit_params={'categorical_feature': categorical_features},
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................


New categorical_feature is ['carrier', 'city', 'color', 'gender', 'prodName']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[CV] .................... , accuracy=0.5425439663522891, total=65.5min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 80.0min remaining:    0.0s


[CV]  ................................................................
[CV] .................... , accuracy=0.5429089552238806, total=65.4min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 159.7min remaining:    0.0s


[CV]  ................................................................
[CV] .................... , accuracy=0.5430963329796015, total=66.5min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 240.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 240.5min finished


fit_time: [3498.29871869 3498.61290765 3558.13864565]
Average fit_time: 3518.350091
score_time: [432.49204183 428.03095031 429.00038242]
Average score_time: 429.841125
test_accuracy: [0.54254397 0.54290896 0.54309633]
Average test_accuracy: 0.542850
train_accuracy: [0.69804604 0.69818582 0.69792411]
Average train_accuracy: 0.698052


In [9]:
def train(x, y, params={}):
    """Fit a fresh ``LGBMClassifier`` on (x, y) and return the fitted estimator.

    ``params`` is unpacked into the classifier constructor; the module-level
    ``categorical_features`` list is passed to ``fit`` as ``categorical_feature``.
    """
    return lgb.LGBMClassifier(**params).fit(
        x, y, categorical_feature=categorical_features
    )


In [10]:
# load model
# model = joblib.load('lgb.pkl')

In [11]:
# Fit the final model on the full training set with the shared hyperparameters.
model = train(x=train_x, y=train_y.values.ravel(), params=params)

In [12]:
# Accuracy on the data the model was fitted on: `acc` is the number of matching
# predictions, and the cell displays that count divided by the number of rows.
pred_train_y = model.predict(train_x)
acc = (train_y.values.ravel() == pred_train_y).sum()
acc / len(pred_train_y)

0.6622721393034826

In [13]:
# Predict labels for the test rows; the 'uid' column is removed before predicting.
pred_y = model.predict(test_x.drop(columns=['uid']))

In [14]:
# Two-column submission frame: test 'uid' values exposed as 'id', predictions as 'label'.
result = pd.DataFrame({'id': test_x['uid'], 'label': pred_y})

In [15]:
# Write the predictions to submission.csv without the DataFrame index column.
result.to_csv('submission.csv', index=False)

In [4]:
# Print each training column next to the importance stored at the same position.
for idx, column in enumerate(train_x.columns):
    print(column, model.feature_importances_[idx])

gender 5511
city 193573
prodName 109586
ramCap 969
ramLeft 4638
romCap 2589
romLeft 7949
color 97513
fontSize 5436
carrier 2204
os 2432
4g 1308
3g 35
2g 11
wifi 1098
bootTimes 8957
aTimes 5172
bTimes 3096
cTimes 7029
dTimes 9868
eTimes 9663
fTimes 1009
fSum 9651
app_w2v_商务 2516
app_w2v_社交通讯 492
app_w2v_实用工具 1
app_w2v_教育 1427
app_w2v_运动健康 580
app_w2v_便捷生活 3741
app_w2v_金融理财 1950
app_w2v_购物比价 3027
app_w2v_儿童 0
app_w2v_新闻阅读 1580
app_w2v_汽车 2506
app_w2v_unknown 1
app_w2v_出行导航 43
app_w2v_影音娱乐 4421
app_w2v_美食 3689
app_w2v_旅游住宿 2780
app_w2v_拍摄美化 5704
app_w2v_角色扮演 3530
app_w2v_动作射击 1340
app_w2v_休闲益智 94
app_w2v_经营策略 644
app_w2v_棋牌桌游 1400
app_w2v_主题个性 0
app_w2v_体育竞速 3759
app_w2v_学习办公 1375
app_w2v_棋牌天地 106
app_w2v_网络游戏 1365
app_w2v_休闲游戏 0
app_w2v_图书阅读 562
app_w2v_益智棋牌 2736
app_w2v_表盘个性 1001
app_w2v_动作冒险 3790
app_tfidf_商务 8164
app_tfidf_社交通讯 2373
app_tfidf_实用工具 9
app_tfidf_教育 5386
app_tfidf_运动健康 1480
app_tfidf_便捷生活 9889
app_tfidf_金融理财 3640
app_tfidf_购物比价 8466
app_tfidf_儿童 0
app_tfidf_新闻阅读 4936
app_

In [30]:
# Share of the total feature importance held by the features at positions 23..87.
# Position 23 is the first app_w2v_* column in the listing printed above.
# NOTE(review): the upper bound 88 presumably marks the end of the app_* feature
# columns -- confirm against train_x.columns.
sum(model.feature_importances_[23:88]) / sum(model.feature_importances_)

0.2148702147525677

In [17]:
import os

try:
    # Standalone joblib; sklearn.externals.joblib was deprecated in scikit-learn 0.21
    # and removed in 0.23.
    import joblib
except ImportError:
    # Fallback for legacy environments that only ship the copy bundled in scikit-learn.
    from sklearn.externals import joblib

# save model
os.makedirs('model', exist_ok=True)  # make sure the target directory exists first
joblib.dump(model, 'model/lgb.pkl')


['lgb.pkl']

In [1]:
try:
    # Standalone joblib; sklearn.externals.joblib was deprecated in scikit-learn 0.21
    # and removed in 0.23.
    import joblib
except ImportError:
    # Fallback for legacy environments that only ship the copy bundled in scikit-learn.
    from sklearn.externals import joblib

# load model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load('model/lgb.pkl')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [10]:
# Echo the loaded estimator so that its hyperparameters are shown as the cell output.
model

LGBMClassifier(bagging_fraction=0.7, bagging_freq=10, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
        importance_type='split', learning_rate=0.05, max_bin=1200,
        max_depth=-1, metric='multi_logloss', min_child_samples=100,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=1500,
        n_jobs=-1, num_leaves=120, objective='multiclass,multi_error',
        random_state=None, reg_alpha=2, reg_lambda=8, silent=True,
        subsample=1.0, subsample_for_bin=90000, subsample_freq=0)

In [12]:
# Class probabilities for the test rows; the 'uid' column is removed before predicting.
proba = model.predict_proba(test_x.drop(columns=['uid']))