In [1]:
import pandas as pd
from tqdm.autonotebook import tqdm
import os
import numpy as np
tqdm.pandas()



In [2]:
%%time
train_dir = './train-data'
# The `[()]` indexing unwraps a 0-d array, which suggests each .npy file holds
# one pickled Python object (presumably a scipy sparse TF-IDF matrix -- TODO confirm).
# NumPy >= 1.16.3 refuses to load pickled object arrays unless allow_pickle=True
# is passed explicitly; older releases accept the keyword as well.
# SECURITY: unpickling executes arbitrary code -- only load files you produced yourself.
train_tfidf = np.load(os.path.join(train_dir, 'train_tfidf.npy'), allow_pickle=True)[()]
test_tfidf = np.load(os.path.join(train_dir, 'test_tfidf.npy'), allow_pickle=True)[()]
# Class will start with 0
train_y = pd.read_csv(os.path.join(train_dir, 'train_y.csv'))
train_x = pd.read_csv(os.path.join(train_dir, 'train_x.csv'))
# Parse test_x.csv once; keep an independent copy of the ids so that later
# in-place edits to (or deletion of) test_x cannot affect the submission ids.
test_x = pd.read_csv(os.path.join(train_dir, 'test_x.csv'))
test_uid = test_x['uid'].copy()

CPU times: user 51.5 s, sys: 12 s, total: 1min 3s
Wall time: 1min 5s


In [3]:
# Impute missing values with the per-column means of the TRAINING data.
# The means are computed once, before either frame is modified, so train and
# test are guaranteed to be filled with exactly the same statistics.
# NOTE(review): columns later declared categorical (e.g. 'gender', 'city') are
# mean-imputed as well -- confirm that is intended.
train_means = train_x.mean()
train_x.fillna(train_means, inplace=True)
test_x.fillna(train_means, inplace=True)

In [4]:
# Wrap each TF-IDF matrix in a sparse DataFrame (missing entries read as 0) and
# downcast to float32; the raw matrix is dropped right away to release memory.
# NOTE: pd.SparseDataFrame was deprecated in pandas 0.25 and removed in 1.0,
# so this notebook requires an older pandas.
train_df = pd.SparseDataFrame(train_tfidf, default_fill_value=0)
train_df = train_df.astype('float32')
del train_tfidf

test_df = pd.SparseDataFrame(test_tfidf, default_fill_value=0)
test_df = test_df.astype('float32')
del test_tfidf

In [5]:
# Append every column of train_x to the sparse training frame, then free train_x.
for name in list(train_x.columns):
    train_df[name] = train_x[name]
del train_x

# Same for the test frame, except that the 'uid' identifier column is left out.
for name in test_x.columns.drop('uid'):
    test_df[name] = test_x[name]
del test_x

In [6]:
# Names of the columns that are handed to LightGBM as categorical features.
categorical_features = [
    'gender',
    'city',
    'prodName',
    'color',
    'carrier',
]

In [7]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import gc

def cv(clf, x, y, params=None, splits=3, fit_params=None):
    """Cross-validate ``clf`` on ``(x, y)`` with shuffled stratified folds.

    Args:
        clf: Estimator passed to ``sklearn.model_selection.cross_validate``.
        x: Feature matrix.
        y: Target labels (used for stratification and scoring).
        params: Accepted for backward compatibility but not used by this
            function; configure the estimator before passing it in.
        splits: Number of stratified folds.
        fit_params: Extra keyword arguments forwarded to the estimator's
            ``fit``. ``None`` means no extra arguments.

    Returns:
        The dict returned by ``cross_validate`` (accuracy scorer, train scores
        included).

    Note:
        The folds are shuffled without a fixed random state, so results differ
        from run to run.
    """
    # Avoid mutable default arguments: build a fresh dict per call instead.
    fit_params = {} if fit_params is None else fit_params
    kfold = sklearn.model_selection.StratifiedKFold(splits, shuffle=True)
    return sklearn.model_selection.cross_validate(
        clf, x, y,
        cv=kfold,
        scoring={'accuracy': 'accuracy'},
        return_train_score=True,
        verbose=5,
        fit_params=fit_params,
    )

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [8]:

# Hyper-parameters forwarded to lgb.LGBMClassifier.
# NOTE(review): 'multi_error' looks like a metric name rather than part of an
# objective -- confirm how LightGBM interprets the combined objective string.
params = dict(
    boosting_type='gbdt',
    objective='multiclass,multi_error',
    metric='multi_logloss',
    n_jobs=-1,

    # better
    n_estimators=2000,
    num_leaves=120,
    subsample_for_bin=100000,
    max_bin=1500,

    # Bagging
    feature_fraction=0.4,
    bagging_fraction=0.7,
    bagging_freq=20,
    reg_alpha=4,
    reg_lambda=8,

    max_depth=-1,
    min_child_samples=100,
    learning_rate=0.05,

    verbose=5,
)
def run_cross_validation(clf, x, y, fit_params=None):
    """Run 3-fold cross-validation via ``cv`` and print each score array with its mean.

    Args:
        clf: Estimator to evaluate.
        x: Feature matrix.
        y: Target labels.
        fit_params: Extra keyword arguments for the estimator's ``fit``;
            ``None`` means no extra arguments.

    The module-level ``params`` dict is passed through to ``cv``.
    """
    # Avoid a mutable default argument: build a fresh dict per call instead.
    fit_params = {} if fit_params is None else fit_params
    cv_result = cv(clf, x, y, params=params, splits=3, fit_params=fit_params)
    for scorer, score in cv_result.items():
        print('%s: %s' % (scorer, score))
        print('Average %s: %f' % (scorer, score.mean()))

In [9]:
# Run a full garbage-collection pass; the value it returns is shown as the cell output.
gc.collect()


40

In [10]:
# Positional index of each categorical column inside train_df.
# The column list is materialized once instead of being rebuilt for every feature.
_column_names = list(train_df.columns)
categorical_indices = [_column_names.index(cat) for cat in categorical_features]

In [11]:
# gc.collect()
# clf = lgb.LGBMClassifier(**params)
# run_cross_validation(clf, train_df.to_coo(), train_y.values.ravel(), {'categorical_feature': categorical_indices})

In [12]:
def train(x, y, params=None):
    """Fit and return a ``lgb.LGBMClassifier`` on ``(x, y)``.

    Args:
        x: Feature matrix passed to ``fit``.
        y: Target labels passed to ``fit``.
        params: Keyword arguments for the ``LGBMClassifier`` constructor;
            ``None`` (or an empty dict) uses the library defaults.

    Returns:
        The fitted classifier.

    The categorical columns are taken from the module-level
    ``categorical_indices`` list, which must be defined before calling.
    """
    # Avoid a mutable default argument: fall back to a fresh empty dict.
    clf = lgb.LGBMClassifier(**(params or {}))
    clf.fit(x, y, categorical_feature=categorical_indices)
    return clf


In [13]:
# load model
# from sklearn.externals import joblib

# model = joblib.load('lgb.pkl')

In [14]:
%%time
# Fit the final model on the full training set (sparse frame -> COO matrix,
# label frame -> flat array) using the hyper-parameters in `params`.
model = train(
    x=train_df.to_coo(),
    y=train_y.values.ravel(),
    params=params,
)

New categorical_feature is [9401, 9402, 9403, 9408, 9410]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


CPU times: user 2d 12h 2min 38s, sys: 48min 54s, total: 2d 12h 51min 33s
Wall time: 11h 15min 38s


In [16]:
from sklearn.externals import joblib
# save model
# Create the target directory first: without it joblib.dump raises and the
# result of the (very long) training run above would be lost.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/merged.pkl')

['model/merged.pkl']

In [19]:
# pred_train_y  = model.predict(train_df.to_coo())
# acc = (pred_train_y == train_y.values.ravel()).sum()
# acc / pred_train_y.shape[0]

In [15]:
%%time
# Predict a label for every test row; the sparse test frame is converted with to_coo() first.
pred_y = model.predict(test_df.to_coo())



CPU times: user 1h 57min 22s, sys: 1min, total: 1h 58min 22s
Wall time: 12min 45s


In [16]:
# Submission table: the test ids under the column name 'id', plus the predicted label.
result = test_uid.to_frame(name='id')
result['label'] = pred_y

In [17]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
result.to_csv('submission.csv', index=False)

In [18]:
# Print "<column> <importance>" for every feature.
# feature_importances_ is read once up front rather than on every iteration,
# since each attribute access may recompute the whole importance array.
importances = model.feature_importances_
for name, importance in zip(train_df.columns, importances):
    print(name, importance)

0 0
1 13
2 0
3 0
4 0
5 0
6 0
7 1
8 10
9 0
10 11
11 0
12 7
13 0
14 2
15 13
16 0
17 0
18 2
19 0
20 2
21 0
22 0
23 3
24 1
25 0
26 0
27 41
28 0
29 0
30 89
31 1
32 37
33 0
34 0
35 0
36 467
37 151
38 14
39 14
40 0
41 0
42 0
43 1
44 0
45 0
46 0
47 91
48 352
49 0
50 0
51 3
52 0
53 2
54 0
55 1
56 0
57 67
58 297
59 10
60 6
61 15
62 0
63 1
64 4
65 138
66 15
67 0
68 6
69 75
70 0
71 0
72 0
73 27
74 44
75 176
76 36
77 3
78 0
79 7
80 83
81 315
82 214
83 10
84 66
85 3
86 0
87 0
88 0
89 20
90 0
91 22
92 0
93 0
94 0
95 0
96 69
97 2
98 0
99 22
100 1
101 46
102 0
103 156
104 14
105 6
106 6
107 0
108 0
109 0
110 152
111 0
112 0
113 2
114 4
115 28
116 31
117 0
118 0
119 0
120 0
121 0
122 11
123 100
124 0
125 19
126 31
127 38
128 9
129 31
130 171
131 85
132 0
133 137
134 4
135 0
136 11
137 21
138 46
139 67
140 0
141 27
142 39
143 0
144 22
145 18
146 0
147 2
148 0
149 0
150 0
151 0
152 33
153 16
154 88
155 0
156 3
157 12
158 249
159 1
160 0
161 91
162 34
163 0
164 183
165 1
166 155
167 20
168 65
169 0
170 0
1

1276 14
1277 615
1278 0
1279 21
1280 4
1281 3
1282 1408
1283 0
1284 1
1285 1
1286 30
1287 58
1288 0
1289 5
1290 220
1291 0
1292 5
1293 18
1294 2
1295 21
1296 60
1297 21
1298 171
1299 10
1300 65
1301 0
1302 77
1303 44
1304 10
1305 0
1306 0
1307 1
1308 0
1309 277
1310 0
1311 0
1312 0
1313 38
1314 1
1315 5
1316 0
1317 29
1318 0
1319 0
1320 0
1321 0
1322 0
1323 0
1324 0
1325 2
1326 0
1327 0
1328 282
1329 505
1330 1
1331 152
1332 220
1333 2
1334 11
1335 9
1336 3
1337 0
1338 0
1339 0
1340 0
1341 0
1342 80
1343 2
1344 0
1345 20
1346 2
1347 7
1348 11
1349 12
1350 0
1351 0
1352 0
1353 40
1354 0
1355 16
1356 50
1357 32
1358 0
1359 180
1360 0
1361 25
1362 0
1363 0
1364 0
1365 11
1366 0
1367 0
1368 3
1369 38
1370 8
1371 72
1372 85
1373 0
1374 0
1375 5
1376 3
1377 13
1378 0
1379 22
1380 17
1381 16
1382 6
1383 0
1384 0
1385 0
1386 77
1387 0
1388 7
1389 0
1390 84
1391 2594
1392 0
1393 96
1394 45
1395 56
1396 0
1397 72
1398 25
1399 0
1400 153
1401 0
1402 2
1403 1
1404 1972
1405 5
1406 31
1407 3
1408 3

2383 34
2384 0
2385 0
2386 101
2387 3
2388 461
2389 0
2390 75
2391 9
2392 19
2393 0
2394 0
2395 57
2396 2
2397 375
2398 3
2399 43
2400 0
2401 33
2402 7
2403 0
2404 31
2405 0
2406 6
2407 13
2408 0
2409 0
2410 0
2411 0
2412 4
2413 0
2414 0
2415 0
2416 2
2417 45
2418 30
2419 23
2420 1
2421 89
2422 0
2423 18
2424 0
2425 1
2426 0
2427 64
2428 4
2429 14
2430 1
2431 17
2432 0
2433 0
2434 0
2435 1161
2436 0
2437 0
2438 0
2439 0
2440 0
2441 4
2442 22
2443 0
2444 0
2445 0
2446 8
2447 7
2448 1
2449 64
2450 0
2451 21
2452 1
2453 0
2454 0
2455 1
2456 6
2457 4
2458 0
2459 36
2460 95
2461 0
2462 0
2463 0
2464 0
2465 78
2466 0
2467 0
2468 0
2469 0
2470 1
2471 0
2472 0
2473 4
2474 0
2475 5
2476 13
2477 0
2478 0
2479 8
2480 0
2481 0
2482 2
2483 187
2484 84
2485 1
2486 0
2487 311
2488 0
2489 28
2490 1
2491 0
2492 15
2493 0
2494 0
2495 1
2496 95
2497 176
2498 0
2499 37
2500 0
2501 0
2502 0
2503 0
2504 0
2505 0
2506 0
2507 0
2508 0
2509 0
2510 0
2511 0
2512 0
2513 0
2514 15
2515 1
2516 0
2517 0
2518 0
2519

3485 0
3486 3
3487 2
3488 0
3489 0
3490 0
3491 0
3492 28
3493 9
3494 0
3495 0
3496 0
3497 0
3498 68
3499 0
3500 0
3501 27
3502 0
3503 0
3504 0
3505 9
3506 26
3507 147
3508 3
3509 0
3510 3
3511 21
3512 0
3513 0
3514 4
3515 21
3516 0
3517 19
3518 8
3519 0
3520 245
3521 0
3522 18
3523 5
3524 15
3525 6
3526 1
3527 0
3528 23
3529 0
3530 0
3531 0
3532 5
3533 11
3534 339
3535 0
3536 0
3537 0
3538 0
3539 0
3540 0
3541 0
3542 67
3543 36
3544 0
3545 0
3546 12
3547 133
3548 28
3549 81
3550 568
3551 42
3552 70
3553 2
3554 60
3555 8
3556 49
3557 50
3558 2
3559 227
3560 0
3561 27
3562 0
3563 1
3564 0
3565 0
3566 0
3567 0
3568 37
3569 0
3570 11
3571 0
3572 1
3573 6
3574 0
3575 10
3576 0
3577 1
3578 12
3579 0
3580 0
3581 6
3582 0
3583 14
3584 27
3585 32
3586 0
3587 55
3588 0
3589 0
3590 6
3591 0
3592 0
3593 1
3594 0
3595 0
3596 0
3597 521
3598 344
3599 0
3600 0
3601 0
3602 8
3603 13
3604 27
3605 17
3606 0
3607 0
3608 0
3609 0
3610 0
3611 6
3612 0
3613 14
3614 22
3615 2
3616 24
3617 64
3618 90
3619 48


4615 0
4616 1
4617 0
4618 109
4619 0
4620 5
4621 0
4622 0
4623 1
4624 0
4625 11
4626 0
4627 0
4628 2
4629 0
4630 0
4631 0
4632 0
4633 2
4634 0
4635 0
4636 1
4637 0
4638 0
4639 1
4640 0
4641 4
4642 1
4643 17
4644 0
4645 13
4646 9
4647 0
4648 3
4649 0
4650 0
4651 0
4652 1
4653 0
4654 0
4655 6
4656 0
4657 0
4658 0
4659 0
4660 0
4661 0
4662 0
4663 0
4664 0
4665 0
4666 2
4667 1
4668 0
4669 6
4670 3
4671 0
4672 0
4673 0
4674 0
4675 10
4676 0
4677 0
4678 14
4679 2
4680 1
4681 0
4682 0
4683 7
4684 0
4685 0
4686 1
4687 0
4688 0
4689 0
4690 0
4691 0
4692 1
4693 0
4694 0
4695 0
4696 3
4697 0
4698 8
4699 0
4700 0
4701 0
4702 0
4703 0
4704 0
4705 0
4706 1
4707 2
4708 0
4709 0
4710 1
4711 0
4712 0
4713 0
4714 22
4715 4
4716 0
4717 18
4718 0
4719 0
4720 0
4721 0
4722 2
4723 0
4724 15
4725 0
4726 6
4727 0
4728 0
4729 0
4730 0
4731 0
4732 0
4733 5
4734 0
4735 0
4736 12
4737 0
4738 10
4739 53
4740 0
4741 1
4742 1
4743 0
4744 5
4745 6
4746 0
4747 0
4748 0
4749 7
4750 0
4751 0
4752 5
4753 0
4754 0
4755 0


5729 0
5730 0
5731 17
5732 0
5733 23
5734 5
5735 0
5736 5
5737 13
5738 22
5739 9
5740 0
5741 0
5742 0
5743 76
5744 335
5745 0
5746 99
5747 16
5748 0
5749 6
5750 0
5751 3
5752 0
5753 11
5754 1
5755 5
5756 34
5757 0
5758 3
5759 12
5760 7
5761 0
5762 17
5763 0
5764 0
5765 0
5766 22
5767 534
5768 11
5769 84
5770 20
5771 0
5772 0
5773 0
5774 25
5775 1
5776 19
5777 0
5778 6
5779 21
5780 0
5781 17
5782 40
5783 188
5784 0
5785 0
5786 5
5787 0
5788 0
5789 1736
5790 0
5791 4
5792 1587
5793 30
5794 44
5795 53
5796 0
5797 18
5798 14
5799 3
5800 0
5801 39
5802 0
5803 21
5804 0
5805 5
5806 89
5807 43
5808 50
5809 0
5810 101
5811 51
5812 0
5813 33
5814 0
5815 22
5816 0
5817 294
5818 1
5819 12
5820 18
5821 124
5822 17
5823 56
5824 42
5825 2
5826 0
5827 4
5828 443
5829 162
5830 0
5831 0
5832 0
5833 0
5834 0
5835 0
5836 50
5837 21
5838 0
5839 164
5840 0
5841 0
5842 36
5843 10
5844 0
5845 58
5846 22
5847 11
5848 37
5849 2
5850 1
5851 11
5852 124
5853 319
5854 3
5855 0
5856 591
5857 21
5858 94
5859 1
5860

6820 50
6821 219
6822 27
6823 0
6824 584
6825 0
6826 9
6827 0
6828 0
6829 1
6830 3
6831 0
6832 42
6833 2555
6834 0
6835 0
6836 0
6837 0
6838 0
6839 0
6840 336
6841 6
6842 0
6843 0
6844 0
6845 13
6846 9
6847 18
6848 17
6849 0
6850 0
6851 0
6852 13
6853 0
6854 5
6855 6
6856 0
6857 0
6858 16
6859 12
6860 37
6861 40
6862 0
6863 1
6864 0
6865 2
6866 8
6867 0
6868 0
6869 15
6870 0
6871 9
6872 0
6873 2
6874 28
6875 0
6876 18
6877 0
6878 0
6879 0
6880 0
6881 13
6882 33
6883 0
6884 6
6885 0
6886 0
6887 0
6888 11
6889 0
6890 0
6891 8
6892 15
6893 0
6894 0
6895 2
6896 0
6897 28
6898 7
6899 0
6900 0
6901 5
6902 26
6903 4
6904 1
6905 65
6906 29
6907 21
6908 0
6909 6
6910 8
6911 197
6912 69
6913 0
6914 2
6915 5
6916 3
6917 15
6918 52
6919 0
6920 0
6921 3
6922 22
6923 195
6924 47
6925 0
6926 0
6927 0
6928 0
6929 0
6930 0
6931 0
6932 0
6933 0
6934 40
6935 0
6936 0
6937 0
6938 0
6939 0
6940 1
6941 0
6942 0
6943 0
6944 0
6945 0
6946 0
6947 0
6948 0
6949 0
6950 0
6951 228
6952 0
6953 0
6954 0
6955 0
6956

7944 508
7945 6
7946 31
7947 44
7948 6
7949 1
7950 0
7951 4
7952 93
7953 0
7954 2
7955 68
7956 157
7957 11
7958 26
7959 0
7960 61
7961 0
7962 0
7963 6
7964 20
7965 0
7966 2
7967 0
7968 0
7969 0
7970 16
7971 1
7972 32
7973 22
7974 0
7975 61
7976 23
7977 0
7978 3
7979 0
7980 178
7981 0
7982 66
7983 2
7984 9
7985 0
7986 4
7987 0
7988 0
7989 0
7990 112
7991 14
7992 0
7993 0
7994 0
7995 0
7996 7
7997 22
7998 0
7999 29
8000 10
8001 0
8002 11
8003 25
8004 0
8005 161
8006 1
8007 0
8008 124
8009 565
8010 46
8011 312
8012 299
8013 0
8014 4
8015 0
8016 0
8017 0
8018 0
8019 0
8020 0
8021 0
8022 27
8023 0
8024 0
8025 35
8026 3
8027 12
8028 28
8029 0
8030 385
8031 15
8032 34
8033 0
8034 1
8035 346
8036 0
8037 14
8038 479
8039 0
8040 0
8041 0
8042 0
8043 0
8044 203
8045 108
8046 3112
8047 66
8048 0
8049 2
8050 153
8051 136
8052 0
8053 3
8054 39
8055 0
8056 89
8057 0
8058 0
8059 20
8060 0
8061 0
8062 62
8063 178
8064 0
8065 0
8066 25
8067 0
8068 1
8069 253
8070 0
8071 5
8072 0
8073 0
8074 3
8075 6
807

9034 0
9035 0
9036 0
9037 1
9038 16
9039 28
9040 1
9041 30
9042 36
9043 0
9044 4
9045 9
9046 1
9047 12
9048 0
9049 39
9050 79
9051 4
9052 36
9053 0
9054 1
9055 1
9056 0
9057 33
9058 15
9059 0
9060 0
9061 3
9062 52
9063 24
9064 79
9065 3
9066 10
9067 457
9068 138
9069 167
9070 3
9071 88
9072 10
9073 95
9074 1
9075 0
9076 23
9077 0
9078 0
9079 0
9080 0
9081 0
9082 33
9083 24
9084 43
9085 0
9086 0
9087 0
9088 97
9089 0
9090 28
9091 0
9092 0
9093 0
9094 32
9095 10
9096 0
9097 0
9098 98
9099 0
9100 101
9101 0
9102 0
9103 1
9104 0
9105 0
9106 0
9107 173
9108 114
9109 12
9110 38
9111 6
9112 0
9113 10
9114 10
9115 13
9116 19
9117 142
9118 0
9119 0
9120 8
9121 0
9122 0
9123 0
9124 0
9125 0
9126 94
9127 16
9128 22
9129 3
9130 0
9131 60
9132 0
9133 0
9134 0
9135 0
9136 0
9137 4
9138 1
9139 30
9140 13
9141 14
9142 0
9143 0
9144 0
9145 0
9146 389
9147 0
9148 3
9149 14
9150 7
9151 0
9152 8
9153 6
9154 5
9155 0
9156 2
9157 15
9158 8
9159 0
9160 15
9161 46
9162 16
9163 14
9164 21
9165 39
9166 69
9167 

In [13]:
# Reload the classifier that was saved to 'model/merged.pkl' so it can be
# inspected without retraining.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
from sklearn.externals import joblib

model = joblib.load('model/merged.pkl')

In [14]:
# Echo the estimator so that its repr (including hyper-parameters) appears as the cell output.
model

LGBMClassifier(bagging_fraction=0.7, bagging_freq=20, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
        importance_type='split', learning_rate=0.05, max_bin=1500,
        max_depth=-1, metric='multi_logloss', min_child_samples=100,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=2000,
        n_jobs=-1, num_leaves=120, objective='multiclass,multi_error',
        random_state=None, reg_alpha=4, reg_lambda=8, silent=True,
        subsample=1.0, subsample_for_bin=100000, subsample_freq=0,
        verbose=5)

In [None]:
# proba = model.predict_proba(test_df.to_coo())

In [None]:
# pd.DataFrame(proba).idxmax(axis=1)+1