In [1]:
import pandas as pd
from tqdm.autonotebook import tqdm
import os
import numpy as np
tqdm.pandas()



In [2]:
train_dir = './train-data'
train_tfidf = np.load(os.path.join(train_dir, 'train_tfidf.npy'))[()]
test_tfidf = np.load(os.path.join(train_dir, 'test_tfidf.npy'))[()]
# Class will start with 0
train_y = pd.read_csv(os.path.join(train_dir, 'train_y.csv'))
test_uid = pd.read_csv(os.path.join(train_dir, 'test_x.csv'))['uid']
train_x = pd.read_csv(os.path.join(train_dir, 'train_x.csv'))
test_x = pd.read_csv(os.path.join(train_dir, 'test_x.csv'))

In [3]:
train_x.fillna(train_x.mean(), inplace=True)
test_x.fillna(train_x.mean(), inplace=True)

In [4]:
train_df = pd.SparseDataFrame(train_tfidf,
                         default_fill_value=0).astype('float32')
del train_tfidf
test_df = pd.SparseDataFrame(test_tfidf,
                         default_fill_value=0).astype('float32')
del test_tfidf

In [5]:
for col in train_x.columns:
    train_df[col] = train_x[col]
del train_x
for col in test_x.drop('uid', axis=1).columns:
    test_df[col] = test_x[col]
del test_x

In [6]:
categorical_features = ['gender', 'city', 'prodName', 'color', 'carrier']

In [7]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import gc

def cv(clf, x, y, params={}, splits=3, fit_params={}):
    kfold = sklearn.model_selection.StratifiedKFold(splits, shuffle=True)
    cv_score = sklearn.model_selection.cross_validate(clf, x, y, cv=kfold, scoring={
        'accuracy': 'accuracy',
    }, return_train_score=True, verbose=5, fit_params=fit_params)
    return cv_score

In [8]:

params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass,multi_error',
        "metric": 'multi_logloss',
        'n_jobs': -1,
    
        #better
        'n_estimators': 2000,
        "num_leaves": 120,
        "subsample_for_bin": 100000,
        'max_bin': 1500,
    
        #fast
#         'n_estimators': 100,
#         "num_leaves": 36,
#         "subsample_for_bin": 20000,
#         'max_bin': 512,
    
        # Overfit
        'feature_fraction': 0.4,
        'bagging_fraction': 0.7,
        'bagging_freq': 10,
        'reg_alpha': 4,
        'reg_lambda': 8,
    
        
        "max_depth": -1,
        'min_child_samples': 100,
        'learning_rate': 0.05,
}
def run_cross_validation(clf, x, y, fit_params={}):
    cv_result = cv(clf, x, y, params=params, splits=3, fit_params=fit_params)
    for scorer, score in cv_result.items():
        print('%s: %s' % (scorer, score))
        print('Average %s: %f' % (scorer, score.mean()))

In [9]:
gc.collect()


1477

In [10]:
categorical_indices = [list(train_df.columns).index(cat) for cat in categorical_features]

In [None]:
gc.collect()
clf = lgb.LGBMClassifier(**params)
run_cross_validation(clf, train_df.to_coo(), train_y.values.ravel(), {'categorical_feature': categorical_indices})

In [11]:
def train(x, y, params={}):
    clf = lgb.LGBMClassifier(**params)
    clf.fit(x, y, categorical_feature=categorical_indices)
    return clf


In [12]:
# load model
# model = joblib.load('lgb.pkl')

In [13]:
model = train(train_df.to_coo(), train_y.values.ravel(), params)

New categorical_feature is [9401, 9402, 9403, 9408, 9410]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [14]:
pred_train_y  = model.predict(train_df.to_coo())
acc = (pred_train_y == train_y.values.ravel()).sum()
acc / pred_train_y.shape[0]



0.7613139303482587

In [15]:
pred_y = model.predict(test_df.to_coo())

In [16]:
result = pd.DataFrame(test_uid)
result.columns = ['id']
result['label'] = pred_y

In [17]:
result.to_csv('submission.csv', index=False)

In [18]:
for i in range(train_df.shape[1]):
    print(train_df.columns[i], model.feature_importances_[i])

0 0
1 17
2 0
3 0
4 0
5 0
6 0
7 0
8 13
9 0
10 12
11 0
12 4
13 0
14 1
15 14
16 0
17 0
18 2
19 0
20 6
21 0
22 0
23 2
24 1
25 0
26 0
27 37
28 0
29 0
30 99
31 1
32 44
33 0
34 0
35 0
36 500
37 146
38 17
39 19
40 0
41 4
42 1
43 2
44 0
45 1
46 0
47 86
48 314
49 0
50 0
51 8
52 0
53 7
54 0
55 1
56 0
57 55
58 269
59 10
60 5
61 11
62 0
63 0
64 5
65 148
66 14
67 0
68 2
69 66
70 0
71 0
72 0
73 22
74 36
75 175
76 38
77 8
78 0
79 8
80 85
81 315
82 217
83 6
84 68
85 3
86 0
87 0
88 0
89 19
90 0
91 23
92 0
93 0
94 0
95 0
96 73
97 1
98 0
99 23
100 0
101 45
102 0
103 152
104 16
105 11
106 2
107 0
108 0
109 0
110 134
111 0
112 1
113 1
114 3
115 25
116 32
117 0
118 0
119 0
120 0
121 0
122 11
123 96
124 0
125 23
126 29
127 43
128 3
129 33
130 181
131 68
132 0
133 123
134 3
135 0
136 15
137 17
138 41
139 61
140 0
141 29
142 39
143 0
144 20
145 20
146 0
147 1
148 0
149 2
150 0
151 0
152 31
153 17
154 74
155 0
156 3
157 12
158 239
159 3
160 0
161 96
162 29
163 0
164 176
165 2
166 156
167 17
168 50
169 0
170 0
17

2342 5
2343 40
2344 470
2345 104
2346 1
2347 24
2348 55
2349 6
2350 12
2351 27
2352 935
2353 26
2354 0
2355 2
2356 0
2357 0
2358 9
2359 4
2360 42
2361 7
2362 12
2363 27
2364 4
2365 0
2366 215
2367 0
2368 0
2369 0
2370 5
2371 0
2372 1
2373 26
2374 0
2375 0
2376 10
2377 0
2378 1
2379 0
2380 0
2381 0
2382 66
2383 39
2384 0
2385 1
2386 110
2387 3
2388 484
2389 0
2390 81
2391 11
2392 18
2393 0
2394 0
2395 61
2396 0
2397 406
2398 0
2399 46
2400 0
2401 44
2402 12
2403 1
2404 22
2405 0
2406 11
2407 11
2408 0
2409 0
2410 0
2411 0
2412 8
2413 0
2414 0
2415 0
2416 1
2417 44
2418 28
2419 23
2420 0
2421 80
2422 0
2423 16
2424 0
2425 3
2426 0
2427 54
2428 9
2429 16
2430 6
2431 16
2432 0
2433 1
2434 0
2435 1151
2436 0
2437 0
2438 0
2439 0
2440 0
2441 0
2442 19
2443 0
2444 0
2445 0
2446 13
2447 11
2448 4
2449 59
2450 0
2451 27
2452 2
2453 0
2454 0
2455 0
2456 6
2457 0
2458 0
2459 31
2460 100
2461 0
2462 0
2463 0
2464 0
2465 61
2466 0
2467 0
2468 0
2469 0
2470 3
2471 0
2472 0
2473 11
2474 0
2475 4
2476

4537 0
4538 0
4539 32
4540 2
4541 0
4542 45
4543 0
4544 0
4545 0
4546 341
4547 0
4548 0
4549 0
4550 2
4551 0
4552 258
4553 0
4554 202
4555 17
4556 0
4557 15
4558 0
4559 0
4560 3
4561 0
4562 0
4563 0
4564 75
4565 188
4566 544
4567 0
4568 168
4569 3554
4570 15
4571 36
4572 126
4573 0
4574 6
4575 0
4576 3
4577 79
4578 54
4579 3
4580 61
4581 0
4582 4
4583 0
4584 0
4585 6
4586 14
4587 10
4588 2
4589 17
4590 13
4591 40
4592 1758
4593 42
4594 4
4595 2
4596 3
4597 0
4598 0
4599 0
4600 1
4601 2
4602 0
4603 5
4604 0
4605 0
4606 0
4607 6
4608 1
4609 0
4610 0
4611 0
4612 0
4613 0
4614 0
4615 0
4616 0
4617 0
4618 95
4619 0
4620 6
4621 0
4622 0
4623 1
4624 0
4625 9
4626 0
4627 0
4628 5
4629 1
4630 0
4631 1
4632 0
4633 1
4634 0
4635 0
4636 3
4637 1
4638 0
4639 0
4640 0
4641 2
4642 0
4643 16
4644 0
4645 12
4646 5
4647 2
4648 0
4649 0
4650 0
4651 0
4652 1
4653 0
4654 0
4655 3
4656 0
4657 0
4658 1
4659 0
4660 0
4661 0
4662 2
4663 0
4664 0
4665 0
4666 0
4667 1
4668 0
4669 6
4670 8
4671 0
4672 0
4673 0
46

6716 954
6717 35
6718 0
6719 4
6720 0
6721 0
6722 0
6723 1
6724 12
6725 185
6726 0
6727 13
6728 195
6729 1
6730 30
6731 7
6732 0
6733 147
6734 0
6735 4
6736 5
6737 1
6738 0
6739 103
6740 38
6741 2
6742 13
6743 67
6744 4
6745 50
6746 0
6747 85
6748 209
6749 1
6750 0
6751 0
6752 10
6753 116
6754 0
6755 0
6756 185
6757 7
6758 68
6759 2
6760 31
6761 0
6762 107
6763 67
6764 1
6765 0
6766 37
6767 0
6768 0
6769 4
6770 0
6771 0
6772 0
6773 48
6774 0
6775 0
6776 0
6777 1
6778 0
6779 3
6780 0
6781 0
6782 0
6783 55
6784 0
6785 8
6786 0
6787 1
6788 226
6789 0
6790 5
6791 0
6792 428
6793 23
6794 7
6795 0
6796 47
6797 220
6798 187
6799 17
6800 0
6801 0
6802 15
6803 0
6804 0
6805 184
6806 24
6807 236
6808 0
6809 0
6810 0
6811 10
6812 0
6813 0
6814 102
6815 0
6816 75
6817 1246
6818 14
6819 6
6820 44
6821 234
6822 28
6823 0
6824 591
6825 0
6826 4
6827 0
6828 0
6829 1
6830 2
6831 0
6832 36
6833 2612
6834 0
6835 0
6836 0
6837 0
6838 0
6839 0
6840 367
6841 0
6842 0
6843 0
6844 0
6845 2
6846 6
6847 18
6848

8909 0
8910 126
8911 0
8912 1
8913 20
8914 47
8915 0
8916 0
8917 54
8918 0
8919 2
8920 0
8921 0
8922 30
8923 0
8924 0
8925 0
8926 0
8927 0
8928 0
8929 28
8930 9
8931 3
8932 52
8933 62
8934 62
8935 0
8936 0
8937 15
8938 73
8939 0
8940 20
8941 793
8942 0
8943 355
8944 0
8945 8
8946 0
8947 0
8948 0
8949 36
8950 0
8951 45
8952 21
8953 0
8954 51
8955 0
8956 5
8957 7
8958 0
8959 0
8960 0
8961 8
8962 0
8963 0
8964 10
8965 187
8966 1
8967 0
8968 0
8969 31
8970 84
8971 17
8972 3
8973 0
8974 1
8975 27
8976 0
8977 0
8978 4
8979 0
8980 2
8981 23
8982 0
8983 0
8984 8
8985 0
8986 83
8987 15
8988 15
8989 19
8990 1151
8991 0
8992 0
8993 143
8994 1
8995 20
8996 115
8997 153
8998 9
8999 1
9000 0
9001 31
9002 24
9003 1635
9004 69
9005 1
9006 0
9007 41
9008 0
9009 0
9010 0
9011 6
9012 0
9013 0
9014 48
9015 4
9016 101
9017 7
9018 11
9019 0
9020 59
9021 0
9022 4
9023 0
9024 17
9025 0
9026 0
9027 12
9028 0
9029 0
9030 0
9031 9
9032 0
9033 0
9034 0
9035 0
9036 0
9037 0
9038 16
9039 28
9040 3
9041 31
9042 42
9

In [19]:
from sklearn.externals import joblib
# save model
joblib.dump(model, 'model/merged.pkl')


['model/merged.pkl']

In [20]:
from sklearn.externals import joblib

model = joblib.load('model/merged.pkl')

In [21]:
model

LGBMClassifier(bagging_fraction=0.7, bagging_freq=10, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
        importance_type='split', learning_rate=0.05, max_bin=1500,
        max_depth=-1, metric='multi_logloss', min_child_samples=100,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=2000,
        n_jobs=-1, num_leaves=120, objective='multiclass,multi_error',
        random_state=None, reg_alpha=4, reg_lambda=8, silent=True,
        subsample=1.0, subsample_for_bin=100000, subsample_freq=0)

In [None]:
proba = model.predict_proba(test_df.to_coo())

In [None]:
pd.DataFrame(proba).idxmax(axis=1)+1