In [1]:
import pandas as pd
from tqdm.autonotebook import tqdm
import os
import numpy as np
# Turn on tqdm's pandas integration (presumably for `progress_apply`-style
# progress bars; none of the cells visible here actually use it).
tqdm.pandas()



In [2]:
# Directory holding the pre-computed TF-IDF features and the label / id CSVs.
train_dir = './train-data'
# Each .npy file is indexed with `[()]` right after loading, which unwraps a
# 0-d array into the single object it holds (presumably a sparse TF-IDF
# matrix -- TODO confirm). Such object arrays are stored with pickle, and
# NumPy >= 1.16.3 refuses to load them unless allow_pickle=True is passed.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_tfidf = np.load(os.path.join(train_dir, 'train_tfidf.npy'), allow_pickle=True)[()]
test_tfidf = np.load(os.path.join(train_dir, 'test_tfidf.npy'), allow_pickle=True)[()]
# Class will start with 0
train_y = pd.read_csv(os.path.join(train_dir, 'train_y.csv'))
# Only the 'uid' column is kept, so `test_uid` is a pandas Series (not a DataFrame).
test_uid = pd.read_csv(os.path.join(train_dir, 'test_x.csv'))['uid']

In [3]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import gc

def cv(clf, x, y, params=None, splits=3, fit_params=None, random_state=None):
    """Run shuffled, stratified k-fold cross-validation and return the scores.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``sklearn.model_selection.cross_validate``.
    x, y : array-like
        Feature matrix and target labels.
    params : dict or None
        Not used by this function; accepted only so existing callers that
        pass it keep working.
    splits : int
        Number of stratified folds.
    fit_params : dict or None
        Extra keyword arguments forwarded to the estimator's ``fit`` via
        ``cross_validate``. ``None`` is treated as an empty dict.
    random_state : int or None
        Seed for the fold shuffling. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer to make the folds reproducible.

    Returns
    -------
    dict
        The dictionary returned by ``cross_validate`` (train scores included,
        since ``return_train_score=True``).
    """
    kfold = sklearn.model_selection.StratifiedKFold(
        n_splits=splits, shuffle=True, random_state=random_state)
    # NOTE(review): recent scikit-learn releases renamed cross_validate's
    # `fit_params` keyword to `params` -- confirm against the installed version.
    return sklearn.model_selection.cross_validate(
        clf, x, y,
        cv=kfold,
        scoring={'accuracy': 'accuracy'},
        return_train_score=True,
        verbose=5,
        fit_params={} if fit_params is None else fit_params,
    )

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:

# Hyper-parameters shared by the cross-validation run and the final fit.
params = {
    'boosting_type': 'gbdt',
    # 'multi_error' is a LightGBM *metric* name, not an objective, so the old
    # value 'multiclass,multi_error' was not a valid objective string.
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'n_jobs': -1,

    # Larger / slower configuration ("better")
    'n_estimators': 1500,
    'num_leaves': 120,
    'subsample_for_bin': 90000,
    'max_bin': 1200,

    # Smaller / faster alternative ("fast") -- swap with the block above
#     'n_estimators': 100,
#     'num_leaves': 36,
#     'subsample_for_bin': 20000,
#     'max_bin': 512,

    # Sub-sampling and L1/L2 penalties (the "Overfit" controls)
    'feature_fraction': 0.4,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'reg_alpha': 4,
    'reg_lambda': 8,

    'max_depth': -1,
    'min_child_samples': 100,
    'learning_rate': 0.05,
}
def run_cross_validation(clf, x, y, fit_params={}):
    """Cross-validate ``clf`` with 3 folds and print every score series.

    Delegates to ``cv`` (passing the module-level ``params`` dict along) and,
    for each entry of the returned score dict, prints the raw values followed
    by their mean. Nothing is returned.
    """
    scores_by_name = cv(clf, x, y, params=params, splits=3, fit_params=fit_params)
    for name in scores_by_name:
        values = scores_by_name[name]
        print('{}: {}'.format(name, values))
        print('Average {}: {:f}'.format(name, values.mean()))

In [5]:
# Run a garbage-collection pass before starting the cross-validation.
gc.collect()
# Build a LightGBM classifier from the shared `params` dict and cross-validate
# it on the TF-IDF features; the label frame is flattened to a 1-D array.
clf = lgb.LGBMClassifier(**params)
run_cross_validation(clf, x=train_tfidf, y=np.ravel(train_y.values))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................... , accuracy=0.6094065531245476, total=176.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 194.9min remaining:    0.0s


[CV] ................... , accuracy=0.6089059701492537, total=176.2min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 389.6min remaining:    0.0s


KeyboardInterrupt: 

In [None]:
def train(x, y, params=None):
    """Fit a new LightGBM classifier on ``x`` / ``y`` and return it.

    Parameters
    ----------
    x, y : array-like
        Training features and labels, passed straight to ``fit``.
    params : dict or None
        Keyword arguments for ``lgb.LGBMClassifier``. ``None`` (the default)
        or an empty dict means "use the library defaults".

    Returns
    -------
    The fitted ``lgb.LGBMClassifier`` instance.
    """
    # `None` instead of a shared mutable `{}` default; also lets callers pass
    # params=None explicitly without hitting a `**None` TypeError.
    clf = lgb.LGBMClassifier(**(params or {}))
    clf.fit(x, y)
    return clf


In [None]:
# load model
# model = joblib.load('lgb.pkl')

In [None]:
# Fit the final model on the full training set with the shared `params`.
model = train(x=train_tfidf, y=train_y.values.ravel(), params=params)

In [None]:
# Accuracy of the fitted model on the data it was trained on.
pred_train_y = model.predict(train_tfidf)
# `acc` holds the *count* of correct predictions; the ratio is the cell output.
acc = np.sum(pred_train_y == train_y.values.ravel())
acc / len(pred_train_y)

In [None]:
# Predict labels for the test TF-IDF matrix; used as the 'label' column of
# the submission table built below.
pred_y = model.predict(test_tfidf)

In [None]:
# Build the submission table: one row per test uid with its predicted label.
# `test_uid` is already the 'uid' column (a Series) selected when the test CSV
# was read, so indexing it with ['uid'] again raised a KeyError.
result = pd.DataFrame({'id': test_uid.values, 'label': pred_y})

In [None]:
# Write the submission table to submission.csv, leaving out the DataFrame index.
result.to_csv('submission.csv', index=False)

In [None]:
# Print every feature's importance. No `train_x` frame is defined anywhere in
# this notebook (the model is fitted on `train_tfidf`), so features are
# identified by their positional index instead of a column name.
for i, importance in enumerate(model.feature_importances_):
    print(i, importance)

In [None]:
# `sklearn.externals.joblib` was removed from scikit-learn; prefer the
# standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# save model
# Create the target directory first so the dump does not fail on a fresh checkout.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/tf-idf-lgb.pkl')


In [None]:
# `sklearn.externals.joblib` was removed from scikit-learn; prefer the
# standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Restore the previously saved model.
# NOTE(review): joblib files are pickles and can execute arbitrary code when
# loaded -- only load model files you produced yourself or otherwise trust.
model = joblib.load('model/tf-idf-lgb.pkl')

In [None]:
# Bare expression: shows the model object's repr as the cell output.
model