## 预测文言文还是现代文
```
train: [0, 5000)
test: [5000,8385)
```

In [4]:
import pandas as pd

# Load the train and test tables with pandas' default CSV settings.
train, test = map(pd.read_csv, ('data/train.txt', 'data/test.txt'))

# The sample submission file is read twice, giving two independent DataFrames:
# `submit` later receives the predicted 'y' column, while `sample_submit` is
# not modified anywhere in the visible cells.
sample_submit = pd.read_csv('data/sample_submit.csv')
submit = pd.read_csv('data/sample_submit.csv')


In [5]:
# Number of training rows; used later to split the combined feature matrix
# back into its train part ([:n_train]) and test part ([n_train:]).
n_train = len(train)
# Combined row count of train + test.
total = n_train + len(test)

### 当场训练w2v

In [12]:
from gensim.models import Word2Vec

# Dimensionality of the embedding vectors.
ndims = 100

# Combined corpus: every train text followed by every test text, in order, so
# index i < len(train) refers to a train row and the rest to test rows.
texts = [*train['text'], *test['text']]

# Train Word2Vec on the combined corpus; each element of `texts` is handed to
# gensim as one "sentence".
# NOTE(review): `size=` is the gensim < 4.0 keyword; gensim 4.x renamed it to
# `vector_size` — confirm against the installed gensim version.
model = Word2Vec(size=ndims, sentences=texts)

### 用w2v将每个句子中各字符的向量加和平均，作为句子嵌入

In [13]:
import numpy as np
# Build one fixed-length feature vector per text: the mean of the Word2Vec
# vectors of its characters. Spaces and characters missing from the Word2Vec
# vocabulary are skipped and do not count towards the mean.
vecs = np.zeros([total, ndims])
for i, sentence in enumerate(texts):
    counts, row = 0, 0
    for char in sentence:
        if char == ' ':
            continue
        try:
            row += model.wv[char]
        except KeyError:
            # Character is not in the Word2Vec vocabulary; ignore it.
            continue
        counts += 1
    if counts == 0:
        # No usable character in this text: report it and keep the all-zero
        # row instead of dividing by zero.
        print(sentence)
        continue
    vecs[i, :] = row / counts

### 用xgb的分类器做预测(这里也可以用sklearn中的决策树来做)

In [19]:
import xgboost as xgb

# Binary XGBoost classifier trained on the averaged-embedding features.
clf = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=5,
    seed=0,
    subsample=0.8,
    colsample_bytree=0.3,
    gamma=0.5,
    reg_alpha=3,
    reg_lambda=1,
    # XGBoost's name for the evaluation metric is `eval_metric`; `metrics` is
    # an argument of xgboost.cv and is not recognised by the classifier.
    eval_metric='logloss')

# The first n_train rows of `vecs` are the train texts; fit on those only.
clf.fit(vecs[:n_train], train['y'])
# Score on the same rows the model was fitted on, i.e. a training score rather
# than a held-out estimate.
print(clf.score(vecs[:n_train], train['y']))
# Second predict_proba column = probability of the second class in
# clf.classes_ (presumably label 1 — assumes train['y'] is 0/1; verify).
# NOTE(review): assumes `submit` has exactly one row per test text, in order.
submit['y'] = clf.predict_proba(vecs[n_train:])[:, 1]
submit.to_csv('my_prediction.csv', index=False)


0.9456


### 网格搜索调参

In [None]:
from sklearn.model_selection import GridSearchCV

def model_cv(X_train, Y_train):
    """Grid-search XGBoost hyper-parameters with 5-fold cross-validation.

    Fits a ``GridSearchCV`` over ``cv_params`` using ``neg_log_loss`` as the
    selection score, then prints the per-candidate results, the best
    parameter combination and the best score.

    Args:
        X_train: Feature matrix passed to ``GridSearchCV.fit``.
        Y_train: Target labels passed to ``GridSearchCV.fit``.

    Returns:
        None. Results are only printed.
    """
    # Only `n_estimators` is searched at the moment; uncomment further entries
    # to widen the grid (each added entry multiplies the number of fits).
    cv_params = {
        'n_estimators': range(1000, 1050, 25),
        # 'max_depth': range(3, 8, 1),
        # 'min_child_weight': range(3, 8, 1),
        # 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
        # 'colsample_bytree': [0.2, 0.3, 0.4],
        # 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        # 'reg_alpha': [2, 3, 4, 5, 6],
        # 'reg_lambda': [2, 3, 4, 5, 6, 7],
        # 'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1]
    }
    # Base estimator; any parameter listed in cv_params overrides the value
    # given here for each candidate.
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.01,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=5,
        seed=0,
        subsample=0.8,
        colsample_bytree=0.3,
        gamma=0.5,
        reg_alpha=3,
        reg_lambda=1,
        # `eval_metric` is the XGBoost parameter name; `metrics` belongs to
        # xgboost.cv and is not recognised by the classifier.
        eval_metric='logloss')

    optimized_GBM = GridSearchCV(
        estimator=model,
        param_grid=cv_params,
        scoring='neg_log_loss',
        cv=5,
        verbose=1,
        n_jobs=4)

    optimized_GBM.fit(X_train, Y_train)

    # `grid_scores_` was removed from scikit-learn (0.20); build the same
    # per-candidate summary (params, mean score, std) from `cv_results_`.
    cv_results = optimized_GBM.cv_results_
    evaluate_result = list(zip(
        cv_results['params'],
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
    ))
    print('每轮迭代运行结果:{0}'.format(evaluate_result))
    print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
    print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))


# Run the hyper-parameter search on the training rows only: the first n_train
# rows of `vecs`, with the 'y' column of `train` as targets.
if __name__ == "__main__":
    model_cv(X_train=vecs[:n_train], Y_train=train['y'])