In [0]:
# 引入使用的依赖库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
# Load the training set and standardize the selected features.
# Column names are supplied explicitly: six feature columns followed by the label.
column_names = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'label']
all_pd = pd.read_csv("data_train.csv", sep=',', names=column_names)

# Only positional columns 4 and 5 (f5, f6) are used as model inputs;
# the last column is the target.
feature_rows = all_pd.iloc[:, 4:6].values.tolist()
label_values = all_pd.iloc[:, -1].values.tolist()
X = np.array(feature_rows, dtype='float32')
y = np.array(label_values, dtype='float32')

# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/validation split performed further down, so validation rows
# contribute to the mean/std statistics.
scaler = sklearn.preprocessing.StandardScaler()
X = scaler.fit_transform(X)

print(X.shape)
print(y.shape)

(85500, 2)
(85500,)


In [0]:
# Randomly hold out 20% of the samples for validation (80:20 split).
# The fixed random_state makes the partition reproducible.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
# 定义f1 metric
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend operations.

    Both inputs are clipped to [0, 1] and rounded to hard 0/1 values before
    counting, so the score is computed on thresholded predictions. The value
    is computed on whatever tensors are passed in (one batch at a time when
    used as a Keras metric).

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        2 * precision * recall / (precision + recall), with K.epsilon()
        added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    def _count_positives(values):
        # Clip to [0, 1], round to 0/1, then count the ones.
        return K.sum(K.round(K.clip(values, 0, 1)))

    # Entries that are positive in both the labels and the predictions.
    hits = _count_positives(y_true * y_pred)

    # Recall: share of actual positives that were predicted positive.
    recall_score = hits / (_count_positives(y_true) + eps)
    # Precision: share of predicted positives that are actually positive.
    precision_score = hits / (_count_positives(y_pred) + eps)

    return 2*((precision_score*recall_score)/(precision_score+recall_score+eps))

In [0]:
# 定义MLP模型结构
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout

# MLP for binary classification: 2 inputs -> 8 -> 4 -> 2 ReLU units, each
# hidden layer followed by 50% dropout, ending in a single sigmoid output.
model = Sequential([
    Dense(8, input_dim=2, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with RMSprop; the custom f1 metric is reported
# during training (and as val_f1 on the validation data).
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=[f1],
)

In [0]:
# 定义callback
from keras.callbacks import EarlyStopping,ModelCheckpoint
# Stop training once val_f1 has not improved by at least 0.001 for 10 epochs.
early_stopper = EarlyStopping(
    monitor='val_f1',
    min_delta=0.001,
    patience=10,
    verbose=0,
    mode='max',
)
# After every epoch, write the weights to "w.h5" if val_f1 is the best so far.
best_weights_saver = ModelCheckpoint(
    "w.h5",
    monitor='val_f1',
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode='max',
    period=1,
)
callbacks = [early_stopper, best_weights_saver]

In [28]:
# Train for at most 100 epochs in batches of 2048 samples, evaluating on the
# held-out validation split after each epoch; the callbacks may stop earlier.
model.fit(
    train_X,
    train_y,
    epochs=100,
    batch_size=2048,
    validation_data=(val_X, val_y),
    callbacks=callbacks,
)

Train on 68400 samples, validate on 17100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


<keras.callbacks.History at 0x7f5c0746a518>

In [0]:
# Produce predictions for the test set and write them to result.csv.
test_pd = pd.read_csv("data_test.csv", sep=',', names=['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])

# The scaler and the network were fitted on positional columns 4:6 (f5, f6)
# of the training frame only, so the same two columns must be selected here;
# passing all six columns fails with a feature-count mismatch.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
test_data = test_pd.iloc[:, 4:6].values.astype('float32')
test_data = scaler.transform(test_data)

# ModelCheckpoint saved the weights with the best val_f1 to "w.h5". Early
# stopping leaves the in-memory model at the last epoch (past the best one),
# so restore the best weights before predicting.
model.load_weights("w.h5")

res = model.predict(test_data)

# One line per test row: 1-based row id, then the sigmoid output rounded to 0/1.
# The context manager closes the file even if a write fails.
with open("result.csv", "w", encoding="UTF-8") as result:
    for i, r in enumerate(res, start=1):
        result.write("{},{}\n".format(i, int(round(float(r[0])))))