In [1]:
# python 3.6
# tensorflow 2.3.1 cpu
# sklearn.__version__ == 0.23.2

import pandas as pd
import numpy as np
import pickle 
from time import time

from sklearn.utils import shuffle # shuffle打乱样本的顺序，它只会打乱样本的顺序，每个样本的数据维持不变。
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def save_obj(obj, file):
    with open(file, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(file ):
    with open(file, 'rb') as f:
        return pickle.load(f)

In [2]:
# 评价指标函数
def f1_score_get(precision, recall):
    # tf.keras.backend.epsilon() 的值为 1e-07
    # 1.0e-7 == tf.keras.backend.epsilon() 得到 True
    f1 = 2 * precision * recall/( precision + recall + 1.0e-7 )
    return f1

# valid_y:真实标签
# predict_y:预测标签
def eval_p_r_f1(valid_y, predict_y):
    # precision 0 1 2 3
    # setting labels=[pos_label] and average != 'binary' will report scores for that label only.
    accuracy = accuracy_score(
                    y_true = valid_y,
                    y_pred = predict_y
                              )
    
    precision_0 = precision_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [0],
                    pos_label = 0,
                    average = 'micro'
                    )

    precision_1 = precision_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [1],
                    pos_label = 1,
                    average = 'micro'
                    )

    precision_2 = precision_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [2],
                    pos_label = 2,
                    average = 'micro'
                    )

    precision_3 = precision_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [3],
                    pos_label = 3,
                    average = 'micro'
                    )
    # recall 0 1 2 3
    recall_0 = recall_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [0],
                    pos_label = 0,
                    average = 'micro'
                    )

    recall_1 = recall_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [1],
                    pos_label = 1,
                    average = 'micro'
                    )

    recall_2 = recall_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [2],
                    pos_label = 2,
                    average = 'micro'
                    )

    recall_3 = recall_score(
                    y_true = valid_y,
                    y_pred = predict_y,
                    labels = [3],
                    pos_label = 3,
                    average = 'micro'
                    )
    
    # f1_score 0 1 2 3
    f1_score_0 = f1_score_get( precision_0, recall_0 )
    f1_score_1 = f1_score_get( precision_1, recall_1 )
    f1_score_2 = f1_score_get( precision_2, recall_2 )
    f1_score_3 = f1_score_get( precision_3, recall_3 )
    
    #由y_true 计算各标签权重
    num_0 = np.sum(valid_y==0)
    num_1 = np.sum(valid_y==1)
    num_2 = np.sum(valid_y==2)
    num_3 = np.sum(valid_y==3)
    
    total = num_0 + num_1 + num_2 + num_3
    p_0 = num_0/total
    p_1 = num_1/total
    p_2 = num_2/total
    p_3 = num_3/total
    
    precision_avg = p_0 * precision_0 + p_1 * precision_1 + p_2 * precision_2 + p_3 * precision_3
    recall_avg = p_0 * recall_0 + p_1 * recall_1 + p_2 * recall_2 + p_3 * recall_3
    f1_score_avg = p_0 * f1_score_0 + p_1 * f1_score_1 + p_2 * f1_score_2 + p_3 * f1_score_3
    
    return accuracy, precision_avg, recall_avg, f1_score_avg


In [27]:
# train = pd.read_csv('./data/train_location.csv', usecols=['content', 'label'])
# valid = pd.read_csv('./data/valid_location.csv', usecols=['content', 'label'])

# train = pd.read_csv('./data/train_service.csv', usecols=['content', 'label'])
# valid = pd.read_csv('./data/valid_service.csv', usecols=['content', 'label'])

# train = pd.read_csv('./data/train_price.csv', usecols=['content', 'label'])
# valid = pd.read_csv('./data/valid_price.csv', usecols=['content', 'label'])

# train = pd.read_csv('./data/train_environment.csv', usecols=['content', 'label'])
# valid = pd.read_csv('./data/valid_environment.csv', usecols=['content', 'label'])

train = pd.read_csv('./data/train_dish.csv', usecols=['content', 'label'])
valid = pd.read_csv('./data/valid_dish.csv', usecols=['content', 'label'])

train = shuffle(train, random_state = 2020)
valid = shuffle(valid, random_state = 2020)

train = shuffle(train, random_state = 42)
valid = shuffle(valid, random_state = 42)

print(train.shape)
print(valid.shape)


(105000, 2)
(15000, 2)


In [28]:
# 读取 X
# 不同aspect，文本数据一样，不同体现在标签上

train_x = train.content.values.tolist()
valid_x = valid.content.values.tolist()

# print(train_x[0:3])

In [29]:
# 读取 Y
# 标签
train_y = train.label.values+2
valid_y = valid.label.values+2

In [30]:
n_features = 1000

tfidf_vectorizer = TfidfVectorizer(
                                        max_df=0.95, 
                                        min_df=2,
                                        max_features=n_features,
                                   )
corpus = train_x + valid_x
print(len(corpus))

t0 = time()
tfidf = tfidf_vectorizer.fit(corpus)

120000


In [31]:
print("done in %0.3fs." % (time() - t0))
train_x = tfidf.transform(train_x)
valid_x = tfidf.transform(valid_x)

done in 8.560s.


In [32]:
print(train_x.shape)
print(valid_x.shape)

(105000, 1000)
(15000, 1000)


In [33]:
# 标准化
scaler = StandardScaler(with_mean=False)
train_x = scaler.fit_transform(train_x)
valid_x = scaler.fit_transform(valid_x)

In [10]:
hidden_layer_sizes_range = [32, 64, 128, 256] # h越大，拟合能力越强
alpha_range = [0.1, 0.01, 0.001, 0.0001] # alpha越小，拟合能力越强
tol_range = [0.01, 0.001, 0.0001] # tol_range 越小，拟合能力越强

In [11]:
for hidden in hidden_layer_sizes_range:
    for alpha in alpha_range:
        for tol in tol_range:
            print('hidden_layer_sizes: %d' % hidden)
            print('alpha: %f' % alpha)
            print('tol: %f' % tol)
            clf = MLPClassifier(
                                    hidden_layer_sizes = (hidden,),
                                    activation = 'relu',
                                    solver = 'adam',
                                    alpha = alpha,
                                    batch_size = 128,
                                    learning_rate = 'adaptive',
                                    learning_rate_init = 0.001,
                                    max_iter = 10000,
                                    shuffle=True,
                                    random_state = 2020,
                                    tol = tol,
                                    verbose = True,
                                    early_stopping = True,
                                    validation_fraction = 0.1,
                                    n_iter_no_change = 10
                                )
            t0 = time()
            clf.fit(train_x, train_y)
            print("done in %0.3fs." % (time() - t0))
            predict_y = clf.predict(valid_x)
            # predict_prob_y = clf.predict_proba(valid_x)
            # 评价指标值
            accuracy, precision_avg, recall_avg, f1_score_avg = eval_p_r_f1(valid_y, predict_y)
            print('accuracy：%0.5f' % accuracy)
            print('precision_avg: %0.5f' % precision_avg)
            print('recall_avg: %0.5f' % recall_avg)
            print('f1_score_avg: %0.5f' % f1_score_avg)
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            

hidden_layer_sizes: 32
alpha: 0.100000
tol: 0.010000
Iteration 1, loss = 0.72881718
Validation score: 0.786286
Iteration 2, loss = 0.60309326
Validation score: 0.790952
Iteration 3, loss = 0.58067367
Validation score: 0.790381
Iteration 4, loss = 0.56586654
Validation score: 0.788762
Iteration 5, loss = 0.55131942
Validation score: 0.784952
Iteration 6, loss = 0.53667248
Validation score: 0.781619
Iteration 7, loss = 0.52284411
Validation score: 0.775905
Iteration 8, loss = 0.51152959
Validation score: 0.775619
Iteration 9, loss = 0.50052654
Validation score: 0.768571
Iteration 10, loss = 0.49208090
Validation score: 0.761429
Iteration 11, loss = 0.48459131
Validation score: 0.759619
Iteration 12, loss = 0.47694508
Validation score: 0.758952
Validation score did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.
done in 31.672s.
accuracy：0.79547
precision_avg: 0.76734
recall_avg: 0.79547
f1_score_avg: 0.77285
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
hidden_la

In [10]:
clf = MLPClassifier(
                        hidden_layer_sizes = (32,),
                        activation = 'relu',
                        solver = 'adam',
                        alpha = 0.001,
                        batch_size = 128,
                        learning_rate = 'adaptive',
                        learning_rate_init = 0.001,
                        max_iter = 10000,
                        shuffle=True,
                        random_state = 2020,
                        tol = 0.01,
                        verbose = True,
                        early_stopping = True,
                        validation_fraction = 0.1,
                        n_iter_no_change = 10
                    )

t0 = time()
clf.fit(train_x, train_y)
print("done in %0.3fs." % (time() - t0))
predict_y = clf.predict(valid_x)
# predict_prob_y = clf.predict_proba(valid_x)
# 评价指标值
accuracy, precision_avg, recall_avg, f1_score_avg = eval_p_r_f1(valid_y, predict_y)

print('accuracy：%0.5f' % accuracy)
print('precision_avg: %0.5f' % precision_avg)
print('recall_avg: %0.5f' % recall_avg)
print('f1_score_avg: %0.5f' % f1_score_avg)
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

Iteration 1, loss = 1.00129246
Validation score: 0.647048
Iteration 2, loss = 0.85572982
Validation score: 0.654000
Iteration 3, loss = 0.82261760
Validation score: 0.646762
Iteration 4, loss = 0.79443682
Validation score: 0.643714
Iteration 5, loss = 0.76908106
Validation score: 0.639048
Iteration 6, loss = 0.74541722
Validation score: 0.634000
Iteration 7, loss = 0.72562899
Validation score: 0.627810
Iteration 8, loss = 0.70916985
Validation score: 0.629143
Iteration 9, loss = 0.69299927
Validation score: 0.623905
Iteration 10, loss = 0.68046127
Validation score: 0.618000
Iteration 11, loss = 0.66826883
Validation score: 0.619048
Iteration 12, loss = 0.65881605
Validation score: 0.610667
Validation score did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.
done in 66.232s.
accuracy：0.64673
precision_avg: 0.60509
recall_avg: 0.64673
f1_score_avg: 0.60932
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


In [18]:
clf = MLPClassifier(
                        hidden_layer_sizes = (32,),
                        activation = 'relu',
                        solver = 'adam',
                        alpha = 0.001,
                        batch_size = 128,
                        learning_rate = 'adaptive',
                        learning_rate_init = 0.001,
                        max_iter = 10000,
                        shuffle=True,
                        random_state = 2020,
                        tol = 0.01,
                        verbose = True,
                        early_stopping = True,
                        validation_fraction = 0.1,
                        n_iter_no_change = 10
                    )

t0 = time()
clf.fit(train_x, train_y)
print("done in %0.3fs." % (time() - t0))
predict_y = clf.predict(valid_x)
# predict_prob_y = clf.predict_proba(valid_x)
# 评价指标值
accuracy, precision_avg, recall_avg, f1_score_avg = eval_p_r_f1(valid_y, predict_y)

print('accuracy：%0.5f' % accuracy)
print('precision_avg: %0.5f' % precision_avg)
print('recall_avg: %0.5f' % recall_avg)
print('f1_score_avg: %0.5f' % f1_score_avg)
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

Iteration 1, loss = 1.10354964
Validation score: 0.587524
Iteration 2, loss = 0.96335374
Validation score: 0.593905
Iteration 3, loss = 0.93080368
Validation score: 0.595524
Iteration 4, loss = 0.90278592
Validation score: 0.592190
Iteration 5, loss = 0.87696316
Validation score: 0.587333
Iteration 6, loss = 0.85306272
Validation score: 0.580190
Iteration 7, loss = 0.83347039
Validation score: 0.575238
Iteration 8, loss = 0.81649123
Validation score: 0.569429
Iteration 9, loss = 0.80104054
Validation score: 0.564381
Iteration 10, loss = 0.78759077
Validation score: 0.559524
Iteration 11, loss = 0.77646669
Validation score: 0.564476
Iteration 12, loss = 0.76587291
Validation score: 0.559143
Validation score did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.
done in 64.282s.
accuracy：0.59400
precision_avg: 0.56406
recall_avg: 0.59400
f1_score_avg: 0.56653
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


In [26]:
clf = MLPClassifier(
                        hidden_layer_sizes = (32,),
                        activation = 'relu',
                        solver = 'adam',
                        alpha = 0.001,
                        batch_size = 128,
                        learning_rate = 'adaptive',
                        learning_rate_init = 0.001,
                        max_iter = 10000,
                        shuffle=True,
                        random_state = 2020,
                        tol = 0.01,
                        verbose = True,
                        early_stopping = True,
                        validation_fraction = 0.1,
                        n_iter_no_change = 10
                    )

t0 = time()
clf.fit(train_x, train_y)
print("done in %0.3fs." % (time() - t0))
predict_y = clf.predict(valid_x)
# predict_prob_y = clf.predict_proba(valid_x)
# 评价指标值
accuracy, precision_avg, recall_avg, f1_score_avg = eval_p_r_f1(valid_y, predict_y)

print('accuracy：%0.5f' % accuracy)
print('precision_avg: %0.5f' % precision_avg)
print('recall_avg: %0.5f' % recall_avg)
print('f1_score_avg: %0.5f' % f1_score_avg)
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

Iteration 1, loss = 0.95341149
Validation score: 0.675238
Iteration 2, loss = 0.81123457
Validation score: 0.684095
Iteration 3, loss = 0.77914034
Validation score: 0.679238
Iteration 4, loss = 0.75176375
Validation score: 0.680190
Iteration 5, loss = 0.72618962
Validation score: 0.673238
Iteration 6, loss = 0.70425256
Validation score: 0.667429
Iteration 7, loss = 0.68460786
Validation score: 0.661143
Iteration 8, loss = 0.66749715
Validation score: 0.652762
Iteration 9, loss = 0.65330401
Validation score: 0.658667
Iteration 10, loss = 0.63999857
Validation score: 0.653429
Iteration 11, loss = 0.62779777
Validation score: 0.650000
Iteration 12, loss = 0.61790047
Validation score: 0.649905
Validation score did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.
done in 70.249s.
accuracy：0.68907
precision_avg: 0.64075
recall_avg: 0.68907
f1_score_avg: 0.64657
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


In [34]:
clf = MLPClassifier(
                        hidden_layer_sizes = (32,),
                        activation = 'relu',
                        solver = 'adam',
                        alpha = 0.001,
                        batch_size = 128,
                        learning_rate = 'adaptive',
                        learning_rate_init = 0.001,
                        max_iter = 10000,
                        shuffle=True,
                        random_state = 2020,
                        tol = 0.01,
                        verbose = True,
                        early_stopping = True,
                        validation_fraction = 0.1,
                        n_iter_no_change = 10
                    )

t0 = time()
clf.fit(train_x, train_y)
print("done in %0.3fs." % (time() - t0))
predict_y = clf.predict(valid_x)
# predict_prob_y = clf.predict_proba(valid_x)
# 评价指标值
accuracy, precision_avg, recall_avg, f1_score_avg = eval_p_r_f1(valid_y, predict_y)

print('accuracy：%0.5f' % accuracy)
print('precision_avg: %0.5f' % precision_avg)
print('recall_avg: %0.5f' % recall_avg)
print('f1_score_avg: %0.5f' % f1_score_avg)
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

Iteration 1, loss = 0.79921953
Validation score: 0.695333
Iteration 2, loss = 0.69819248
Validation score: 0.700571
Iteration 3, loss = 0.66713951
Validation score: 0.699714
Iteration 4, loss = 0.64094907
Validation score: 0.693429
Iteration 5, loss = 0.61645216
Validation score: 0.686762
Iteration 6, loss = 0.59464975
Validation score: 0.680667
Iteration 7, loss = 0.57491549
Validation score: 0.676476
Iteration 8, loss = 0.55722720
Validation score: 0.669238
Iteration 9, loss = 0.54261450
Validation score: 0.666952
Iteration 10, loss = 0.52977499
Validation score: 0.670857
Iteration 11, loss = 0.51733043
Validation score: 0.672286
Iteration 12, loss = 0.50622859
Validation score: 0.666762
Validation score did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.
done in 66.807s.
accuracy：0.69313
precision_avg: 0.64821
recall_avg: 0.69313
f1_score_avg: 0.65135
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
