## 达观杯2021

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

%cd ../../

E:\兼职\深度之眼\比赛训练营\21年8月-达观-风险事件标签识别\ppts\第一课-开营\codes


### 加载数据集，并切分train/dev

In [2]:
# 加载数据
df_train = pd.read_csv("./datasets/phase_1/splits/fold_0/train.txt")
df_train.columns = ["id", "text", "label"]
df_val = pd.read_csv("./datasets/phase_1/splits/fold_0/dev.txt")
df_val.columns = ["id", "text", "label"]
df_test = pd.read_csv("./datasets/phase_1/splits/fold_0/test.txt")
df_test.columns = ["id", "text", ]

# 构建词表
charset = set()
for text in df_train['text']:
    for char in text.split(" "):
        charset.add(char)
id2char = ['OOV', '，', '。', '！', '？'] + list(charset)
char2id = {id2char[i]: i for i in range(len(id2char))}

# 标签集
id2label = list(df_train['label'].unique())
label2id = {id2label[i]: i for i in range(len(id2label))}

### 定义模型

In [3]:
# 定义模型

from tensorflow.keras.layers import *
from tensorflow.keras.models import *

MAX_LEN = 128
input_layer = Input(shape=(MAX_LEN,))
layer = Embedding(input_dim=len(id2char), output_dim=128)(input_layer)
layer = Bidirectional(LSTM(128, return_sequences=True))(layer)
layer = Flatten()(layer)  # [*, 128, 256] --> [*, 128 * 256]
output_layer = Dense(len(id2label), activation='softmax')(layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 128)          402176    
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 256)          263168    
_________________________________________________________________
flatten (Flatten)            (None, 32768)             0         
_________________________________________________________________
dense (Dense)                (None, 35)                1146915   
Total params: 1,812,259
Trainable params: 1,812,259
Non-trainable params: 0
_________________________________________________________________


### 准备输入数据

对训练集、验证集、测试集进行输入转换，构造模型输入。

In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

X_train, X_val, X_test = [], [], []
y_train = np.zeros((len(df_train), len(id2label)), dtype=np.int8)
y_val = np.zeros((len(df_val), len(id2label)), dtype=np.int8)

for i in range(len(df_train)):
    X_train.append([char2id[char] for char in df_train.loc[i, 'text'].split(" ")])
    y_train[i][label2id[df_train.loc[i, 'label']]] = 1
for i in range(len(df_val)):
    X_val.append([char2id[char] if char in char2id else 0 for char in df_val.loc[i, 'text'].split(" ")])
    y_val[i][label2id[df_val.loc[i, 'label']]] = 1
for i in range(len(df_test)):
    X_test.append([char2id[char] if char in char2id else 0 for char in df_test.loc[i, 'text'].split(" ")])

X_train = pad_sequences(X_train, maxlen=MAX_LEN, padding='post', truncating='post')
X_val = pad_sequences(X_val, maxlen=MAX_LEN, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=MAX_LEN, padding='post', truncating='post')

### 模型训练

In [5]:
model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val), epochs=1, batch_size=4)



<tensorflow.python.keras.callbacks.History at 0x16923977a90>

In [6]:
y_val_pred = model.predict(X_val).argmax(axis=-1)
print(y_val_pred[: 20])
y_val = []
for i in range(len(df_val)):
    y_val.append(label2id[df_val.loc[i, 'label']])
y_val = [int(w) for w in y_val]
print(y_val[: 20])

from sklearn.metrics import classification_report
results = {}
classification_report_dict = classification_report(y_val_pred, y_val, output_dict=True)
for key0, val0 in classification_report_dict.items():
    if isinstance(val0, dict):
        for key1, val1 in val0.items():
            results[key0 + "__" + key1] = val1

    else:
        results[key0] = val0

import json
print(json.dumps(results, indent=2, ensure_ascii=False))

[0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 2 0 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
{
  "0__precision": 0.8443579766536965,
  "0__recall": 0.6977491961414791,
  "0__f1-score": 0.7640845070422536,
  "0__support": 311,
  "1__precision": 0.547945205479452,
  "1__recall": 0.4332129963898917,
  "1__f1-score": 0.4838709677419355,
  "1__support": 277,
  "2__precision": 0.599483204134367,
  "2__recall": 0.3642072213500785,
  "2__f1-score": 0.453125,
  "2__support": 637,
  "3__precision": 0.6,
  "3__recall": 0.6190476190476191,
  "3__f1-score": 0.609375,
  "3__support": 63,
  "4__precision": 0.6,
  "4__recall": 0.6097560975609756,
  "4__f1-score": 0.6048387096774194,
  "4__support": 123,
  "5__precision": 0.35714285714285715,
  "5__recall": 0.39473684210526316,
  "5__f1-score": 0.37500000000000006,
  "5__support": 38,
  "6__precision": 0.23684210526315788,
  "6__recall": 0.4090909090909091,
  "6__f1-score": 0.3,
  "6__support": 22,
  "7__precision": 0.75,
  "7__recall": 0.854

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 输出预测结果

In [7]:
y_pred = model.predict(X_test).argmax(axis=-1)
pred_labels = [id2label[i] for i in y_pred]
pd.DataFrame({"id": df_test['id'], "label": pred_labels}).to_csv("submission.csv", index=False)



# 提交结果：
# 0.36730954652