In [1]:
import numpy as np
import pandas as pd 
import jieba 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from collections import Counter
from math import isnan
from sklearn.feature_extraction.text import TfidfTransformer



In [2]:
corpus = [line.strip('\n') for line in open('../input/guo_corpus.txt','r',encoding = 'utf8').readlines()]
labels = [line.strip('\n') for line in open('../input/guo_labels.txt','r',encoding = 'utf8').readlines()]
print('数据量：%d'%len(corpus),'类目数：%d'%len(Counter(labels)))

数据量：20059 类目数：816


In [3]:
txt = pd.DataFrame(columns=['商品描述','标签'])
txt['商品描述'] = pd.Series(corpus)
txt['标签'] = pd.Series(labels)

In [5]:
tfidf = TfidfVectorizer().fit_transform(txt['商品描述'])

fre = CountVectorizer().fit_transform(txt['商品描述'])

lf = LogisticRegression()
X_test,X_train,Y_test,Y_train = train_test_split(fre,txt['标签'],test_size = 0.2)
lf.fit(X_test,Y_test)
y_pre = lf.predict(X_train)
np.mean(y_pre == Y_train)

0.9112662013958126

In [4]:
frame = pd.DataFrame(columns=['商品描述','标签'])
dic = Counter(labels)
for i in dic.keys():
    data = txt.loc[txt['标签'] == i]
    if dic[i] > 200:
        frame = pd.concat([frame,data[:200]],axis = 0)
        continue
    elif dic[i]>=5:
        frame = pd.concat([frame,txt.loc[txt['标签'] == i]],axis = 0)
frame = frame.reset_index(drop= True)

In [5]:
print('数据量：%d'%len(frame),'类目数：%d'%len(Counter(frame['标签'])))

数据量：10235 类目数：283


In [6]:
from gensim.models import word2vec

sentences = word2vec.Text8Corpus('../input/guo_corpus.txt')

#sentences = [i.split() for i in frame['商品描述']]

model = word2vec.Word2Vec(sentences,size = 100,window = 600 ,min_count = 3)



In [7]:
word2vec_matrix = []
for line in range(frame.shape[0]):
    vector = []
    num = 0
    words = frame['商品描述'][line].split()
    for word in words:
        try:
            temp = model[word]
        except:
            continue 
        else:
            vector += list(temp)
            num += 1
    vector = vector + [0]*100*(26 - num)
    word2vec_matrix.append(vector)

  


In [8]:
from sklearn import preprocessing 
le = preprocessing.LabelEncoder()
y_label = le.fit_transform(frame['标签'])

In [9]:
import tensorflow as tf
from tensorflow.contrib import rnn
import random

input_size = 100
timestep_size = 26 
hidden_size = 256
layer_num = 2
class_num = len(Counter(frame['标签']))
learning_rate = 0.001
with tf.name_scope('inputs'):
    keep_prob = tf.placeholder(tf.float32)
    batch_size = tf.placeholder(tf.int32, [], name='batch_size_input')
    _X = tf.placeholder(tf.float32,[None,2600])
    y = tf.placeholder(tf.int64,[None])
    x = tf.reshape(_X,[-1,timestep_size,input_size])

with tf.name_scope('weights'):
    W = tf.Variable(tf.truncated_normal([hidden_size, class_num], stddev=0.1), dtype=tf.float32)
with tf.name_scope('biases'):
    bias = tf.Variable(tf.constant(0.1,shape=[class_num]), dtype=tf.float32)
def attn_cell():
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size)
    with tf.name_scope('lstm_dropout'):
        return tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
enc_cells = []
for i in range(0, layer_num):
    enc_cells.append(attn_cell())
mlstm_cell = rnn.MultiRNNCell(enc_cells,state_is_tuple=True)
init_state = mlstm_cell.zero_state(batch_size,dtype=tf.float32)
outputs, state = tf.nn.dynamic_rnn(mlstm_cell, x, initial_state=init_state, dtype=tf.float32,time_major=False)

In [10]:
h_state = state[-1][1]
logits = tf.matmul(h_state, W) + bias #维度 batch_size * class_num
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits+1e-10,labels = y)
original_cost_function = tf.reduce_mean(loss)

tv = tf.trainable_variables()
regularization_cost = 0.001* tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv ])
cost = original_cost_function + regularization_cost

train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [11]:
with tf.name_scope("accuracy"):
    prediction = tf.argmax(logits,1)
    correct_prediction = tf.equal(prediction,y)
    correct_num=tf.reduce_sum(tf.cast(correct_prediction,tf.float32))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32),name="accuracy")

In [12]:
X_train,X_test,Y_train,Y_test = train_test_split(word2vec_matrix,y_label,test_size = 0.2)

In [13]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    listnum = []
    Iter = 0
    for i in range(20001):
        _batch_size = 1000
        if len(listnum)<_batch_size:
            listnum = list(range(len(X_train)))
            Iter += 1
        temp_x =[]
        temp_y = []
        for _ in range(_batch_size):
            j = random.randint(0,len(listnum)-1) #生成一个包括0，len(listnum)-1之间的随机数
            temp_x.append(X_train[listnum[j]])
            temp_y.append(Y_train[listnum[j]])
            del listnum[j]
        # temp_x 是一个1000*1200维度的向量
        if (i)%500 == 0:
            train_accuracy = sess.run(accuracy, feed_dict={_X:X_test, y: Y_test, keep_prob: 1.0, batch_size:len(X_test)})
            print("Iter %d: step %d, training accuracy %g" % ( Iter,(i+1), train_accuracy))
        sess.run(train_op,feed_dict={_X:temp_x,y:temp_y,keep_prob: 0.6, batch_size: _batch_size})

Iter 1: step 1, training accuracy 0.00097704
Iter 63: step 501, training accuracy 0.850024
Iter 126: step 1001, training accuracy 0.843674
Iter 188: step 1501, training accuracy 0.832926
Iter 251: step 2001, training accuracy 0.860283
Iter 313: step 2501, training accuracy 0.84807
Iter 376: step 3001, training accuracy 0.851001
Iter 438: step 3501, training accuracy 0.868588
Iter 501: step 4001, training accuracy 0.860772
Iter 563: step 4501, training accuracy 0.842208
Iter 626: step 5001, training accuracy 0.852956
Iter 688: step 5501, training accuracy 0.861749
Iter 751: step 6001, training accuracy 0.843674
Iter 813: step 6501, training accuracy 0.852467
Iter 876: step 7001, training accuracy 0.862237
Iter 938: step 7501, training accuracy 0.860283
Iter 1001: step 8001, training accuracy 0.85491
Iter 1063: step 8501, training accuracy 0.860772
Iter 1126: step 9001, training accuracy 0.844651
Iter 1188: step 9501, training accuracy 0.853933


KeyboardInterrupt: 

In [14]:
len(Counter(Y_train))

283

In [15]:
len(Counter(Y_test))

258