In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

## 读取数据

In [48]:
# Folder that holds the dataset CSV files (absolute Windows path).
path = 'D:\\NLP_datasets\\'

# Read the training and test files with identical parser options.
df_train, df_test = (
    pd.read_csv(path + filename, encoding='utf-8', lineterminator='\n')
    for filename in ('train.csv', 'test.csv')
)

In [49]:
## Series.map converts every element; it accepts either a function or a
## dict describing the mapping. Here the text labels become integers.
## Values that are not keys of the dict become NaN, so running this cell a
## second time on the already-converted column would blank it out.
df_train['label'] = df_train['label'].map(dict(Negative=0, Positive=1))
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,0
1,2,ya Allah meri sister Affia ki madad farma,1
2,3,Yeh khud chahta a is umar main shadi krna. ha...,0
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,0
4,5,Good,1


## 做简要的数据分析

In [50]:
## Check for missing values: number of nulls in each column
df_train.isnull().sum()

ID        0
review    0
label     0
dtype: int64

In [51]:
## Check whether the classes are balanced: row count per label value
df_train['label'].value_counts()

1    3361
0    2967
Name: label, dtype: int64

In [52]:
## Convert the DataFrames to NumPy arrays.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# The `.values` attribute yields the same ndarray and is available on both
# old and new pandas versions.
np_train = df_train.values
np_test = df_test.values
np_train[:4]

array([[1, 'Jo bhi ap se tou behtar hoon', 0],
       [2, 'ya Allah meri sister Affia ki madad farma', 1],
       [3, 'Yeh khud chahta a is umar main shadi krna.  had ogi', 0],
       [4, 'Tc ? Apky mun xe exe alfax achy nae lgty 😒💃', 0]],
      dtype=object)

In [7]:
# Row counts of the training and test arrays, one per line.
print(len(np_train), len(np_test), sep='\n')

6328
2712


In [8]:
## Two approaches to data cleaning
# Ordered (pattern, replacement) rules applied by cleaner(). Order matters:
# each rule sees the output of the previous one (e.g. '_' is turned into a
# space before leading/trailing spaces are stripped).
_CLEANER_RULES = (
    (r'\#\.', ''),   # literal "#."
    (r'\n', ''),     # newlines
    (r',', ''),      # commas
    (r'\-', ' '),    # hyphen -> space
    (r'\.', ''),     # dots
    (r'\\', ' '),    # backslash -> space
    (r'\d', ''),     # every digit
    (r'^_.', ''),    # leading underscore plus the character after it
    (r'_', ' '),     # remaining underscores -> space
    (r'^ ', ''),     # one leading space
    (r' $', ''),     # one trailing space
    (r'\?', ''),     # question marks
    (r'é', ''),
    (r'§', ''),
    (r'¦', ''),
    (r'æ', ''),
)


def cleaner(word):
    """Remove punctuation, digits and a few stray symbols from one token.

    The rules in ``_CLEANER_RULES`` are applied in order with ``re.sub`` and
    the result is lower-cased.

    Compared with the previous version, three substitutions that could never
    match were dropped (a backslash pattern applied after all backslashes had
    been replaced, and two digit patterns applied after all digits had been
    removed). One of them was also written as a non-raw string containing
    ``\\d``, which triggers an invalid-escape warning on recent Python
    versions. The returned value is unchanged for every input.

    Args:
        word: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    for pattern, replacement in _CLEANER_RULES:
        word = re.sub(pattern, replacement, word)
    return word.lower()
def hashing(word):
    """Collapse spelling variants of a token into one canonical key.

    A fixed, ordered series of regex rewrites is applied to ``word`` (vowel
    normalisation, collapsing of repeated letters, a few suffix rules, and a
    final ``k`` -> ``q`` substitution). Order matters because each rule
    operates on the output of the previous one.

    Args:
        word: The token to normalise.

    Returns:
        The rewritten token.
    """
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    word = re.sub(r'ee+', r'i', word)
    # re.match anchors at the start: 'ar' is only shortened to 'r' when the
    # token does not itself begin with 'ar'.
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # Bug fix: the original searched the string literal 'word' instead of the
    # variable, so this y -> i rule could never fire. A successful search
    # implies the token is non-empty, so word[-1] is safe here.
    if re.search(r'[rst]y', word) and word[-1] != 'y':
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word

def array_cleaner(array):
    """Run cleaner() over every space-separated token of every sentence.

    Each cleaned token is prefixed with a single space and the pieces are
    concatenated, so every returned string starts with a space.

    Args:
        array: Iterable of sentence strings.

    Returns:
        A list with one cleaned string per input sentence.
    """
    return [
        ''.join(' ' + cleaner(token) for token in sentence.split(' '))
        for sentence in array
    ]

In [9]:
## A custom data-cleaning routine
def deleteUncharacter(texts):
    """Replace every non-letter character with a space and lower-case.

    Args:
        texts: Iterable of strings.

    Returns:
        A list of strings, one per input, in which each character outside
        ``a-z`` / ``A-Z`` has been replaced by a single space and the
        remaining letters are lower-cased.
    """
    non_letter = re.compile('[^a-zA-Z]')
    return [non_letter.sub(' ', text).lower() for text in texts]

In [76]:
# Labels are taken from column 2 of the raw array and stored as int8.
y_train = np.array(np_train[:, 2]).astype('int8')

# Review text is taken from column 1 and cleaned with deleteUncharacter();
# array_cleaner() is the alternative cleaning routine that could be used instead.
x_train = deleteUncharacter(np_train[:, 1])
x_test = deleteUncharacter(np_test[:, 1])

# Number of training rows, used later to split the combined array again.
len_train = len(x_train)

In [31]:
# Number of cleaned training and test texts, one per line.
print(len(x_train), len(x_test), sep='\n')

6328
2712


### keras.preprocessing.text.Tokenizer 编码

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Flatten
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [54]:
# Restrict texts_to_sequences to the most frequent words (word indices
# below 2000). `num_words` is the Keras 2 name of this argument; the former
# `nb_words` spelling is a deprecated Keras 1 alias.
tokenizer = Tokenizer(num_words=2000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,split=' ')



In [77]:
# Learn the vocabulary from the training and test texts together:
# training texts first, test texts appended after them.
x_all = list(x_train)
x_all.extend(x_test)
tokenizer.fit_on_texts(x_all)

### 科学使用Tokenizer
* 用Tokenizer的fit_on_texts学习文本的字典
* word_index就是对应单词和数字的映射关系dic
* texts_to_sequences 用dic将每个string的每个词转成数字
* padding方法补齐
* 若要用LSTM训练，可用keras的embedding层进行一个向量化

In [33]:
# Toy example showing what a Tokenizer produces for two short strings.
somestr = ['haha gue angry','howa ha gua excited naive']
tok = Tokenizer()
# Build the word -> index vocabulary from the sample strings.
tok.fit_on_texts(somestr)
# NOTE: a bare expression in the middle of a cell is not displayed; only the
# last expression of the cell is echoed as output.
tok.word_index
# Replace every word by its integer index.
tok.texts_to_sequences(somestr)

[[1, 2, 3], [4, 5, 6, 7, 8]]

In [None]:
tokenizer.word_index ## display the word -> index dictionary

In [78]:
# Encode every text as a list of word indices.
# The sequences are rebuilt from the cleaned texts (x_train + x_test) instead
# of from x_all itself: the original overwrote x_all with its own encoding,
# so running the cell a second time fed integer lists back into
# texts_to_sequences. On a first run the result is identical.
x_all = tokenizer.texts_to_sequences(x_train + x_test)
x_all[:2]

[[32, 11, 35, 8, 100, 287, 268], [51, 16, 87, 1, 255, 367]]

### keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)
* sequences: 列表的列表，每一个元素是一个序列。
* maxlen: 整数，所有序列的最大长度。
* dtype: 输出序列的类型。 要使用可变长度字符串填充序列，可以使用 object。
* padding: 字符串，'pre' 或 'post' ，在序列的前端补齐还是在后端补齐。
* truncating: 字符串，'pre' 或 'post' ，移除长度大于 maxlen 的序列的值，要么在序列前端截断，要么在后端。
* value: 浮点数，表示用来补齐的值。


In [79]:
## The encoded sequences have different lengths, so pad them to a common length

X_all = pad_sequences(x_all)
# The first len_train rows are the training texts, the rest the test texts.
X_train, X_test = X_all[:len_train], X_all[len_train:]

In [66]:
# Show the first two padded sequences.
X_all[:2]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [67]:
# Shapes of the padded training and test matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(6328, 219)
(2712, 219)


In [68]:
# One-hot encode the integer class labels for the softmax output layer.
y_binary = to_categorical(y_train)

### to_categorical函数
* keras.utils.to_categorical(y, num_classes=None)
* 将类向量（整数）转换为二进制类矩阵（one-hot编码）

### 建模--搭建LSTM训练模型

In [82]:
embed_dim = 128      # size of each word-embedding vector
lstm_out = 256       # number of LSTM units
batch_size = 32
input_length = X_test.shape[1]   # padded sequence length


model = Sequential()
# 2000 is the embedding's vocabulary size (same number as the tokenizer's
# word limit).
# The legacy `dropout=0.2` argument was removed from this call: Keras 2
# ignores it with a warning, so it never had any effect. If input dropout is
# wanted, add a SpatialDropout1D layer right after the embedding.
model.add(Embedding(2000, embed_dim, input_length=input_length))
# Keras 2 argument names: dropout_W -> dropout, dropout_U -> recurrent_dropout.
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
# return_sequences=True yields one output per time step; flatten them into a
# single vector for the classifier.
model.add(Flatten())
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which only added a stray "None" line to the output).
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 226, 128)          256000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 226, 256)          394240    
_________________________________________________________________
flatten_4 (Flatten)          (None, 57856)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 115714    
Total params: 765,954
Trainable params: 765,954
Non-trainable params: 0
_________________________________________________________________
None


## 构建评测指标

In [27]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC after epochs.

    Args:
        validation_data: ``(x_val, y_val)`` pair. The default ``()`` cannot
            be unpacked, so a real pair must be supplied in practice.
        interval: Evaluate only on epochs whose zero-based index is a
            multiple of this value (1 = every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Bug fix: super(Callback, self) started the lookup *after* Callback
        # in the MRO, so Callback.__init__ itself was never executed.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.x_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, log=None):
        """Compute and print ROC-AUC on the stored validation data.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            log: Metrics dict supplied by Keras; not used here. The default
                is None rather than a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.x_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print('\n ROC_AUC - epoch:%d - score:%.6f \n' % (epoch+1, score))

In [80]:
# NOTE: train_test_split(X, y) returns (X_train, X_val, y_train, y_val) in that
# order, so despite what the names suggest:
#   x_train5 -> training features      y_train5 -> validation features
#   x_label5 -> training labels        y_label5 -> validation labels
# 80% of the rows go to training; random_state fixes the shuffle.
x_train5,y_train5,x_label5,y_label5 = train_test_split(X_train,y_binary, train_size=0.8, random_state=234)
# Report ROC-AUC on the held-out rows after every epoch.
RocAuc = RocAucEvaluation(validation_data=(y_train5,y_label5), interval=1)



In [83]:
# Train for one epoch. x_train5 / x_label5 hold the training features / labels
# and y_train5 / y_label5 the validation features / labels, following the
# return order of train_test_split.
hist = model.fit(x_train5, x_label5, batch_size=batch_size, epochs=1, validation_data=(y_train5, y_label5), callbacks=[RocAuc], verbose=2)

Train on 5062 samples, validate on 1266 samples
Epoch 1/1
 - 239s - loss: 0.6223 - acc: 0.6333 - val_loss: 0.5251 - val_acc: 0.7536

 ROC_AUC - epoch:1 - score:0.816133 



In [72]:
# Probability of class 1 for every test row.
# Sequential.predict_proba was only a thin wrapper around predict and has been
# removed from newer Keras releases; predict returns the same softmax output.
y_lstm = model.predict(X_test, batch_size=batch_size)[:, 1]

In [84]:
# Pair every test ID with its predicted class-1 probability.
lstm_output = pd.DataFrame(data={'ID':df_test['ID'],'pred':y_lstm})
# Write the predictions without the index column; quoting=3 is the numeric
# value of csv.QUOTE_NONE (fields are never quoted).
lstm_output.to_csv('D:/NLP_datasets/lstm_new.csv',index=False,quoting=3)

In [85]:
# Show the first seven predicted probabilities.
y_lstm[:7]

array([0.41213256, 0.6298175 , 0.87724465, 0.7674661 , 0.22996518,
       0.85551775, 0.19632514], dtype=float32)