In [1]:
from keras.optimizers import SGD, Adam, Nadam, RMSprop
from keras.models import Sequential,Model,load_model
from keras.layers import Embedding,Conv1D,MaxPooling1D
from keras.layers.core import Dense, Activation,Dropout ,Flatten
from keras.layers.recurrent import LSTM
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence,one_hot,Tokenizer
from keras.constraints import maxnorm
from keras.callbacks import ModelCheckpoint,TensorBoard, ReduceLROnPlateau,EarlyStopping
from keras.applications import Xception
from keras import regularizers
from keras import backend as K
import keras
import numpy as np
import pandas as pd
import cv2
import os
import glob
import math

Using TensorFlow backend.


In [2]:
seed=7
np.random.seed(seed)

In [3]:
def load_TrainingData(path):
    D=pd.read_csv(path,sep='\t',header=0)
    feature_names=np.array(list(D.columns.values))
    X_train=np.array(list(D['Phrase']))
    Y_train=np.array(list(D['Sentiment']))
    return X_train,Y_train,feature_names

def load_TestingData(path):     #loads data , caluclate Mean & subtract it data, gets the COV. Matrix.
    D = pd.read_csv(path, sep='\t', header=0)
    X_test=np.array(list(D['Phrase']))
    X_test_PhraseID=np.array(list(D['PhraseId']))
    return  X_test,X_test_PhraseID

def shuffle_2(a, b): # Shuffles 2 arrays with the same order
    s = np.arange(a.shape[0])
    np.random.shuffle(s)
    return a[s], b[s]

In [4]:
X_train,Y_train,feature_names=load_TrainingData('train.tsv')
X_test,X_test_PhraseID = load_TestingData('test.tsv')
print ('============================== Training data shapes ==============================')
print ('X_train.shape is ', X_train.shape)
print ('Y_train.shape is ',Y_train.shape)

X_train.shape is  (156060,)
Y_train.shape is  (156060,)


In [6]:
Tokenizer = Tokenizer()
Tokenizer.fit_on_texts(np.concatenate((X_train, X_test), axis=0))
# Tokenizer.fit_on_texts(X_train)
Tokenizer_vocab_size = len(Tokenizer.word_index) + 1
print("Vocab size",Tokenizer_vocab_size)

Vocab size 17781


In [7]:
#masking
num_test = 32000
mask = range(num_test)

Y_Val = Y_train[:num_test]
Y_Val2 = Y_train[:num_test]
X_Val = X_train[:num_test]


X_train = X_train[num_test:]
Y_train = Y_train[num_test:]

In [8]:
maxWordCount= 60
maxDictionary_size=Tokenizer_vocab_size

In [9]:
encoded_words = Tokenizer.texts_to_sequences(X_train)
encoded_words2 = Tokenizer.texts_to_sequences(X_Val)
encoded_words3 = Tokenizer.texts_to_sequences(X_test)

In [10]:
#padding all text to same size
X_Train_encodedPadded_words = sequence.pad_sequences(encoded_words, maxlen=maxWordCount)
X_Val_encodedPadded_words = sequence.pad_sequences(encoded_words2, maxlen=maxWordCount)
X_test_encodedPadded_words = sequence.pad_sequences(encoded_words3, maxlen=maxWordCount)

# One Hot Encoding
Y_train = keras.utils.to_categorical(Y_train, 5)
Y_Val   = keras.utils.to_categorical(Y_Val, 5)

In [11]:
#shuffling the traing Set
shuffle_2(X_Train_encodedPadded_words,Y_train)

(array([[   0,    0,    0, ...,  184, 8871,   55],
        [   0,    0,    0, ..., 3758,    3, 2537],
        [   0,    0,    0, ...,    1,   96, 1300],
        ..., 
        [   0,    0,    0, ...,   32,    1,  276],
        [   0,    0,    0, ..., 6577,    4, 5931],
        [   0,    0,    0, ...,    0,    0, 7287]], dtype=int32),
 array([[ 0.,  1.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.],
        ..., 
        [ 0.,  1.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.,  0.]]))

In [12]:
print('features are',feature_names)
print("-------After extracting a validation set of "+str(num_test)+"--------")
print ('============================== Training data shapes ==============================')
print ('X_train.shape is ', X_train.shape)
print ('Y_train.shape is ',Y_train.shape)
print ('============================== Validation data shapes ==============================')
print ('Y_Val.shape is ',Y_Val.shape)
print ('X_Val.shape is ', X_Val.shape)
print ('============================== Test data shape ==============================')
print ('X_test.shape is ', X_test.shape)

features are ['PhraseId' 'SentenceId' 'Phrase' 'Sentiment']
-------After extracting a validation set of 32000--------
X_train.shape is  (124060,)
Y_train.shape is  (124060, 5)
Y_Val.shape is  (32000, 5)
X_Val.shape is  (32000,)
X_test.shape is  (66292,)


In [13]:
print ('============================== After padding all text to same size of '+ str(maxWordCount)+' ==============================')
print ('============================== Training data shapes ==============================')
print ('X_Train_encodedPadded_words.shape is ', X_Train_encodedPadded_words.shape)
print ('Y_train.shape is ',Y_train.shape)
print ('============================== Validation data shapes ==============================')
print ('X_Val_encodedPadded_words.shape is ', X_Val_encodedPadded_words.shape)
print ('Y_Val.shape is ',Y_Val.shape)
print ('============================== Test data shape ==============================')
print ('X_test_encodedPadded_words.shape is ', X_test_encodedPadded_words.shape)

X_Train_encodedPadded_words.shape is  (124060, 60)
Y_train.shape is  (124060, 5)
X_Val_encodedPadded_words.shape is  (32000, 60)
Y_Val.shape is  (32000, 5)
X_test_encodedPadded_words.shape is  (66292, 60)


In [14]:
# model
model=Sequential()
model.add(Embedding(maxDictionary_size,210,input_length=maxWordCount))
model.add(LSTM(128))
model.add(Dropout(0.6))
model.add(Dense(128, activation='relu',W_constraint=maxnorm(1)))
model.add(Dense(5, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 32)            568992    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 645       
Total params: 668,581
Trainable params: 668,581
Non-trainable params: 0
_________________________________________________________________


  


In [15]:
learning_rate=0.0001
epochs = 2
batch_size = 256 #32
sgd = SGD(lr=learning_rate, nesterov=True, momentum=0.7, decay=1e-4)
Nadam = keras.optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
model.compile(loss='categorical_crossentropy', optimizer=Nadam, metrics=['accuracy'])


In [16]:
tensorboard=keras.callbacks.TensorBoard(log_dir='./logs/log_25',histogram_freq=0,write_graph=True,write_images=False) #该回调函数是一个可视化的展示器
checkpointer=ModelCheckpoint(filepath='./weights/weights_25.hdf5',verbose=1,save_best_only=True,monitor='val_loss') #该回调函数将在每个epoch后保存模型到filepath
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=0, verbose=1, mode='auto', cooldown=0, min_lr=1e-6) #当评价指标不在提升时，减少学习率
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=6, verbose=1) #当监测值不再改善时，该回调函数将中止训练

In [17]:
print ("=============================== Training =========================================")
history  = model.fit(X_Train_encodedPadded_words, Y_train, epochs = epochs, batch_size=batch_size, verbose=1,
                    validation_data=(X_Val_encodedPadded_words, Y_Val), callbacks=[tensorboard, reduce_lr,checkpointer,earlyStopping])

Train on 124060 samples, validate on 32000 samples
Epoch 1/2
Epoch 2/2


In [18]:
print ("=============================== Score =========================================")
scores = model.evaluate(X_Val_encodedPadded_words, Y_Val, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 64.03%


In [19]:
print ("=============================== Predicting =========================================")
f = open('Submission.csv', 'w')
f.write('PhraseId,Sentiment\n')
predicted_classes = model.predict_classes(X_test_encodedPadded_words, batch_size=batch_size, verbose=1)
for i in range(0,X_test_PhraseID.shape[0]):
    # pred =np.argmax(predictions[i])
    f.write(str(X_test_PhraseID[i])+","+str(predicted_classes[i])+'\n')
    # print predictions[i],"=>",pred

f.close()



In [20]:
# tensorboard --logdir=./logs/log_25