In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import time
tick = time.time()


<h1> Reading input from CSV into Pandas DataFrame object </h1>

In [3]:
data = pd.read_csv('train.csv')
# Keeping only the neccessary columns
data = data[['text','label']]
data.describe()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,text,label
count,400000,400001
unique,399958,7
top,After reading reviews we thought we would give...,2
freq,2,164201


<h1> Cleaning mislabelled rows </h1>

In [4]:
data = data[data.label !='My husband and I had not purchased a home before and we definitely needed some hand holding. They were patient and professional. We got our dream home and the entire experience was awesome! Thank you so much ladies for a job well done!']
data.label[data.label == '1'] = 1
data.label[data.label == '0']= 0
data.label[data.label == '2']= 2
data.describe()

Unnamed: 0,text,label
count,400000,400000
unique,399958,3
top,After reading reviews we thought we would give...,2
freq,2,250535


<h1> Text Pre-Processing and Tokenizing </h1>

<li> We remove capilization and retain only letters and numbers from the data. </li>
<br> <li>
We define the number of max features as 2000 and use Tokenizer to vectorize and convert text into Sequences so the Network can deal with it as input. </li>
<br> <li>
Then we use padding to make sure the input is of the same size into the model</li>

In [5]:
#data = data[data.label != 0]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

#print(data[ data['label'] == 1].size)
#print(data[ data['label'] == 2].size)
#print(data[ data['label'] == 0].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)


<h3> embed_dim :</h3> The embedding layer encodes the input sequence into a sequence of dense vectors of dimension embed_dim.
<br>
<h3> lstm_out : </h3>The LSTM transforms the vector sequence into a single vector of size lstm_out, containing information about the entire sequence.
<br><br>
The Sequential model is a linear stack of layers.<br>
We add an Embedding layer, layer for Dropout, define our LSTM model and a Softmax layer for our final output.
The first two LSTMs return their full output sequences, but the last one only returns the last step in its output sequence, thus dropping the temporal dimension (i.e. converting the input sequence into a single vector).
<br>
We then use categorical Cross entropy as our loss function and Our optimizer is Adams </h3>

In [6]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3,activation='softmax',kernel_initializer='glorot_uniform',bias_initializer='zeros'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 943, 128)          256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 943, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
Y = pd.get_dummies(data['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(268000, 943) (268000, 3)
(132000, 943) (132000, 3)


In [8]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 1)

Epoch 1/1
  2912/268000 [..............................] - ETA: 2:19:26 - loss: 0.8512 - acc: 0.6429

KeyboardInterrupt: 

In [None]:
print('total time for training is %f seconds' % (tick - time.time()))

In [None]:
validation_size = 80000
tick2=time.time()
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

In [None]:
print('total time for evaluating is %f seconds' % (tick2 - time.time()))

In [None]:
test_data = pd.read_csv('test_data.csv')

In [None]:
final=[]
for i in range(len(test_data.index)):
    twt=[test_data.text[i]]
    twt=tokenizer.texts_to_sequences(twt)
    twt=pad_sequences(twt,maxlen=943,dtype='int32',value=0)
    sentiment=model.predict(twt,batch_size=1,verbose=1)[0]
    final.append(np.argmax(sentiment))
    #print(i)

import pandas as pd 
df = pd.DataFrame(final)
df.index.name='ID'
df.index = np.arange(1, len(df) + 1)
df.columns.name='label'
df.to_csv("Output.csv",header='label')

In [None]:
print('total time for running program is %f seconds' % (tick - time.time()))