In [1]:
import re
import gensim
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("../input/id_reviews.csv")
print(len(data))
data.head()

364


Unnamed: 0,ID,Month_ago,Score,Reviews
0,72863,10.0,5,Really comfortable.
1,72863,12.0,1,"Horrible management, \nmore people live in the..."
2,72863,12.0,1,After a 5 year lease !!!!!! There's a small ho...
3,72863,12.0,1,0 star if I was able to
4,72863,12.0,4,NIL


In [3]:
d = {'reviews': data["Reviews"], 'rating': data['Score']}
df = pd.DataFrame(data=d)
df.head()

Unnamed: 0,reviews,rating
0,Really comfortable.,5
1,"Horrible management, \nmore people live in the...",1
2,After a 5 year lease !!!!!! There's a small ho...,1
3,0 star if I was able to,1
4,NIL,4


In [4]:
def mark_sentiment(rating):
    if(rating <= 3):
        return 0
    else:
        return 1

df['sentiment'] = df['rating'].apply(mark_sentiment)
df.drop(['rating'], axis = 1, inplace=True)
df.head()

Unnamed: 0,reviews,sentiment
0,Really comfortable.,1
1,"Horrible management, \nmore people live in the...",0
2,After a 5 year lease !!!!!! There's a small ho...,0
3,0 star if I was able to,0
4,NIL,1


In [5]:
import nltk
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition (NER)
from nltk import ne_chunk

In [6]:
def clean_reviews(review_text):
    
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    
    # 3. Converting to lower case and splitting
    word_tokens= review_text.lower().split()
    
    # 4. Remove stopwords
    le=WordNetLemmatizer()
    stop_words= set(stopwords.words("english"))     
    word_tokens= [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    
    cleaned_review=" ".join(word_tokens)
    return cleaned_review

In [7]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = []
sum = 0
for review in df['reviews']:
    sents = tokenizer.tokenize(review.strip())
    sum = len(sents)
    for sent in sents:
        cleaned_sent = clean_reviews(sent)
        sentences.append(cleaned_sent.split())
print(sum)
print(len(sentences))  

3
1656


In [8]:
max_len = 0
for m in sentences:
    if(max_len < len(m)):
        max_len = len(m)
print(max_len)

72


In [9]:
import gensim
word_2_vec_model = gensim.models.Word2Vec(sentences = sentences, size=300,window=10,min_count = 1)

In [10]:
word_2_vec_model.train(sentences,epochs=10,total_examples=len(sentences))

(99841, 111670)

In [11]:
print(sentences[1:2])

[['horrible', 'management', 'people', 'live', 'apartment', 'without', 'management', 'knowing', 'result', 'bearly', 'parking', 'vistors', 'parking', 'front', 'always', 'full', 'extra', 'people', 'living', 'car', 'park', 'visor', 'section', 'every', 'single', 'day']]


In [12]:
vocab=word_2_vec_model.wv.vocab
vocab=list(vocab.keys())
word_vec_dict={}
for word in vocab:
  word_vec_dict[word]=word_2_vec_model.wv.get_vector(word)

In [13]:
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,CuDNNLSTM,LSTM
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [14]:
df['clean_review']=df['reviews'].apply(clean_reviews)
tok = Tokenizer()
tok.fit_on_texts(df['clean_review'])
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(df['clean_review'])

In [15]:
pad_rev= pad_sequences(encd_rev, maxlen=max_len, padding='post')
pad_rev.shape

(364, 72)

In [16]:
embed_matrix=np.zeros(shape=(vocab_size,300))
for word,i in tok.word_index.items():
  embed_vector=word_vec_dict.get(word)
  if embed_vector is not None:  # word is in the vocabulary learned by the w2v model

        embed_matrix[i]=embed_vector

In [17]:
print(df['sentiment'][:5])

0    1
1    0
2    0
3    0
4    1
Name: sentiment, dtype: int64


In [18]:

Y=keras.utils.to_categorical(df['sentiment'])  # one hot target as required by NN.
print(Y[:5])

[[0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]


In [19]:
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [20]:
x_train,x_test,y_train,y_test=train_test_split(pad_rev,Y,test_size=0.20,random_state=42)

In [21]:
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout


model=Sequential()

model.add(Embedding(input_dim=vocab_size,output_dim=300,input_length=max_len,embeddings_initializer=Constant(embed_matrix)))
 
model.add(Flatten())
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.50))
model.add(Dense(2,activation='sigmoid'))

In [22]:
model.compile(optimizer=keras.optimizers.RMSprop(lr=1e-3),loss='binary_crossentropy',metrics=['accuracy'])

In [23]:
epochs=100
batch_size=64
model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(x_test,y_test))

Train on 291 samples, validate on 73 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Ep

<keras.callbacks.History at 0x7f5ed37dae80>

In [24]:
predictions = model.predict(x_test)

In [25]:
(rows,cols) = predictions.shape
op = np.zeros((rows,cols))

for i in range(rows):
    for j in range(cols):
        if(predictions[i,j] < 0.5):
            op[i,j] = 0
        else:
            op[i,j] = 1


In [26]:
print(op[:10])

[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]


In [27]:
print(y_test[:10])

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]


In [28]:
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 1.72625985210889
Test accuracy: 0.6575342469835934


In [29]:
text = tok.sequences_to_texts(x_test[6:7])
print(text)

['happy problem getting solved professionally good management']
