## Introduction

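In this project I will use Yelp review data (star rating plus review text) to train binary sentiment classifiers: a review counts as positive when it has more than 3 stars. I first build an LSTM model, then add a 1D convolutional layer to speed up training, and finally experiment briefly with Word2Vec embeddings.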

## Import Necessary libraries

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
import pandas as pd
import numpy as np

Using TensorFlow backend.


### Get data

In [2]:
# Load the pipe-delimited review dump. Column names are supplied explicitly,
# so the first line of the file is read as data, not as a header.
# error_bad_lines=False makes pandas skip rows it cannot parse instead of raising.
# NOTE(review): newer pandas deprecates `error_bad_lines` in favour of
# `on_bad_lines='skip'` -- switch when the environment is upgraded.
df = pd.read_csv(
    'reduced_review.csv',
    sep='|',
    names=['stars', 'text'],
    error_bad_lines=False,
)

In [3]:
# Keep only rows that carry a usable star rating.
# Step 1: drop rows where either column is missing.
df = df.dropna()
# Step 2: keep rows whose rating consists of numeric characters only.
# Casting to str first means a rating the CSV parser already converted to a
# number no longer raises AttributeError (ints have no `.isnumeric()`).
# An empty string is never numeric, so the former separate `!= ""` pass over
# the whole frame was a no-op and has been dropped.
df = df[df.stars.astype(str).str.isnumeric()]

In [4]:
# Discard reviews whose text is the empty string.
df = df.loc[df['text'] != ""]

In [5]:
# Summary of the cleaned frame (count / unique / top / freq per column),
# used here as a sanity check on the number of rows and distinct ratings.
df.describe()

Unnamed: 0,stars,text
count,1673870,1673870
unique,5,1673452
top,5,Good stuff
freq,709732,6


In [6]:
# Peek at the first five remaining rows; gaps in the index are rows that
# were removed by the cleaning steps above.
df.head()

Unnamed: 0,stars,text
0,5,The minute I realized that Conflict was a bloc...
2,5,I love Conflict Kitchen. The food is fantasti...
3,4,Holy moly! I'm addicted!\n\nI first heard of C...
4,4,"Had some great Persian food, though it was mor..."
5,4,Yummy food. Good prices. Encourages me to try ...


In [7]:
# Binary sentiment target: 1 for reviews rated above 3 stars, 0 otherwise.
# Ratings are converted with int() before the comparison.
labels = df['stars'].map(lambda star_rating: int(int(star_rating) > 3))

In [8]:
# Fit a tokenizer with no vocabulary cap on every review and encode each
# review as a list of integer word ids.
# NOTE(review): neither `t` nor `encoded_docs` is referenced by any later
# cell visible in this notebook, and the next cell repeats the same work with
# a capped vocabulary -- confirm whether this cell is still needed.
t = Tokenizer()
t.fit_on_texts(df.text)
encoded_docs = t.texts_to_sequences(df.text)

In [9]:
# Build the model input.
# The tokenizer is capped at num_words=20000, which must agree with the
# Embedding layers' vocabulary size further down.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df.text)

# Each review becomes a list of word ids ...
sequences = tokenizer.texts_to_sequences(df.text)

# ... which is then padded or truncated to exactly 50 ids so that all rows
# have the same length (50 must agree with `input_length` of the models).
data = pad_sequences(sequences, maxlen=50)

In [10]:
# Show the encoded first review: 50 integer word ids, with zeros at the
# front where the review was shorter than 50 tokens.
print(data[0])

[    0     0     1   747     3  1513    13 12870     6     4  2348   250
   359    12   246    65   132     6     3  1050   320   401   196   126
    48   120    74   157    37    21   654     7 12870   574    55    42
     4   217   268    64  1086    11   355    37     7     1    28  3580
     3  1503]


## Build LSTM model

In [13]:
# Baseline sentiment classifier:
#   word ids -> 100-dimensional embeddings -> LSTM(100) -> sigmoid probability.
# The vocabulary size (20000) and sequence length (50) mirror the values
# used when `data` was built with the tokenizer and pad_sequences.
model = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=50),
    LSTM(units=100, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid'),
])

# Single sigmoid output with 0/1 labels -> binary cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Train the LSTM for 3 epochs, holding out 40% of the rows for validation.
# NOTE(review): Keras takes the validation rows from the END of the arrays
# before shuffling, so any ordering in the source file carries into the split.
model.fit(
    x=data,
    y=np.array(labels),
    epochs=3,
    validation_split=0.4,
)

Train on 1004322 samples, validate on 669548 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


NameError: name 'padded_docs' is not defined

## Adding 1D CNN layer
Our LSTM model worked well enough, but it took a very long time to train for 3 epochs. One way to speed up training is to change the network architecture and add a “convolutional” layer. Convolutional Neural Networks (CNNs) come from image processing. They pass a “filter” over the data and calculate a higher-level representation. They have been shown to work surprisingly well for text, even though they have none of the sequence-processing ability of LSTMs. In the model below, the convolution and max-pooling layers sit in front of the LSTM and shorten the sequence it has to process (from 50 time steps down to 11), which is where the speed-up comes from.

In [16]:
# CNN + LSTM classifier. The convolution and pooling layers sit in front of
# the LSTM so that the recurrent layer sees a shorter sequence:
#   50 ids -> embeddings -> dropout -> Conv1D(64 filters, width 5)
#          -> max-pool(4) -> LSTM(100) -> sigmoid probability.
model_conv = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=50),
    Dropout(0.2),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(units=100),
    Dense(units=1, activation='sigmoid'),
])

# Same loss / optimizer / metric as the plain LSTM so results are comparable.
model_conv.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train the CNN + LSTM with the same settings as the baseline
# (3 epochs, 40% of the rows held out for validation).
model_conv.fit(
    x=data,
    y=np.array(labels),
    epochs=3,
    validation_split=0.4,
)

Train on 1004322 samples, validate on 669548 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17b50ceb8>

In [28]:
# Wrap the padded word-id matrix and the 0/1 labels in DataFrames for export.
# `labels` goes through np.array first, which discards its pandas index, so
# both frames get a fresh default index.
df_save = pd.DataFrame(data=data)
df_label = pd.DataFrame(data=np.array(labels))

In [29]:
# Put the label column to the right of the word-id columns.
# NOTE(review): both frames use default integer column labels, so the
# combined frame ends up with two columns labelled 0 (first word id and the
# label) -- consumers of the CSV should address the label by position.
result = pd.concat(
    objs=[df_save, df_label],
    axis=1,
)

In [31]:
# Persist features + label to disk; the row index is not written.
result.to_csv(
    path_or_buf='train_dense_word_vectors.csv',
    index=False,
)

### Word2Vec

In [None]:
from gensim.models import Word2Vec

In [None]:
# Load the pre-trained Google News vectors from their binary word2vec file.
# NOTE(review): this rebinds `model`, which earlier in the notebook refers to
# the Keras LSTM model. `Word2Vec.load_word2vec_format(..., norm_only=...)`
# is an old gensim entry point -- confirm it exists in the installed version.
model = Word2Vec.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    norm_only=True,
)

# Look up the embedding stored for a single word.
dog = model['dog']

# Classic analogy query: words close to "king" + "woman" - "man".
print(model.most_similar(negative=['man'], positive=['woman', 'king']))

# Ask which of the four words fits least well with the others.
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

# Similarity score between two words.
print(model.similarity('woman', 'man'))

In [None]:
# Train a tiny Word2Vec model from scratch on a three-sentence toy corpus.
# The original cell used typographic quotes (a SyntaxError in Python); the
# string literals below use plain ASCII quotes.
sentence = [['Neeraj', 'Boy'], ['Sarwan', 'is'], ['good', 'boy']]

# Only the `Word2Vec` class is imported in this notebook (the bare `gensim`
# module name is not), so the class is called directly.
# NOTE(review): `size` is the embedding-dimension keyword of older gensim
# releases; gensim 4 renamed it to `vector_size`.
model = Word2Vec(sentence, min_count=1, size=300, workers=4)

# The vocabulary contains only the six toy-corpus tokens, so compare two of
# those; querying words outside the corpus (e.g. 'woman'/'man') would fail
# because they have no vector in this model.
print(model.similarity('Boy', 'boy'))