# Binary Classification with LSTM+Word2Vec


In the previous LSTM model the embeddings were trained together with the classifier, and the model overfit during training. I therefore wrote this LSTM_Wordembedding model, which uses word2vec word embeddings, to test whether the model performs better when it is trained on word-embedding features. Due to time constraints, I did binary classification instead of multi-class classification: reviews with 1, 2, or 3 stars are labelled negative and reviews with 4 or 5 stars are labelled positive, and the dataset is balanced by extracting 4000 reviews from each class (positive/negative). The resulting accuracy is around 80%, and the model does not overfit. Thus, the LSTM_Wordembedding model performs better than the previous LSTM model, which is what we expected.

In [None]:
# Mount Google Drive in the Colab runtime; the dataset is read from /content/drive below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline mode, which displays graphs below the corresponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# BeautifulSoup library
from bs4 import BeautifulSoup 

import re # regex

#model_selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix

#preprocessing scikit
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
 
#stop-words
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,CuDNNLSTM,LSTM
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence

#gensim w2v
#word2vec
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw review data from Drive, shuffle all rows (frac=1), and save the
# shuffled copy to the working directory without the index column.
df = pd.read_csv('/content/drive/MyDrive/LSTM_Wordembedding/data.csv')  
shuffled_df = df.sample(frac=1)
shuffled_df.to_csv("shuffled_data.csv", index=False)

In [None]:
# Reload the shuffled copy and discard the id / vote / date columns in one step.
drop_columns = ['review_id','user_id','business_id','useful','funny','cool','date']
df = pd.read_csv('shuffled_data.csv').drop(columns=drop_columns)
df.head()

Unnamed: 0,stars,text
0,4,Italian food is one of my favorite types of fo...
1,3,"Met a friend for dinner there about 5pm, based..."
2,3,The second time I went here wasn't nearly a go...
3,5,Always worth the wait. Filling this review out...
4,4,These are the best donuts around! A lot of var...


In [None]:
def mark_sentiment(stars):
    """Map a star rating to a binary sentiment label.

    Ratings of 3 stars or fewer are negative (0); anything higher is
    positive (1).
    """
    return 0 if stars <= 3 else 1

In [None]:
# Derive the binary label from the star rating: 0 for stars <= 3, 1 otherwise (see mark_sentiment).
df['sentiment']=df['stars'].apply(mark_sentiment)

In [None]:
# Remove the raw star rating in place now that the sentiment column exists.
df.drop(['stars'],axis=1,inplace=True)

In [None]:
# Preview the first rows after replacing stars with the sentiment label.
df.head()

Unnamed: 0,text,sentiment
0,Italian food is one of my favorite types of fo...,1
1,"Met a friend for dinner there about 5pm, based...",0
2,The second time I went here wasn't nearly a go...,0
3,Always worth the wait. Filling this review out...,1
4,These are the best donuts around! A lot of var...,1


In [None]:
# Count the reviews per sentiment class to see how unbalanced the data is.
df['sentiment'].value_counts()

0    6000
1    4000
Name: sentiment, dtype: int64

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _cleaning_resources():
    """Build the lemmatizer and the English stop-word set once and reuse them.

    clean_reviews is called for every sentence and every review, so
    re-reading the stop-word list on each call is wasted work.
    """
    return WordNetLemmatizer(), frozenset(stopwords.words("english"))


# function to clean and pre-process the text.
def clean_reviews(review):
    """Return `review` as a cleaned, space-separated string of lemmas.

    Steps, in order:
      1. strip HTML tags with BeautifulSoup (lxml parser);
      2. replace every character that is not an ASCII letter with a space;
      3. lower-case the text and split it on whitespace;
      4. drop English stop words and lemmatize the remaining tokens.

    Parameters
    ----------
    review : str
        Raw review (or sentence) text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; empty if nothing survives.
    """
    lemmatizer, stop_words = _cleaning_resources()

    # 1. Removing html tags
    review_text = BeautifulSoup(review, "lxml").get_text()

    # 2. Retaining only alphabets.
    review_text = re.sub(r"[^a-zA-Z]", " ", review_text)

    # 3. Converting to lower case and splitting
    word_tokens = review_text.lower().split()

    # 4. Remove stopwords, then lemmatize what is left
    return " ".join(
        lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words
    )

In [None]:
# Balance the classes: keep at most the first 4000 rows of each sentiment.
pos_df=df[df['sentiment']==1].head(4000)
neg_df=df[df['sentiment']==0].head(4000)

In [None]:
# Preview the positive sample.
pos_df.head()

Unnamed: 0,text,sentiment
0,Italian food is one of my favorite types of fo...,1
3,Always worth the wait. Filling this review out...,1
4,These are the best donuts around! A lot of var...,1
5,It was pretty crowded when we went but it was ...,1
6,Magical vegan heaven. My wife and I stopped he...,1


In [None]:
# Preview the negative sample.
neg_df.head()

Unnamed: 0,text,sentiment
1,"Met a friend for dinner there about 5pm, based...",0
2,The second time I went here wasn't nearly a go...,0
7,There was a time when all i wanted to do was h...,0
8,"Delivery was on time, but the pizza missed th...",0
9,Nice location and nice looking restaurant. Ou...,0


In [None]:
# combining the two per-class samples into one frame with a fresh 0..n-1 index
df=pd.concat([pos_df,neg_df],ignore_index=True)
print(df.shape)
df.head()

(8000, 2)


Unnamed: 0,text,sentiment
0,Italian food is one of my favorite types of fo...,1
1,Always worth the wait. Filling this review out...,1
2,These are the best donuts around! A lot of var...,1
3,It was pretty crowded when we went but it was ...,1
4,Magical vegan heaven. My wife and I stopped he...,1


In [None]:
# shuffling rows so the two classes are interleaved (concat placed all positives first)
# NOTE(review): no random_state is passed, so the row order differs on every run
df = df.sample(frac=1).reset_index(drop=True)
print(df.shape) 
df.head()

(8000, 2)


Unnamed: 0,text,sentiment
0,It has been quite awhile since I have been to ...,1
1,I enjoyed a wonderful dinner here with a frien...,1
2,Went here for lunch. Ordered the pork rice bow...,0
3,I was in town on business and feeling on the e...,0
4,My date and I came here for dessert only and w...,1


In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize,sent_tokenize
# Punkt sentence splitter: Word2Vec is trained on sentences, not whole reviews.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences=[]   # one list of cleaned tokens per sentence
# Running count of raw sentences. Previously named `sum`, which shadowed the
# builtin and would break any later call to sum(...) in the notebook.
total_sentences=0
for review in df['text']:
    sents=tokenizer.tokenize(review.strip())
    total_sentences+=len(sents)
    # clean each sentence, then split the cleaned string back into tokens
    sentences.extend(clean_reviews(sent).split() for sent in sents)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# print the first five tokenised sentences as a sanity check on the cleaning step
for te in sentences[:5]:
    print(te,"\n")

['quite', 'awhile', 'since', 'silver', 'peak', 'pleasantly', 'surprised'] 

['group', 'u', 'went', 'lunch', 'movie'] 

['able', 'get', 'seat', 'patio'] 

['waitress', 'attentive', 'even', 'coming', 'back', 'one', 'point', 'tell', 'u', 'food', 'way'] 

['turkey', 'sandwich', 'great', 'flavor'] 



In [None]:
import gensim
# Word2Vec over the tokenised sentences: 300-dimensional vectors, context window of 10,
# and min_count=1 so that no word is dropped for being rare.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0 (later renamed
# `vector_size`) -- confirm against the installed gensim version.
w2v_model=gensim.models.Word2Vec(sentences=sentences,size=300,window=10,min_count=1)

In [None]:
# Run 10 training epochs over the same sentences.
# NOTE(review): the constructor was already given `sentences`, which presumably trained
# the model once -- confirm that this additional training is intended.
w2v_model.train(sentences,epochs=10,total_examples=len(sentences))



(3780574, 4053010)

In [None]:
# show the learned embedding vector of one word as a sanity check
w2v_model.wv.get_vector('food')

array([-1.2519074e-02,  3.0157462e-02, -2.4161012e-01, -4.2597109e-01,
       -8.0857575e-01,  1.0164573e+00, -1.3604687e-01, -2.9193634e-01,
        6.5294519e-02,  4.9009079e-01, -2.3208708e-02, -6.0725611e-01,
        1.9603798e-01,  2.8376508e-01, -4.5399731e-01,  2.1580699e-01,
       -5.0581926e-01,  2.9682201e-01,  8.6455965e-01, -3.1399676e-01,
       -1.0142533e+00, -5.2604433e-02, -6.0910004e-01,  4.1506499e-01,
        3.7061450e-01,  7.2691232e-01, -9.3420708e-01,  1.3068402e+00,
        5.9924638e-01, -6.2094164e-01,  5.3530306e-01, -6.2002993e-01,
        1.2970396e+00, -7.1154636e-01,  2.7356005e-01,  6.7948145e-01,
        2.1156073e-01,  1.2184693e+00,  5.8060344e-02, -1.3461924e-01,
       -8.9386225e-01,  1.8773535e-01,  2.4944596e-01,  2.5982824e-01,
       -1.2155795e+00,  4.2707613e-01,  9.9672027e-02,  4.5422956e-01,
        8.2563430e-01,  2.2515388e-01,  1.0961478e-01,  9.3297613e-01,
        7.4269456e-01, -3.2314561e-02, -2.0898415e-01, -2.4610500e-01,
      

In [None]:
# total number of words in the word2vec vocabulary
# NOTE(review): `wv.vocab` is the pre-4.0 gensim attribute -- confirm against the installed version
vocab=w2v_model.wv.vocab
print("The total number of words are : ",len(vocab))

The total number of words are :  17138


In [None]:
# similarity between two words according to the trained vectors
w2v_model.wv.similarity('good','ok')

0.44149297

In [None]:
# Map every word in the word2vec vocabulary to its trained vector.
word_vec_dict={word: w2v_model.wv.get_vector(word) for word in vocab}
print("The number of key-value pairs : ",len(word_vec_dict)) # should come equal to vocab size
  

The number of key-value pairs :  17138


In [None]:
# cleaning reviews: store the cleaned text of each whole review in a new column
df['clean_review']=df['text'].apply(clean_reviews)

In [None]:
# Padding needs a fixed length, so find the token count of the longest cleaned
# review (-1 if there are no reviews at all).
maxi=max((len(rev.split()) for rev in df['clean_review']),default=-1)
print(maxi)

465


In [None]:
# Fit a Keras tokenizer on the cleaned reviews and turn each review into a
# fixed-length sequence of integer word ids.
tok = Tokenizer()
tok.fit_on_texts(df['clean_review'])
encd_rev = tok.texts_to_sequences(df['clean_review'])
max_rev_len=500  # max length of a review after padding
vocab_size = len(tok.word_index) + 1  # total no. of words, +1 so that id 0 (used as the padding value) has a row too
embed_dim=300 # embedding dimension as chosen in the word2vec constructor
# now padding at the end ('post') so every review has a length of max_rev_len ids
pad_rev= pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
pad_rev.shape   # (number of reviews, max_rev_len)

(8000, 500)

In [None]:
# Build the (vocab_size, embed_dim) weight matrix for the Embedding layer:
# row i receives the word2vec vector of the word whose tokenizer id is i.
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for token,row in tok.word_index.items():
  if token in word_vec_dict:  # word was learned by the w2v model
    embed_matrix[row]=word_vec_dict[token]
  # ids without a word2vec vector keep their all-zero row
# spot-check one row
print(embed_matrix[14])

[-0.15329008  0.19205789  0.33048382 -0.29182404 -0.6070174   0.62978333
  0.7098375   0.06378524  0.08979648  0.07521322 -0.47140661  0.36344141
 -0.7640112   0.09809756  0.37205693  0.46921232 -0.18176049 -0.27860013
  0.11301555 -0.00168485  0.03173685  0.51631939 -0.12238616  0.36571446
 -0.02551958 -0.12248346  0.06422092  0.67846024 -0.22855788 -0.01474925
  0.48981869 -0.84647423  0.84645599  0.06675145 -0.38953519  0.00483096
  0.3633531   0.01628862 -0.24026129  0.15120988  0.04586197 -0.24636896
  0.08834372  0.195199   -0.2607317   0.47687247 -0.47219157 -0.06249231
  0.2629613   0.48581231  0.34527403 -1.01799011  0.44886872 -0.17510682
  0.29230276 -0.17993848  0.35733572  0.48357856  0.77734751 -0.13269538
  0.13037434  0.08009788  0.01265465  0.85501468  0.6014961  -0.01155237
  0.35262269 -0.04540459 -0.19311917  0.00468632  0.90316802  0.06126091
  0.1632857  -0.45787105 -0.68557423 -0.21797258 -0.22660227  0.25799742
  0.44598156 -0.57502228  0.57460749 -0.14871721  0

In [None]:
# prepare train and held-out sets first
Y=keras.utils.to_categorical(df['sentiment'])  # one hot target as required by NN
# hold out 10% of the padded reviews; random_state fixes the split
x_train,x_test,y_train,y_test=train_test_split(pad_rev,Y,test_size=0.10,random_state=42)

In [None]:
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout
# Classifier on top of the word2vec vectors: the embedding layer starts from
# embed_matrix, its output is flattened and fed to a small dense head.
# NOTE(review): the CuDNNLSTM layer was disabled by the author (the loss got
# stuck), so despite the notebook title this network has no recurrent layer.
model=Sequential([
    Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_rev_len,embeddings_initializer=Constant(embed_matrix)),
    Flatten(),
    Dense(16,activation='relu'),
    Dropout(0.50),
    # two sigmoid units, one per column of the one-hot target
    # NOTE(review): softmax (or one sigmoid unit with 0/1 labels) is the usual
    # choice for mutually exclusive classes -- confirm the intent.
    Dense(2,activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 300)          5141700   
                                                                 
 flatten_2 (Flatten)         (None, 150000)            0         
                                                                 
 dense_5 (Dense)             (None, 16)                2400016   
                                                                 
 dropout_4 (Dropout)         (None, 16)                0         
                                                                 
 dense_6 (Dense)             (None, 2)                 34        
                                                                 
Total params: 7,541,750
Trainable params: 7,541,750
Non-trainable params: 0
_________________________________________________________________


In [None]:
# compile the model
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=1e-3),loss='binary_crossentropy',metrics=['accuracy'])
# specify batch size and epochs for training
epochs=5
batch_size=64


In [None]:
# fitting the model
# NOTE(review): the split passed as validation_data is the same (x_test, y_test)
# that is later given to model.evaluate, so the final score is not measured on
# data unseen during model selection.
model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc956439400>

In [None]:
# Loss and accuracy (the metric configured in compile) on the held-out split.
model.evaluate(x_test,y_test)



[0.5044351816177368, 0.8125]