## Categorical Embeddings on new features.
### In this Section
1. I have created features like pos_or_neg. i.e. In a review which words are occuring more times. Based on this I have classified reviews as p, n or neutral.
2. Derived polarity from TextBlob and added it as a numerical feature.
3. Embedded all the layers after preprocessing and applied 1D CNN and LSTM on the embedded set.

Note
- This file requires glove.6B.50d.txt to run.

In [1]:
import pandas as pd
import numpy as np

import nltk
import re
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import sent_tokenize, word_tokenize
from wordcloud import WordCloud

pd.set_option('display.max_colwidth', -1)

In [2]:
data1 = pd.read_csv('alreviews_df_3000.csv') ## Allreviews data with more positive and less negative reviews
valid2 = pd.read_csv('alreviews_df_1100_validation.csv') ## Extra validation set to perform second validation

In [3]:
data1.drop(['createDate','displayImageUrl','displayName','hasProfanity','hasSpoilers','isSuperReviewer','isVerified','rating','timeFromCreation','updateDate','primary_key'],axis=1,inplace=True)
valid2.drop(['createDate','displayImageUrl','displayName','hasProfanity','hasSpoilers','isSuperReviewer','isVerified','rating','timeFromCreation','updateDate'],axis=1,inplace=True)

In [4]:
data1['sentiment'] = np.where((data1['score']>3.0),0,1)
valid2['sentiment'] = np.where((valid2['score']>3.0),0,1)

In [5]:
data1.drop('score',axis=1,inplace=True)
valid2.drop('score',inplace = True, axis=1)

In [6]:
CONTRACTION_MAP = {"ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he he will have', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have', "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "shan't": 'shall not', "sha'n't": 'shall not', "shan't've": 'shall not have', "she'd": 'she would', "she'd've": 'she would have', "she'll": 'she will', "she'll've": 'she will have', "she's": 'she is', "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so as', "that'd": 'that would', "that'd've": 'that would have', "that's": 'that is', "there'd": 'there would', "there'd've": 'there would have', "there's": 'there is', "they'd": 'they would', "they'd've": 'they would have', "they'll": 'they will', "they'll've": 'they will have', "they're": 'they are', "they've": 'they have', "to've": 'to have', "wasn't": 'was not', "we'd": 'we would', "we'd've": 'we would have', "we'll": 'we will', "we'll've": 'we will have', "we're": 'we are', "we've": 'we have', "weren't": 'were not', "what'll": 'what will', "what'll've": 'what will have', "what're": 'what are', "what's": 'what is', "what've": 'what have', "when's": 'when is', "when've": 'when have', "where'd": 'where did', "where's": 'where is', "where've": 'where have', "who'll": 'who will', "who'll've": 'who will have', "who's": 'who is', "who've": 'who have', "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't": 'will not', "won't've": 'will not have', "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have', "y'all": 'you all', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have', "you'd": 'you would', "you'd've": 'you would have', "you'll": 'you will', "you'll've": 'you will have', "you're": 'you are', "you've": 'you have'}

In [7]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    #re.compile(regex).search(subject) is equivalent to re.search(regex, subject).
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
            if contraction_mapping.get(match)\
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction

    expanded_text = re.sub("’", "'", text)
    expanded_text = contractions_pattern.sub(expand_match, expanded_text)

    return expanded_text

In [8]:
# Function to Preprocess the Reviews
def clean_doc(doc):
    # Removing contractions
    doc = expand_contractions(doc)
    
    # split into tokens by white space
    tokens = doc.split(' ')
    
    # Converting into lower case
    tokens = [w.lower() for w in tokens]
    
    # remove special characters from each token
    tokens = [re.sub(r"[^a-zA-Z#\s]",'',i) for i in tokens]
    tokens = [re.sub(r"[\r\n]",'',i) for i in tokens]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    
    # lemmatizing
    lmtzr = nltk.stem.WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(w) for w in tokens]
    
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [9]:
#Preprocessig the reviews
data1['modified_review'] = data1.review.apply(lambda x: ' '.join(clean_doc(x)))
valid2['modified_review'] = valid2.review.apply(lambda x: ' '.join(clean_doc(x)))

In [10]:
# Collecting Words from negative reviews
negative_reviews = data1.loc[data1['sentiment']== 1,'modified_review']

negative_words = ' '.join(negative_reviews)
negative_words = negative_words.split(' ')
negative_words = np.array(negative_words)

print('Total Number of words in all negative reviews',len(negative_words))

Total Number of words in all negative reviews 16772


In [11]:
positive_reviews = data1.loc[data1['sentiment']== 0,'modified_review']

positive_words = ' '.join(positive_reviews)
positive_words = positive_words.split(' ')
positive_words = np.array(positive_words)

print('Total Number of words in all positive reviews',len(positive_words))

Total Number of words in all positive reviews 23041


In [12]:
# All words in our corpus
all_words = ' '.join(data1.modified_review)
all_words = all_words.split(' ')
all_words = np.array(all_words)
print('Total number of words in corpus',len(all_words))

Total number of words in corpus 39813


### Deriving pos_or_neg
- The idea here is to create a variable pos_or_neg based on the number of positive or negative words present in the review.
- Positive and Negative words list is created from the number of times the word is appearing in positive or negative reviews.

In [13]:
# Function to check if the word is occuring more in positive or negative reviews
def word_check(x,positive1,negative1,neutral1):
    if ((pd.Series(positive_words) == x).sum()) > ((pd.Series(negative_words)==x).sum()):
        positive1.append(x)
    elif ((pd.Series(positive_words) == x).sum()) < ((pd.Series(negative_words)==x).sum()):
        negative1.append(x)
    else:
        neutral1.append(x)

In [14]:
more_in_positive=[]
more_in_negative=[]
neutral=[]

pd.Series(all_words).apply(lambda x:word_check(x,more_in_positive,more_in_negative,neutral))

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
39783    None
39784    None
39785    None
39786    None
39787    None
39788    None
39789    None
39790    None
39791    None
39792    None
39793    None
39794    None
39795    None
39796    None
39797    None
39798    None
39799    None
39800    None
39801    None
39802    None
39803    None
39804    None
39805    None
39806    None
39807    None
39808    None
39809    None
39810    None
39811    None
39812    None
Length: 39813, dtype: object

In [15]:
print('Number of unique words that are occuring more in positive reviews:',len(more_in_positive))
print('Number of unique words that are occuring more in negative reviews :',len(more_in_negative))
print('Number of unique words that are neutral :',len(neutral))

Number of unique words that are occuring more in positive reviews: 26612
Number of unique words that are occuring more in negative reviews : 11643
Number of unique words that are neutral : 1558


In [16]:
# Function to check if a reviews is having more positive or negative words.
def check_sent(sent):
    p=[]
    n=[]
    neutral=[]
    tokens = sent.split(' ')
    for w in tokens:
        if w in more_in_positive:
            p.append(w)
        elif w in more_in_negative:
            n.append(w)
        elif w in neutral:
            neutral.append(w)
    if (len(p)>len(n))&(len(p)>len(neutral)):
        code='p'
    elif (len(n)>len(p))&(len(n)>len(neutral)):
        code='n'
    elif (len(neutral)>len(p))&(len(neutral)>len(n)):
        code='neutral'
    elif len(n)==0 & len(p)==0:
        code='neutral'
    else:
        code='neutral'
    return(p,n,code)

In [17]:
data1['pos_or_neg'] = data1.modified_review.apply(lambda x:check_sent(x)[2])
data1['pos_or_neg'].value_counts()

p          2468
n          396 
neutral    136 
Name: pos_or_neg, dtype: int64

In [18]:
valid2['pos_or_neg'] = valid2.modified_review.apply(lambda x:check_sent(x)[2])
valid2['pos_or_neg'].value_counts()

p          948
n          92 
neutral    60 
Name: pos_or_neg, dtype: int64

In [19]:
pd.crosstab(data1['pos_or_neg'],data1.sentiment)

sentiment,0,1
pos_or_neg,Unnamed: 1_level_1,Unnamed: 2_level_1
n,32,364
neutral,49,87
p,2089,379


In [20]:
pd.crosstab(valid2['pos_or_neg'],valid2.sentiment)

sentiment,0,1
pos_or_neg,Unnamed: 1_level_1,Unnamed: 2_level_1
n,29,63
neutral,34,26
p,762,186


In [21]:
data1.sample()

Unnamed: 0,review,sentiment,modified_review,pos_or_neg
699,Great movie to see with the kids.,0,great movie see kid,p


### Deriving features on Test set

In [22]:
test = pd.read_csv('test-1566381431512.csv')

In [23]:
test['modified_review'] = test.Review.apply(lambda x: ' '.join(clean_doc(x)))

In [24]:
test['pos_or_neg'] = test.modified_review.apply(lambda x:check_sent(x)[2])

In [25]:
valid2.sample()

Unnamed: 0,review,sentiment,modified_review,pos_or_neg
383,Love the new version. We took our grandson with us. We al agreed it was wonderful from start to finish.,0,love new version took grandson al agreed wonderful start finish,p


In [26]:
test.sample()

Unnamed: 0,ReviewID,Review,modified_review,pos_or_neg
684,93560,Best movie I've ever seen in my entire life. All of the critics are stupid. And wrong! It was perfect. They added and changed just enough. Perfect!,best movie ever seen entire life critic stupid wrong perfect added changed enough perfect,p


## Deriving polarity
- Using textblob I have derived a polarity rating on the reviews.
- -1 being extremely negative and 1 being extremely positive.

In [27]:
from textblob import TextBlob

data1['polarity'] = data1['review'].map(lambda text: TextBlob(text).sentiment.polarity)
valid2['polarity'] = valid2['review'].map(lambda text: TextBlob(text).sentiment.polarity)
test['polarity'] = test['Review'].map(lambda text: TextBlob(text).sentiment.polarity)


data1['number_of_sentences'] = [len(sent_tokenize(i)) for i in data1.review]
valid2['number_of_sentences'] = [len(sent_tokenize(i)) for i in valid2.review]
test['number_of_sentences'] = [len(sent_tokenize(i)) for i in test.Review]

data1['number_of_words'] = [len(word_tokenize(i)) for i in data1.modified_review]
valid2['number_of_words'] = [len(word_tokenize(i)) for i in valid2.modified_review]
test['number_of_words'] = [len(word_tokenize(i)) for i in test.modified_review]

In [28]:
data1['polarity'].dtype

dtype('float64')

In [29]:
pd.crosstab(data1.sentiment,data1.number_of_words)

number_of_words,1,2,3,4,5,6,7,8,9,10,...,169,175,180,187,200,209,221,243,366,401
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6,128,257,228,199,191,156,140,109,90,...,0,0,1,1,0,0,0,0,0,1
1,1,31,50,48,50,48,43,38,35,35,...,1,1,0,0,3,1,1,1,1,0


In [30]:
pd.crosstab(valid2.sentiment,valid2.number_of_words)

number_of_words,1,2,3,4,5,6,7,8,9,10,...,89,91,92,97,99,102,113,127,164,213
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4,49,102,94,81,55,53,52,42,24,...,1,0,1,0,0,1,2,0,0,0
1,0,8,19,14,18,8,11,11,17,8,...,0,1,0,1,1,0,1,1,1,1


In [31]:
data1.columns

Index(['review', 'sentiment', 'modified_review', 'pos_or_neg', 'polarity',
       'number_of_sentences', 'number_of_words'],
      dtype='object')

In [32]:
valid2.columns

Index(['review', 'sentiment', 'modified_review', 'pos_or_neg', 'polarity',
       'number_of_sentences', 'number_of_words'],
      dtype='object')

In [33]:
test.columns

Index(['ReviewID', 'Review', 'modified_review', 'pos_or_neg', 'polarity',
       'number_of_sentences', 'number_of_words'],
      dtype='object')

In [34]:
numerical = ['number_of_sentences','number_of_words']
numerical_float = ['polarity']
categorical =['pos_or_neg']
string = ['modified_review']

In [35]:
for num in numerical:
    test[num] = test[num].astype('int64')
    
for cat in categorical:
    test[cat] = test[cat].astype('category')

In [36]:
categorical_attr = ['pos_or_neg']

In [37]:
target_attr = 'sentiment'

In [38]:
numerical_attr = data1.select_dtypes(['int64','float64']).columns
numerical_df = data1[numerical_attr]

In [39]:
numerical_df=numerical_df.astype('float')
numerical_df.head()

Unnamed: 0,polarity,number_of_sentences,number_of_words
0,0.503125,4.0,21.0
1,0.288939,2.0,14.0
2,0.9375,2.0,2.0
3,0.85,2.0,8.0
4,0.5,1.0,3.0


In [40]:
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.layers import Input,Embedding,Dense,Flatten,concatenate
from keras.models import Model

from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [41]:
le = LabelEncoder()
for i in categorical_attr:
    data1[i] = le.fit_transform(data1[i])
    valid2[i] = le.transform(valid2[i])

In [42]:
data_categorical_train, data_categorical_valid1, \
data_numerical_train, data_numerical_valid1, \
data_string_train, data_string_valid1, \
Y_train, Y_valid1 = train_test_split(data1[categorical_attr],
                                   numerical_df,
                                   data1[string],
                                   data1[target_attr],
                                   test_size=0.3, random_state=123) 

In [43]:
data_categorical_valid2 = valid2[categorical_attr]
data_numerical_valid2 = valid2[numerical_attr]
data_string_valid2 = valid2[string]
Y_valid2 = valid2[target_attr]

### Preprocessing of categorical variables
#### Convert categorical attributes to numeric


In [44]:
onehotencoder = OneHotEncoder(handle_unknown='ignore')

In [45]:
OneHotEncoder = onehotencoder.fit(data_categorical_train)

In [46]:
OneHotEncoder_train = OneHotEncoder.transform(data_categorical_train).toarray()
OneHotEncoder_valid1 = OneHotEncoder.transform(data_categorical_valid1).toarray()
OneHotEncoder_valid2 = OneHotEncoder.transform(data_categorical_valid2).toarray()

## Preprocessing of Target variables

#### Min Max Scaling

In [47]:
Scalar= MinMaxScaler()
scaled_attr = Scalar.fit(data_numerical_train)
scaled_attr_train= scaled_attr.transform(data_numerical_train)
scaled_attr_valid1= scaled_attr.transform(data_numerical_valid1)
scaled_attr_valid2= scaled_attr.transform(data_numerical_valid2)

#### Stack both numerical and Categorical feautures

In [48]:
X_train = np.hstack((scaled_attr_train, OneHotEncoder_train))
X_train.shape

(2100, 6)

In [49]:
X_valid1 = np.hstack((scaled_attr_valid1, OneHotEncoder_valid1))
X_valid1.shape

(900, 6)

In [50]:
X_valid2 = np.hstack((scaled_attr_valid2, OneHotEncoder_valid2))
X_valid2.shape

(1100, 6)

### Pre-Processing of Text
##### Preprocessing of Review Text
##### Get the length of the text having maximum number of occurances
##### Get the unique count of text length

In [51]:
unique_elements, counts_elements = np.unique(data_string_train['modified_review'].apply(len),return_counts=True)

In [52]:
unique_elements

array([   5,    6,    7,    8,    9,   10,   11,   12,   13,   14,   15,
         16,   17,   18,   19,   20,   21,   22,   23,   24,   25,   26,
         27,   28,   29,   30,   31,   32,   33,   34,   35,   36,   37,
         38,   39,   40,   41,   42,   43,   44,   45,   46,   47,   48,
         49,   50,   51,   52,   53,   54,   55,   56,   57,   58,   59,
         60,   61,   62,   63,   64,   65,   66,   67,   68,   69,   70,
         71,   72,   73,   74,   75,   76,   77,   78,   79,   80,   81,
         82,   83,   84,   85,   86,   87,   88,   89,   90,   91,   92,
         93,   94,   95,   96,   97,   98,   99,  100,  101,  102,  103,
        104,  105,  106,  107,  108,  109,  110,  111,  112,  113,  114,
        115,  116,  117,  118,  119,  120,  121,  122,  123,  124,  125,
        126,  127,  128,  129,  130,  131,  132,  133,  134,  135,  136,
        137,  138,  139,  140,  141,  142,  145,  146,  147,  148,  149,
        150,  151,  152,  153,  154,  155,  156,  1

In [53]:
counts_elements

array([ 1,  2,  1,  1,  5,  5, 14,  5, 18, 12, 28, 26, 30, 34, 28, 31, 34,
       35, 22, 34, 37, 26, 30, 19, 31, 27, 26, 24, 30, 35, 23, 27, 19, 24,
       17, 30, 29, 17, 26, 26, 27, 20, 17, 17, 14, 22, 21, 23, 15, 15, 19,
       14,  7, 26, 16, 21, 19, 18, 14,  8, 18,  8,  7, 18, 11,  7, 10, 17,
       14,  8, 14, 10, 12, 13, 11,  3, 15,  9, 10, 12,  6, 11,  4, 11, 11,
       10, 10,  9,  7, 12,  6,  5,  9,  8, 12,  7,  8,  6,  4,  9,  9,  8,
        8,  3,  8,  7,  7,  6,  5,  7,  3,  4,  2,  2,  2, 10,  7,  4,  5,
        4,  4,  3,  2,  6,  3,  7,  3,  2,  5,  2,  3,  5,  2,  3,  3,  2,
        5,  2,  6,  5,  3,  2,  4,  4,  3,  4,  2,  2,  1,  2,  3,  2,  4,
        2,  2,  3,  2,  2,  3,  3,  3,  1,  2,  2,  1,  3,  4,  1,  3,  1,
        1,  1,  2,  1,  2,  2,  2,  1,  7,  2,  1,  3,  1,  1,  2,  3,  2,
        1,  2,  1,  2,  2,  3,  3,  2,  2,  1,  2,  2,  1,  1,  1,  2,  1,
        1,  1,  1,  1,  3,  2,  1,  3,  2,  1,  1,  1,  2,  1,  2,  1,  1,
        1,  1,  1,  1,  1

In [54]:
max_text_count_length = list(counts_elements).index(max(counts_elements))
REVIEW_TEXT_MAX_SEQUENCE_LENGTH = unique_elements[max_text_count_length]
REVIEW_TEXT_MAX_SEQUENCE_LENGTH

25

In [55]:
tokenizer = Tokenizer(oov_token='None')
tokenizer.fit_on_texts(data_string_train['modified_review'])
review_text_train = tokenizer.texts_to_sequences(data_string_train['modified_review'])
review_text_valid1 = tokenizer.texts_to_sequences(data_string_valid1['modified_review'])
review_text_valid2 = tokenizer.texts_to_sequences(data_string_valid2['modified_review'])


word_index_review_text = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index_review_text))
NUM_WORDS_REVIEW_TEXT = len(word_index_review_text)+1

review_text_seq_train = pad_sequences(review_text_train, maxlen=REVIEW_TEXT_MAX_SEQUENCE_LENGTH)
review_text_seq_valid1 = pad_sequences(review_text_valid1, maxlen=REVIEW_TEXT_MAX_SEQUENCE_LENGTH)
review_text_seq_valid2 = pad_sequences(review_text_valid2, maxlen=REVIEW_TEXT_MAX_SEQUENCE_LENGTH)

Found 3618 unique tokens.


In [56]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.50d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.50d.txt'

In [None]:
# create a weight matrix for words in training docs
review_embedding_matrix = np.zeros((NUM_WORDS_REVIEW_TEXT,50))
review_word_not_in_glove_count = 0
review_word_not_in_glove =[]
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        review_embedding_matrix[i] = embedding_vector
    else:
        review_word_not_in_glove.append(word)
        review_word_not_in_glove_count = review_word_not_in_glove_count+1

In [None]:
print(review_word_not_in_glove)

In [None]:
print(review_word_not_in_glove_count)

## Dense layer for numerical features


In [None]:
num_cat_inputs = Input(shape=(X_train.shape[1],),name='num_cat_inputs')
out_num_cat = Dense(64, activation='relu')(num_cat_inputs)

### Embedding layer for Review Text
#### If there are more than one word in the training data which are not present in Glove then train the embedding layer

In [None]:
from keras import regularizers
from keras.optimizers import Adam
from keras.layers import Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import GlobalMaxPooling1D, AveragePooling1D,Conv1D,Dropout,Embedding, MaxPooling1D,GRU,LSTM,SpatialDropout1D,GlobalMaxPool1D

In [None]:
review_text_input= Input(shape=(REVIEW_TEXT_MAX_SEQUENCE_LENGTH,),name='review_text_input')
if (review_word_not_in_glove_count<=1):
    text_embed = Embedding(input_dim=NUM_WORDS_REVIEW_TEXT,output_dim=50,weights=[review_embedding_matrix],trainable=False)(review_text_input)
    con1d = Conv1D(64,kernel_size=3,padding='same',activation='relu',strides=1,kernel_initializer='normal')(text_embed)
else:
    text_embed = Embedding(input_dim=NUM_WORDS_REVIEW_TEXT,output_dim=50,weights=[review_embedding_matrix],trainable=True)(review_text_input)
    con1d = Conv1D(64,kernel_size=3,padding='same',activation='relu',strides=1,kernel_initializer='normal')(text_embed)

review_out_text = Flatten()(con1d)

#### Concatenate the output of above layers.

In [None]:
from keras import backend as K

def Recall_score(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def Precision_score(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def F1_score(y_true, y_pred):
    precision = Precision_score(y_true, y_pred)
    recall = Recall_score(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
adam = Adam(lr=0.01, decay=0.0005)

## Callbacks
earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
reduce_lr = ReduceLROnPlateau(patience=5, verbose=1)

concatenated = concatenate([out_num_cat,review_out_text],axis=-1)
drop1 = Dropout(0.9)(concatenated)
X1 = Dense(32, activation='relu',kernel_regularizer=regularizers.l2(),kernel_initializer='normal')(drop1)
# drop2 = Dropout(0.2)(X1)
# X2 = Dense(8, activation='relu',kernel_regularizer=regularizers.l2(),kernel_initializer='uniform')(drop2)
final_out = Dense(1, activation='sigmoid')(X1)

In [None]:
model = Model(inputs=[num_cat_inputs,review_text_input], outputs=final_out)

In [None]:
model.summary()

In [None]:
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[F1_score])

In [None]:
model.fit([X_train,review_text_seq_train,],
          y=Y_train,
          epochs=50,
          batch_size=64,
          callbacks=[reduce_lr, earlystopper],
          validation_data=([X_valid1,review_text_seq_valid1,],
                           Y_valid1),
          verbose=2 )

In [None]:
train_classes = model.predict([X_train,review_text_seq_train])
valid1_classes = model.predict([X_valid1,review_text_seq_valid1])
valid2_classes = model.predict([X_valid2,review_text_seq_valid2])

In [None]:
train_classes = np.where(train_classes>0.5,1,0).flatten()
valid1_classes = np.where(valid1_classes>0.5,1,0).flatten()
valid2_classes = np.where(valid2_classes>0.5,1,0).flatten()

In [None]:
print('Train F1 Score :',round(f1_score(Y_train,train_classes),2))
print('Valid1 F1 Score :',round(f1_score(Y_valid1,valid1_classes),2))
print('Valid2 F1 Score :',round(f1_score(Y_valid2,valid2_classes),2))

## LSTM

In [None]:
review_text_input= Input(shape=(REVIEW_TEXT_MAX_SEQUENCE_LENGTH,),name='review_text_input')
if (review_word_not_in_glove_count<=1):
    text_embed = Embedding(input_dim=NUM_WORDS_REVIEW_TEXT,output_dim=50,weights=[review_embedding_matrix],trainable=False)(review_text_input)
    lstm1 = LSTM(32,dropout=0.4,return_sequences=True,kernel_initializer='normal',kernel_regularizer=regularizers.l2())(text_embed)
else:
    text_embed = Embedding(input_dim=NUM_WORDS_REVIEW_TEXT,output_dim=50,weights=[review_embedding_matrix],trainable=True)(review_text_input)
    lstm1 = LSTM(32,dropout=0.4,return_sequences=True,kernel_initializer='normal',kernel_regularizer=regularizers.l2())(text_embed)
review_out_text = Flatten()(lstm1)

In [None]:
adam = Adam(lr=0.01, decay=0.0005)

## Callbacks
earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
reduce_lr = ReduceLROnPlateau(patience=5, verbose=1)

concatenated = concatenate([out_num_cat,review_out_text],axis=-1)
drop1 = Dropout(0.9)(concatenated)
X1 = Dense(32, activation='relu',kernel_regularizer=regularizers.l2(),kernel_initializer='normal')(drop1)
# drop2 = Dropout(0.2)(X1)
# X2 = Dense(8, activation='relu',kernel_regularizer=regularizers.l2(),kernel_initializer='uniform')(drop2)
final_out = Dense(1, activation='sigmoid')(X1)
model = Model(inputs=[num_cat_inputs,review_text_input], outputs=final_out)

In [None]:
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[F1_score])
model.fit([X_train,review_text_seq_train,],
          y=Y_train,
          epochs=50,
          batch_size=64,
          callbacks=[reduce_lr, earlystopper],
          validation_data=([X_valid1,review_text_seq_valid1,],
                           Y_valid1),
          verbose=2 )

In [None]:
train_classes = model.predict([X_train,review_text_seq_train])
valid1_classes = model.predict([X_valid1,review_text_seq_valid1])
valid2_classes = model.predict([X_valid2,review_text_seq_valid2])

In [None]:
train_classes = np.where(train_classes>0.5,1,0).flatten()
valid1_classes = np.where(valid1_classes>0.5,1,0).flatten()
valid2_classes = np.where(valid2_classes>0.5,1,0).flatten()

In [None]:
print('Train F1 Score :',round(f1_score(Y_train,train_classes),2))
print('Valid1 F1 Score :',round(f1_score(Y_valid1,valid1_classes),2))
print('Valid2 F1 Score :',round(f1_score(Y_valid2,valid2_classes),2))