In [1]:
# Spam Classification - Feature Engineering, Classifiers

In [2]:
import pandas as pd

data = pd.read_csv("spamdata.csv",encoding = "latin-1")

data.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


To work on Feature Engineering, first we need to work on Text Preprocessing to get rid of Noise or remove unnecessary/reduntant information present in the data.

In [3]:
import string
punctuations = string.punctuation

from nltk.corpus import stopwords

stopword_list = stopwords.words("english")

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

def _clean(text):
    
    cleaned_text = text.lower()
    
    cleaned_text = "".join(c for c in cleaned_text if c not in punctuations)
    
    words = cleaned_text.split()
    
    words = [w for w in words if w not in stopword_list]
    
    words = [lem.lemmatize(word,"v") for word in words] # Stemming doesn't need any POS Tagging but Lemmatization requrires
    words = [lem.lemmatize(word,"n") for word in words] # Generally Verbs and Nouns are the most important
    
    cleaned_text = " ".join(words)
    
    return cleaned_text

_clean("I will by playing a game today !! ")

'play game today'

In [4]:
# we can the same function to run on all the dataset

data["cleaned"] = data["text"].apply(_clean)

data.head(5)

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though


In [5]:
# This is preprocessed data. We are yet to perform text cleaning techniques.

# We didn't remove other support or domain specific words, didn't corrected spellings such as "tkts" instead of "tickets"

# This can be improved further according to the problems

In [6]:
## feature Engineering

# We need to generate different types of feature to convert the text data to useful for ML Algorithms. Examples are below



In [7]:
# meta features - Count of attributes associated with the text data

data["word_count"] = data["text"].apply(lambda x: len(x.split()))

data["word_count_cleaned"] = data["cleaned"].apply(lambda x: len(x.split()))

data["char_count"] = data["text"].apply(lambda x: len(x)) # Total number of characters in the text

data["char_count_without_spaces"] = data["text"].apply(lambda x: len(x.replace(" ",""))) # Getting rid of spaces

# very important for SPAM as they always say free ticket number

data["num_dig"]= data["text"].apply(lambda x: sum([1 if w.isdigit() else 0 for w in x.split()]))


In [8]:
data.head(5)

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_dig
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,6,6,29,24,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though,13,8,61,49,0


In [9]:
# these are the meta features of these dataframe. These features can be used in regression, classification, recommendation engines et

In [10]:
# NLP Based features as every text is associated with NLP Properties lik3

# Dependency grammar, relationship among other words, POS Tags

# E.g. Noun represents the kind of entities, subjects or objects whereas Verb represents the actions or action keywords


In [11]:
#To generate POS Tags features, we can first create POS Dictionary

# NNP - proper noun, NN - Regular Nouns, NNS - Singular Nouns
# Verbs for noun families are adding

pos_dic = {"noun":["NNP","NN","NNS","NNPS"], "verb":["VBZ","VB","VBD","VBN","VBG"]} 
import nltk                       

def pos_check(txt,family):
    tags = nltk.pos_tag(nltk.word_tokenize(txt))
    count = 0
    for tag in tags:
        tag = tag[1] #to obtain the proper POS
        if tag in pos_dic[family]:
            count +=1
    return count

pos_check("They are playing in the ground","noun")
    

1

Only one noun is present

In [12]:
pos_check("They are playing in the ground","verb")

1

In [13]:
data["noun_count"] = data["text"].apply(lambda x : pos_check(x,"noun"))
data["verb_count"] = data["text"].apply(lambda x : pos_check(x,"verb"))

In [14]:
data.head(5)

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,92,0,10,1
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,6,6,29,24,0,4,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,23,155,128,2,13,3
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,39,0,3,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf live around though,13,8,61,49,0,1,4


In [15]:
# Apart from these Meta Feature, NLP Based Features, POS Tag Features....we can also work on vectors

In [16]:
# Advanced Feature Engineering

# One of the idea to create Word vectors is count as the features

In [17]:
# Countvectorizer is a class which can be used to generate count vectors for all the particular documents present in the corpus

# Similar to Countvectorizer, Tfidfvectorizer generates term frequency and inverse document frequency and their corresponding

# product as the elements of the word vectors


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cvz = CountVectorizer()
cvz.fit(data["cleaned"].values) # we are fitting to cleaned data as we don't unnecessary noise

count_vectors = cvz.transform(data["cleaned"].values) # this transforms original data into vectors

In [18]:
count_vectors # This matrix will contain all word vectors corresponding to their counts

<5572x8206 sparse matrix of type '<class 'numpy.int64'>'
	with 46827 stored elements in Compressed Sparse Row format>

19.00

In [19]:
# Generating Tfidf 

word_tfidf = TfidfVectorizer()
word_tfidf.fit(data["cleaned"].values)
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)


In [20]:
# This computes Tf-idf for every word present in the cleaned data for us and in the end, they replace their corresponding tfidf

# score in the location where their words are present.

word_vectors_tfidf 

<5572x8206 sparse matrix of type '<class 'numpy.float64'>'
	with 46827 stored elements in Compressed Sparse Row format>

In [21]:
tfidf = dict(zip(word_tfidf.get_feature_names(),word_tfidf.idf_))

pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf, orient = "index")

# This gives all the keywords and corresponding Tfidf score

Unnamed: 0,0
008704050406,8.527076
0089my,8.932542
0121,8.932542
01223585236,8.932542
01223585334,8.527076
...,...
¹ã,8.932542
âªm,8.932542
âªt,8.932542
âªve,8.932542


In [22]:
tfidf_idf = pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf, orient = "index")

tfidf_idf.columns =["word_tfidf"]

tfidf_idf # Where so ever these words will be present, they will be now represented by tfidf score

Unnamed: 0,word_tfidf
008704050406,8.527076
0089my,8.932542
0121,8.932542
01223585236,8.932542
01223585334,8.527076
...,...
¹ã,8.932542
âªm,8.932542
âªt,8.932542
âªve,8.932542


we have created so many features like meta features, NLP based features, raw counts like noun, verb family, frequency based etc. now to use them we need to combine them as a matrix which will be a Sparse Matrix

### Combining Features

In [23]:
data.columns

Index(['label', 'text', 'cleaned', 'word_count', 'word_count_cleaned',
       'char_count', 'char_count_without_spaces', 'num_dig', 'noun_count',
       'verb_count'],
      dtype='object')

In [24]:
# We will stack all our features into horizontally hence hstack

from scipy.sparse import hstack, csr_matrix

meta_features = ['word_count', 'word_count_cleaned',
       'char_count', 'char_count_without_spaces', 'num_dig', 'noun_count',
       'verb_count']

features_set1 = data[meta_features]

train = hstack([word_vectors_tfidf, csr_matrix(features_set1)],"csr")

train

<5572x8213 sparse matrix of type '<class 'numpy.float64'>'
	with 80199 stored elements in Compressed Sparse Row format>

5572 rows X 8213 columns 

These many columns is due to as it is obtained from tfidf and some of the stacked columns of the features_set1. Notice that there are less number rows and more number of columns. This is because in tfidf we generated so many tfidf features. 


Inside the tfidf, we can also control the number of features like below

In [25]:
word_tfidf = TfidfVectorizer(max_features = 500)
word_tfidf.fit(data["cleaned"].values)
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [26]:
word_vectors_tfidf 

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 28313 stored elements in Compressed Sparse Row format>

You can see only 500 Features are generated

In [27]:
tfidf = dict(zip(word_tfidf.get_feature_names(),word_tfidf.idf_))

pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf, orient = "index")


tfidf_idf = pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf, orient = "index")

tfidf_idf.columns =["word_tfidf"]

from scipy.sparse import hstack, csr_matrix

meta_features = ['word_count', 'word_count_cleaned',
       'char_count', 'char_count_without_spaces', 'num_dig', 'noun_count',
       'verb_count']

features_set1 = data[meta_features]

train = hstack([word_vectors_tfidf, csr_matrix(features_set1)],"csr")

train


<5572x507 sparse matrix of type '<class 'numpy.float64'>'
	with 61685 stored elements in Compressed Sparse Row format>

500 Columns are tfidf featues and 7 correspoonds to meata features

Notice that we used word vectors for tfidf, there are some variations we can also do in word level tfidf. Instead of word, we can use n-gram level tfidf

In [28]:
# 1 corresponds to unigram and 2 corresponds to bigrams

# when we specify 1,4 it will generate all the possible phrases between 1 and 4

ngram_tfidf = TfidfVectorizer(max_features = 500, ngram_range = (1,2)) 
ngram_tfidf.fit(data["cleaned"].values)
ngram_vectors_tfidf = ngram_tfidf.transform(data["cleaned"].values)

So instead of stacking word level tfidf, we can also do n-gram level tfidf

Another variation can be character level tfidf. Instead of counting word, this counts each and every character.

In [29]:
char_tfidf = TfidfVectorizer(max_features = 500,analyzer = "char")
char_tfidf.fit(data["cleaned"].values)
char_vectors_tfidf = char_tfidf.transform(data["cleaned"].values)

This gives character counts and corresponding tfidf scores

While stacking we have a choice of using either only word level, ngram or character or we can use both. 

Along with word_count, we can generate upper case, lower case count, special case count, punctuation count. In POS tag family, we can use adjective count, adverb count etc. 

At the end, train matrix is our final input which can be given to ML Algorithms, recommendations and search engines etc.

In [30]:
train

<5572x507 sparse matrix of type '<class 'numpy.float64'>'
	with 61685 stored elements in Compressed Sparse Row format>

Finally, this is about Feature Engineering. We discussed preprocessing on a given dataset and different featurs can be generated.

We will be using the generated features in ML Algo to classify whether given text is Spam or Ham

In [31]:
train 

<5572x507 sparse matrix of type '<class 'numpy.float64'>'
	with 61685 stored elements in Compressed Sparse Row format>

This is a matrix where we horizontally stacked on the features

Step1: Label encode our target variable which is Spam or Ham but ML Models need numbers to work with. We can convert the categorical variable into label encoded(0/1).

In [32]:
from sklearn.preprocessing import LabelEncoder

target = data["label"].values

target = LabelEncoder().fit_transform(target)

In [33]:
target # this is an array where 0 corresponds to Ham and 1 corresponds to Spam

array([0, 0, 1, ..., 0, 0, 0])

## Test-Train Split

In [34]:
from sklearn.model_selection import train_test_split

train_x, val_x,train_y, val_y = train_test_split(train, target)

In [35]:
train_x.shape

(4179, 507)

In [36]:
val_x.shape

(1393, 507)

In [37]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import accuracy_score

In [38]:
model = naive_bayes.MultinomialNB()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)

0.968413496051687

In [39]:
model = LogisticRegression()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9597989949748744

Almost similar accuracy, this is because the dataset has less number of rows

5.00

In [40]:
model = svm.SVC()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)

0.9267767408470926

SVM is not a good choice here. SVM works well for text classification when Tfidf scores are good with good amount of data.

In [41]:
model = ensemble.ExtraTreesClassifier()
model.fit(train_x,train_y)
preds = model.predict(val_x)
accuracy_score(preds,val_y)

0.9806173725771715

The idea in this problem is don't need to go for a complex model because simple models are able to give good accuracy.

When we add more amount of data, this model show variations. Hence bagging (random forests) or boosting models (XGboos, Lightgbm, catboost) gives good accuracy.

## Spam Classification using Deep Learning

Specifically CNN for text classifications purposes. To perform CNN on our text data we first need to represent our text inputs in word embedding format. 

The idea to use the word embedding format is either pretrained word models provided by the libraries such as word2vec, Glove, fasttext etc. or train the models and their corresponding word embeddings from scratch. The idea from scratch works well when we have huge amount of data.

In this case we have only limited (5500 rows) hence let us pretrained embeddings.

In [42]:
for i,line in enumerate(open("crawl-300d-2M.vec",encoding="utf8")):
    if i ==1:
        print(i,line)

1 , -0.0282 -0.0557 -0.0451 -0.0434 0.0712 -0.0855 -0.1085 -0.0561 -0.4523 -0.0202 0.0975 0.1047 0.1962 -0.0693 0.0213 -0.0235 0.1336 -0.0420 -0.0564 -0.0798 0.0424 -0.0409 -0.0536 -0.0252 0.0135 0.0064 0.1235 0.0461 0.0120 -0.0372 0.0650 0.0041 -0.1074 -0.0263 0.1133 -0.0029 0.0671 0.1065 0.0234 -0.0160 0.0070 0.4355 -0.0752 -0.4328 0.0457 0.0604 -0.0740 -0.0055 -0.0089 -0.2926 -0.0545 -0.1519 0.0990 -0.0193 -0.0050 0.0511 0.0404 0.1023 -0.0128 0.0488 -0.1567 -0.0759 -0.0190 0.1442 0.0047 -0.0186 0.0140 -0.0385 -0.0853 0.1572 0.1770 0.0084 -0.0250 -0.1145 -0.0663 -0.1244 -0.3977 -0.0124 -0.4586 -0.0220 0.5746 0.0218 -0.0754 0.0099 0.0397 -0.0154 0.0424 -0.0150 -0.0016 0.0305 0.0101 0.2266 0.1394 0.0189 0.0069 0.0394 0.0355 -0.0111 -0.0687 -0.0078 0.0224 0.0817 -0.1949 0.0001 0.4047 -0.0237 -0.0656 -0.0684 0.0233 0.0438 0.1203 -0.0276 0.0416 0.0114 -0.4529 0.1538 0.1323 -0.0186 -0.0914 -0.0312 0.1051 0.0212 0.0798 -0.0104 -0.0206 -0.0025 0.0043 -0.0378 0.2689 0.0747 -0.0418 -0.0048 -0.

This is .vec file shared by Analytics vidhya..but i see the file name as pretrained.vec in video. So we will use this .vec file and proceed for project

In [43]:
import numpy as np

embeddings_index = {}

for i, line in enumerate(open("crawl-300d-2M.vec",encoding="utf8")):
    value = line.split()
    print(line)
    break
    embeddings_index[value[0]] = values[1:]
    

1999995 300



First line contains the dimensions

In [44]:
import numpy as np

embeddings_index = {}

for i, line in enumerate(open("crawl-300d-2M.vec",encoding="utf8")):
    if i ==0:
        continue
    value = line.split()
    print(line)
    break
    embeddings_index[value[0]] = values[1:]

, -0.0282 -0.0557 -0.0451 -0.0434 0.0712 -0.0855 -0.1085 -0.0561 -0.4523 -0.0202 0.0975 0.1047 0.1962 -0.0693 0.0213 -0.0235 0.1336 -0.0420 -0.0564 -0.0798 0.0424 -0.0409 -0.0536 -0.0252 0.0135 0.0064 0.1235 0.0461 0.0120 -0.0372 0.0650 0.0041 -0.1074 -0.0263 0.1133 -0.0029 0.0671 0.1065 0.0234 -0.0160 0.0070 0.4355 -0.0752 -0.4328 0.0457 0.0604 -0.0740 -0.0055 -0.0089 -0.2926 -0.0545 -0.1519 0.0990 -0.0193 -0.0050 0.0511 0.0404 0.1023 -0.0128 0.0488 -0.1567 -0.0759 -0.0190 0.1442 0.0047 -0.0186 0.0140 -0.0385 -0.0853 0.1572 0.1770 0.0084 -0.0250 -0.1145 -0.0663 -0.1244 -0.3977 -0.0124 -0.4586 -0.0220 0.5746 0.0218 -0.0754 0.0099 0.0397 -0.0154 0.0424 -0.0150 -0.0016 0.0305 0.0101 0.2266 0.1394 0.0189 0.0069 0.0394 0.0355 -0.0111 -0.0687 -0.0078 0.0224 0.0817 -0.1949 0.0001 0.4047 -0.0237 -0.0656 -0.0684 0.0233 0.0438 0.1203 -0.0276 0.0416 0.0114 -0.4529 0.1538 0.1323 -0.0186 -0.0914 -0.0312 0.1051 0.0212 0.0798 -0.0104 -0.0206 -0.0025 0.0043 -0.0378 0.2689 0.0747 -0.0418 -0.0048 -0.03

The first word in this case is symbol which is comma (,) and you can see the word vector for this comma

In [45]:
import numpy as np

embeddings_index = {} # Embedding index which will contain what are the corresponding word vector values corresponding to a word



# opening and iterating the file to see the values

# In video, they mentioned pretrained vec contains word vectors for 2 million words

for i, line in enumerate(open("crawl-300d-2M.vec",encoding="utf8")):
    if i ==0:
        continue
    value = line.split()
    embeddings_index[value[0]] = np.array(value[1:],dtype ="float32")
    
                                            # passing value of 0 into from the first element to the last element
                                            # what we are doing here the line essentially contains first element as word and
                                            # next element as word vectors
                                            # converted into numpy instead of raw strings
        
    

Now, all of our words are loaded and they will be stored into word embeddings index dictionary (word as key and corresponding values as their word vector notations). This step will take time.

In [46]:
# Next step : Convert the text data into these word embedding representations

from keras.preprocessing import text,sequence


# Step 1: Convert data into tokens

token = text.Tokenizer()
token.fit_on_texts(data["text"])
word_index = token.word_index


# Step 2: Convert text into sequence of tokens and padd them

# This will generate sequence of words as the input and pad them to the equal length (70)


trainx, valx, trainy, valy = train_test_split(data["text"],target)

trainx = sequence.pad_sequences(token.texts_to_sequences(trainx), maxlen = 70) # maxlength is given as 70

valx = sequence.pad_sequences(token.texts_to_sequences(valx),maxlen = 70)


# Step 3: Create an embedding matrix which corresponds to different embedding vectors for different key words present in every sentence

# 300 is length of word vector provided for the pre trained models, look at the dimensions

# This embedding matrix is similar to document word matrix (rows x columns = word X vector notations)

#embedding_matrix = np.zeros((len(word_index)+1),300)) # this code was throwing error
embedding_matrix = np.zeros((len(word_index) + 1, 300))

# To fill this matrix, we need to iterate word by word

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word) # fetches embedding vector for each word
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector 

In [47]:
word_index # This gives info like which word corresponds to which index

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'u': 6,
 'and': 7,
 'in': 8,
 'is': 9,
 'me': 10,
 'my': 11,
 'for': 12,
 'your': 13,
 'it': 14,
 'of': 15,
 'call': 16,
 'have': 17,
 'on': 18,
 '2': 19,
 'that': 20,
 'now': 21,
 'are': 22,
 'so': 23,
 'but': 24,
 'not': 25,
 'or': 26,
 'do': 27,
 'can': 28,
 'at': 29,
 "i'm": 30,
 'get': 31,
 'be': 32,
 'will': 33,
 'if': 34,
 'ur': 35,
 'with': 36,
 'just': 37,
 'no': 38,
 'we': 39,
 'this': 40,
 'gt': 41,
 '4': 42,
 'lt': 43,
 'up': 44,
 'when': 45,
 'ok': 46,
 'free': 47,
 'from': 48,
 'how': 49,
 'go': 50,
 'all': 51,
 'out': 52,
 'what': 53,
 'know': 54,
 'like': 55,
 'good': 56,
 'then': 57,
 'got': 58,
 'was': 59,
 'come': 60,
 'its': 61,
 'am': 62,
 'time': 63,
 'only': 64,
 'day': 65,
 'love': 66,
 'there': 67,
 'send': 68,
 'he': 69,
 'want': 70,
 'text': 71,
 'as': 72,
 'txt': 73,
 'one': 74,
 'going': 75,
 'by': 76,
 'home': 77,
 "i'll": 78,
 'need': 79,
 'about': 80,
 'r': 81,
 'lor': 82,
 'sorry': 83,
 'stop': 84,
 'st

In [48]:
embedding_matrix # All of the words have been essentially filled into embedding matrix. Inside this word and corresponding values
                 # will be present

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.48210001,  0.0885    , -0.0782    , ..., -0.0317    ,
        -0.1591    , -0.1301    ],
       [-0.0175    , -0.2189    ,  0.0353    , ..., -0.28459999,
         0.0509    ,  0.0229    ],
       ...,
       [ 0.1224    ,  0.0217    , -0.34729999, ..., -0.54530001,
        -0.0505    , -0.1023    ],
       [ 0.1039    , -0.37940001, -0.0344    , ...,  0.0994    ,
        -0.25979999, -0.17110001],
       [ 0.32229999, -0.42629999,  0.4325    , ...,  0.14229999,
         0.0177    ,  0.0414    ]])

17.24

In [49]:
# Next Step: Training the model

def train_model(classifier, feature_vector_train,label,feature_vector_val,valid_y): 
    classifier.fit(feature_vector_train,label)
    predictions = classifier.predict(feature_vector_val)
    predictions = predictions.argmax(axis = -1) # converts the probabilities into classes
    return accuracy_score(predictions,valid_y)

In [56]:
# writing CNN as Classifier

from keras import layers, models, optimizers

def create_cnn():
    
    input_layer = layers.Input(70) # First Layer. In previous steps, we defined input size as 70
                                   # Every sequence is pallet sequence which contains 70 element at max
    
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights = [embedding_matrix],trainable = False)(input_layer)
                        
                                # 300 is the length of the word vectors
                                # Since we are using pre-trained word model we will pass those weights to this layer
                                # trainable = False, we don't want to train our own word vector because we are using pre-trained model
    
    conv_layer = layers.Convolution1D(100, 3, activation ="relu")(embedding_layer)
    
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)
    
    
    # This dense part will be like a Neural Network
    
    output_layer = layers.Dense(50, activation ="relu") (pooling_layer) # feature extraction happens here. Conolutional features
    output_layer = layers.Dropout(0.25) (output_layer)          # Dropout is 25%. This is nothing but Regularization in NN   
    output_layer = layers.Dense(1, activation ="sigmoid") (output_layer) # clasification layer. Sigmoid ensures the range 0 to 1
    
    
    model = models.Model(inputs = input_layer,outputs = output_layer)
    # model.compile(optmizer = optimizers.Adam(), loss = "binary_crossentropy")
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
                                    
                                                        # We can also use Adamdelta or RMSProp instead of Adam
                                                        # Since it is binary classification hence it is binary_crossentropy
    return models 
                                                        # Layers are connected (previous layer name is mentioned in the line)
    
                    

In [57]:
classifier = create_cnn()

train_model(classifier, trainx, trainy, valx, valy)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3


0.8722182340272793

When I have mistakenly written like below...it was throwing error...hence it is important to understand to pass the paramters

 output_layer = layers.Dense(50, activation ="sigmoid") (output_layer)

When we have more data...more numbr of epochs can be done. If we the accuracy of CNN model is very less when compare to simple models. Deep Learning models works well with Large amount of data.