# Crete AirBNB

In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim.utils import simple_preprocess
from keras.preprocessing import sequence
nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()
df = pd.read_csv("reviews.csv")
df.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\zazat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,27966,495260,2011-09-02,730165,Maja,Manolis and his family were wonderful hosta. I...
1,27966,1088520,2012-04-06,1463423,Russell,Nice apartment. Manolis and his family are lov...
2,27966,1638334,2012-07-05,2761210,Jessica,Antonia was a fantastic host! She will no doub...
3,27966,1884289,2012-08-04,2814925,Paul,Manolis has an excellent apartment in Amoudara...
4,27966,1972549,2012-08-13,3039803,Lavinia,"Everything was wonderful! The weather, beach, ..."


## Data preprocessing

In [2]:
labels = []
comments = df["comments"].convert_dtypes()
for i in range(len(df)):
    try:
        pol_score = analyzer.polarity_scores(comments[i])
    except:
        labels.append(-1)
    else:
        if pol_score['neg'] > pol_score['pos']:
            labels.append(0)
        else:
            labels.append(1)
        
df["label"] = labels
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,label
0,27966,495260,2011-09-02,730165,Maja,Manolis and his family were wonderful hosta. I...,1
1,27966,1088520,2012-04-06,1463423,Russell,Nice apartment. Manolis and his family are lov...,1
2,27966,1638334,2012-07-05,2761210,Jessica,Antonia was a fantastic host! She will no doub...,1
3,27966,1884289,2012-08-04,2814925,Paul,Manolis has an excellent apartment in Amoudara...,1
4,27966,1972549,2012-08-13,3039803,Lavinia,"Everything was wonderful! The weather, beach, ...",1


In [3]:
print(df[df["label"] == 0].shape)
print(df[df["label"] == 1].shape)

(24276, 7)
(247595, 7)


It is an unbalanced dataset, with ratio 90 positive- 10 negative

In [4]:
df[df["label"] == -1]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,label
118161,17679432,465056953,2019-06-07,111548578,Mohsen,,-1
187676,25838702,588211111278640865,2022-03-22,436186279,Denise,,-1
247483,42148921,587406873715874033,2022-03-21,442559359,Christina,,-1


In [139]:
df  = df.dropna()
df[df["label"] == -1]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,label


In [5]:
df_pos = df[df["label"] == 1]
df_neg = df[df["label"] == 0]


In [6]:
df_pos = df_pos.sample(len(df_neg))
print(df_pos.shape)
print(df_neg.shape)

(24276, 7)
(24276, 7)


Now it is a balanced dataset

In [7]:
df = pd.concat([df_neg, df_pos], axis=0)

In [8]:
df["comments"]

69        Wir hatten eine schöne Zeit in Antonia Apartme...
71        Les moments que nous avons passés avec Antonia...
85        Assurément ma meilleure expérience Airbnb! Man...
112       Sehr empfehlenswert! Antonia und Manolis sind ...
129       Eirini und Vincent waren fabelhafte Gastgeber....
                                ...                        
72497     Find something better seems hard.<br/>The vill...
108713                                   This is very clean
66235     This is our first stay with AirBNB. We have st...
15061     I cannot say enough good things about this vil...
212614    What a lovely experience at Isabella’s home in...
Name: comments, Length: 48552, dtype: object

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
df['comments_str'] = df['comments'].apply(lambda x:str(x))
df['clean_comments'] = df['comments_str'].apply(lambda x:remove_punctuation(x))
df['comments_lower'] = df['clean_comments'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,label,comments_str,clean_comments,comments_lower
69,27966,40643115,2015-08-01,37775738,Angelika,Wir hatten eine schöne Zeit in Antonia Apartme...,0,Wir hatten eine schöne Zeit in Antonia Apartme...,Wir hatten eine schöne Zeit in Antonia Apartme...,wir hatten eine schöne zeit in antonia apartme...
71,27966,44093265,2015-08-24,20693705,Karim,Les moments que nous avons passés avec Antonia...,0,Les moments que nous avons passés avec Antonia...,Les moments que nous avons passés avec Antonia...,les moments que nous avons passés avec antonia...
85,27966,102259818,2016-09-17,6941208,Annik,Assurément ma meilleure expérience Airbnb! Man...,0,Assurément ma meilleure expérience Airbnb! Man...,Assurément ma meilleure expérience Airbnb Mano...,assurément ma meilleure expérience airbnb mano...
112,27966,402574129623837353,2021-07-09,177831668,Olesya,Sehr empfehlenswert! Antonia und Manolis sind ...,0,Sehr empfehlenswert! Antonia und Manolis sind ...,Sehr empfehlenswert Antonia und Manolis sind s...,sehr empfehlenswert antonia und manolis sind s...
129,28970,3320683,2013-01-10,4016237,Gudrun,Eirini und Vincent waren fabelhafte Gastgeber....,0,Eirini und Vincent waren fabelhafte Gastgeber....,Eirini und Vincent waren fabelhafte Gastgeber ...,eirini und vincent waren fabelhafte gastgeber ...


In [11]:
# tokenizing the comments
tokenized_comments = []

for comment in df["comments_lower"]:
    tokenized_comments.append(comment.split())
    
tokenized_comments

[['wir',
  'hatten',
  'eine',
  'schöne',
  'zeit',
  'in',
  'antonia',
  'apartment',
  'alles',
  'war',
  'so',
  'wie',
  'beschrieben',
  'täglich',
  'wurden',
  'wir',
  'mit',
  'griechischen',
  'köstlichkeiten',
  'verwöhnt',
  'und',
  'auch',
  'bei',
  'sonnenbrand',
  'gab',
  'es',
  'hilfe',
  'das',
  'apartment',
  'war',
  'sehr',
  'schön',
  'und',
  'ruhig',
  'gelegen',
  'vielen',
  'dank'],
 ['les',
  'moments',
  'que',
  'nous',
  'avons',
  'passés',
  'avec',
  'antonia',
  'sa',
  'famille',
  'et',
  'ses',
  'amies',
  'étaient',
  'absolument',
  'fabuleux',
  'elle',
  'a',
  'été',
  'très',
  'attentive',
  'à',
  'tous',
  'nos',
  'besoins',
  'très',
  'serviable',
  'et',
  'toutes',
  'ses',
  'petites',
  'attentions',
  'nous',
  'ont',
  'beaucoup',
  'touchés',
  'nous',
  'avons',
  'grâce',
  'à',
  'elle',
  'pu',
  'découvrir',
  'la',
  'vraie',
  'hospitalité',
  'des',
  'crétois',
  'ses',
  '15',
  'jours',
  'nous',
  'ont',
  'f

In [12]:
from itertools import chain

flatten_comments = list(chain.from_iterable(tokenized_comments))
unique_words = set(flatten_comments)

In [13]:
WordsForids = {"":0}
idsForWords = {0:""}

for Id , word in enumerate(unique_words):
    WordsForids[word] = Id + 1

for Id , word in enumerate(unique_words):
    idsForWords[Id + 1] = word

In [14]:
# from word to number

temp = []
X = []

for comment in tokenized_comments:
    temp = []
    for word in comment:
        temp.append(WordsForids[word])
    X.append(temp)

X

[[73308,
  42835,
  40315,
  792,
  56469,
  92396,
  121134,
  29666,
  36206,
  67927,
  72509,
  65898,
  10228,
  106249,
  101978,
  73308,
  111253,
  53890,
  121772,
  74087,
  115739,
  21633,
  122112,
  94965,
  47071,
  26124,
  103233,
  13500,
  29666,
  67927,
  112397,
  79716,
  115739,
  118688,
  19467,
  40617,
  124069],
 [28425,
  55462,
  64910,
  39005,
  60449,
  67671,
  87777,
  121134,
  36220,
  97870,
  49703,
  168,
  68040,
  80638,
  51616,
  67965,
  34677,
  69766,
  116768,
  58894,
  30820,
  23098,
  28294,
  52981,
  930,
  58894,
  82492,
  49703,
  108968,
  168,
  118218,
  65673,
  39005,
  2207,
  811,
  21576,
  39005,
  60449,
  89274,
  23098,
  34677,
  98042,
  82355,
  69449,
  94373,
  39700,
  44640,
  123495,
  168,
  127412,
  10384,
  39005,
  2207,
  62184,
  102617,
  85595,
  58894,
  93111,
  49123,
  64910,
  39005,
  13838,
  23098,
  111361,
  78627,
  107682,
  44640,
  100529,
  106889,
  46020,
  26525,
  29368,
  97870,


In [15]:
max_len = len(max(X, key=len))
X = sequence.pad_sequences(X, maxlen=max_len)
X

array([[     0,      0,      0, ...,  19467,  40617, 124069],
       [     0,      0,      0, ..., 125356,  85595,  49020],
       [     0,      0,      0, ...,  21166,  27533,  93277],
       ...,
       [     0,      0,      0, ...,   7841,  19381,  65351],
       [     0,      0,      0, ...,  58864,  85103, 104767],
       [     0,      0,      0, ...,  20285,  33761,  85886]])

In [16]:
from sklearn.model_selection import train_test_split
y = df["label"]

X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.3, shuffle = True)
X_train

array([[     0,      0,      0, ..., 119850,  69810,   2530],
       [     0,      0,      0, ...,  33080,  25496,  44280],
       [     0,      0,      0, ...,  53899,  69396,  84799],
       ...,
       [     0,      0,      0, ...,  98337, 104651,  24173],
       [     0,      0,      0, ...,  17627,  58081,  32133],
       [     0,      0,      0, ...,  97297,  97228,  36380]])

In [17]:
del(df, comments)

In [18]:
del(unique_words, temp, labels, tokenized_comments)

In [19]:
del(flatten_comments)

## Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

Cs = [0.01, 1, 100]

for c in Cs:
    logisticRegr = LogisticRegression(C = c)
    logisticRegr.fit(X_train, y_train)
    print(f"C: {c} , acurracy: {logisticRegr.score(X_test, y_test)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C: 0.01 , acurracy: 0.5950157901963476


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C: 1 , acurracy: 0.5940546478099684
C: 100 , acurracy: 0.5953590553343402


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Naive Bayes

In [23]:
import pandas as pd
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
classifiers = [
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
    ComplementNB(),               
                  ]
 
# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)
 
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    log_entry = pd.DataFrame([[name, acc*100, 11]], columns=log_cols)
    log = log.append(log_entry)
    
    print("="*30)

GaussianNB
****Results****
Accuracy: 50.9337%
MultinomialNB
****Results****
Accuracy: 55.9659%
BernoulliNB
****Results****
Accuracy: 59.4810%
ComplementNB
****Results****
Accuracy: 55.9659%


## Decision Tree (Random Forest, Adamboost)

In [24]:
from sklearn.ensemble import RandomForestClassifier

max_depths = [2, 5, 7, 10]
max_features = ["sqrt", "log2"]

for max_depth in max_depths:
    for n_feat in max_features:
        rfc = RandomForestClassifier(max_depth = max_depth, max_features = n_feat)
        rfc.fit(X_train, y_train)
        print(f"Max Depth: {max_depth}, Max features: {n_feat}, Accuracy: {rfc.score(X_test, y_test)}")

Max Depth: 2, Max features: sqrt, Accuracy: 0.6127282713167651
Max Depth: 2, Max features: log2, Accuracy: 0.6141013318687354
Max Depth: 5, Max features: sqrt, Accuracy: 0.6316765069339558
Max Depth: 5, Max features: log2, Accuracy: 0.6173280241658657
Max Depth: 7, Max features: sqrt, Accuracy: 0.6453384594260607
Max Depth: 7, Max features: log2, Accuracy: 0.62453659206371
Max Depth: 10, Max features: sqrt, Accuracy: 0.6833035836880407
Max Depth: 10, Max features: log2, Accuracy: 0.6397089111629823


In [25]:
from sklearn.ensemble import AdaBoostClassifier

n_estimators_array = [10, 50, 100]
learning_rates = [0.1, 0.5, 1]

# The base estimator is Desicion Tree Classifier
for n_estimators in n_estimators_array:
    for lr in learning_rates:
        adc = AdaBoostClassifier(n_estimators = n_estimators, learning_rate = lr)
        adc.fit(X_train, y_train)
        print(f"Number of estimators: {n_estimators}, Learning rate: {lr}, Accuracy: {adc.score(X_test, y_test)}")

Number of estimators: 10, Learning rate: 0.1, Accuracy: 0.6126596182891666
Number of estimators: 10, Learning rate: 0.5, Accuracy: 0.6459563366744473
Number of estimators: 10, Learning rate: 1, Accuracy: 0.6706714266099135
Number of estimators: 50, Learning rate: 0.1, Accuracy: 0.6708087326651105
Number of estimators: 50, Learning rate: 0.5, Accuracy: 0.7358918028285047
Number of estimators: 50, Learning rate: 1, Accuracy: 0.7694631333241796
Number of estimators: 100, Learning rate: 0.1, Accuracy: 0.6995056982012907
Number of estimators: 100, Learning rate: 0.5, Accuracy: 0.7798297404915556
Number of estimators: 100, Learning rate: 1, Accuracy: 0.7975422216119731


# SVM

In [26]:
from sklearn.svm import SVC

kernels = ['rbf', 'poly', 'linear' , 'sigmoid']
cs = [0.01, 1, 10]

# decision tree does not need scaled data to perform better
# the same applies to all ensembling methods using desicion tree as base estimator
for kernel in kernels:
    for c in cs:
        svc = SVC(kernel = kernel, C = c, max_iter = 500)
        svc.fit(X_train, y_train)
        print(f"Kernel: {kernel}, C: {c}, Accuracy: {svc.score(X_test, y_test)}")



Kernel: rbf, C: 0.01, Accuracy: 0.6032541535081697




Kernel: rbf, C: 1, Accuracy: 0.6014691747906082




Kernel: rbf, C: 10, Accuracy: 0.5946038720307566




Kernel: poly, C: 0.01, Accuracy: 0.5034326513799259




Kernel: poly, C: 1, Accuracy: 0.5083756693670191




Kernel: poly, C: 10, Accuracy: 0.5349443910476452




Kernel: linear, C: 0.01, Accuracy: 0.5209391734175477




Kernel: linear, C: 1, Accuracy: 0.5209391734175477




Kernel: linear, C: 10, Accuracy: 0.5209391734175477




Kernel: sigmoid, C: 0.01, Accuracy: 0.4088287793491693




Kernel: sigmoid, C: 1, Accuracy: 0.4312783193738844




Kernel: sigmoid, C: 10, Accuracy: 0.4318275435946725


## Fully Connected NN without Embedding layer

In [27]:
# Import the dependencies
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, Dropout, Conv1D, MaxPooling1D, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import numpy as np

In [28]:
len(X_train[20])

989

In [29]:
X_train

array([[     0,      0,      0, ..., 119850,  69810,   2530],
       [     0,      0,      0, ...,  33080,  25496,  44280],
       [     0,      0,      0, ...,  53899,  69396,  84799],
       ...,
       [     0,      0,      0, ...,  98337, 104651,  24173],
       [     0,      0,      0, ...,  17627,  58081,  32133],
       [     0,      0,      0, ...,  97297,  97228,  36380]])

In [30]:

model = Sequential()
model.add(Dense(20, activation='relu', input_dim=len(X_train[0])))
model.add(Dropout(0.3))
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 20)                19800     
                                                                 
 dropout (Dropout)           (None, 20)                0         
                                                                 
 dense_1 (Dense)             (None, 30)                630       
                                                                 
 dropout_1 (Dropout)         (None, 30)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 20,461
Trainable params: 20,461
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.fit(X_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20bbf1c1e50>

In [33]:
model_scores = model.evaluate(X_test, y_test, verbose=0)

# Print out the accuracy of the model on the test set
print("Model accuracy on the test dataset: {0:.2f}%".format(model_scores[1]*100))

Model accuracy on the test dataset: 49.67%


In [39]:
max_review_length = len(X_train[0])
max_words = len(WordsForids)


# Using Embedding Layer

## Fully Connected NN 

In [40]:
# Define the layers in the model
embedding_vector_length = 32

# Define the layers in the model
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_review_length))
model.add(Flatten())
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 989, 32)           4118304   
                                                                 
 flatten (Flatten)           (None, 31648)             0         
                                                                 
 dense_3 (Dense)             (None, 30)                949470    
                                                                 
 dropout_2 (Dropout)         (None, 30)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 31        
                                                                 
Total params: 5,067,805
Trainable params: 5,067,805
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.fit(X_train, y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x20be0b92370>

In [43]:
model_scores = model.evaluate(X_test, y_test, verbose=0)

# Print out the accuracy of the model on the test set
print("Model accuracy on the test dataset: {0:.2f}%".format(model_scores[1]*100))

Model accuracy on the test dataset: 95.61%


# CNN

In [44]:
# Define the layers in the model
embedding_vector_length = 32

# Define the layers in the model
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(30, kernel_size = 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [45]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 989, 32)           4118304   
                                                                 
 conv1d (Conv1D)             (None, 987, 30)           2910      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 493, 30)          0         
 )                                                               
                                                                 
 flatten_1 (Flatten)         (None, 14790)             0         
                                                                 
 dense_5 (Dense)             (None, 30)                443730    
                                                                 
 dropout_3 (Dropout)         (None, 30)                0         
                                                      

In [46]:
model.fit(X_train, y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x20be12ffca0>

In [47]:
model_scores = model.evaluate(X_test, y_test, verbose=0)

# Print out the accuracy of the model on the test set
print("Model accuracy on the test dataset: {0:.2f}%".format(model_scores[1]*100))

Model accuracy on the test dataset: 95.52%


# RNN (using LSTM)

In [48]:
# Define the layers in the model
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(20))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [49]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 989, 32)           4118304   
                                                                 
 lstm (LSTM)                 (None, 20)                4240      
                                                                 
 dense_7 (Dense)             (None, 1)                 21        
                                                                 
Total params: 4,122,565
Trainable params: 4,122,565
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.fit(X_train, y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x20c09580a90>

In [51]:
model_scores = model.evaluate(X_test, y_test, verbose=0)

# Print out the accuracy of the model on the test set
print("Model accuracy on the test dataset: {0:.2f}%".format(model_scores[1]*100))

Model accuracy on the test dataset: 96.31%


## CNN-RNN(using GRU) Hybrid

In [52]:

embedding_vector_length = 32

# Define the layers in the model
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(30, kernel_size = 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(GRU(20))
model.add(Dense(20, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [53]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 989, 32)           4118304   
                                                                 
 conv1d_1 (Conv1D)           (None, 987, 30)           2910      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 493, 30)          0         
 1D)                                                             
                                                                 
 gru (GRU)                   (None, 20)                3120      
                                                                 
 dense_8 (Dense)             (None, 20)                420       
                                                                 
 dropout_4 (Dropout)         (None, 20)                0         
                                                      

In [54]:
model.fit(X_train, y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x20c0a093a00>

In [56]:
model_scores = model.evaluate(X_test, y_test, verbose=0)

# Print out the accuracy of the model on the test set
print("Model accuracy on the test dataset: {0:.2f}%".format(model_scores[1]*100))

Model accuracy on the test dataset: 95.26%
