# Loading important libraries

In [31]:
import tensorflow as tf 
from tensorflow import keras 
import pandas as pd
import numpy as np

# Loading the data


In [9]:
# Read the three pre-split CSV files (train / test / validation) from the
# working directory into DataFrames, in that order.
X_train_full, X_test_full, X_valid_full = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv", "valid.csv")
)

# Preprocessing the data

In [16]:
# Split every DataFrame into its input texts ("text") and targets ("label").
X_train, y_train = X_train_full["text"], X_train_full["label"]
X_test, y_test = X_test_full["text"], X_test_full["label"]
X_valid, y_valid = X_valid_full["text"], X_valid_full["label"]

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training texts only, so the test and validation
# texts are encoded with the vocabulary learned from the training split.
tokenizer = Tokenizer(num_words=20000, oov_token="oov_tok")
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Encode ``texts`` with the fitted tokenizer and pad/truncate to 250 ids.

    Returns a ``(sequences, padded)`` pair: the raw integer sequences and the
    fixed-length array in which both padding and truncation happen at the end.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=250, padding="post", truncating="post")
    return sequences, padded


X_train_sequence, X_train_padded = _encode_and_pad(X_train)
X_test_sequence, X_test_padded = _encode_and_pad(X_test)
X_valid_sequence, X_valid_padded = _encode_and_pad(X_valid)

In [28]:
# Check which Python type the tokenizer's word_index attribute has.
type(word_index)

dict

In [19]:
# Check the type of the training labels selected from the "label" column.
type(y_train)

pandas.core.series.Series

In [21]:
# Check the type that pad_sequences returned for the validation texts.
type(X_valid_padded)

numpy.ndarray

Converting the labels from pandas Series to NumPy arrays

In [35]:
# Copy each label Series into a plain NumPy array.
y_train_array, y_test_array, y_valid_array = (
    np.array(labels) for labels in (y_train, y_test, y_valid)
)

# Building the model

In [41]:
# Seed TensorFlow's global random generator so the randomly initialised
# weights are repeatable whenever this cell is re-run.
tf.random.set_seed(42)

# Binary text classifier: embed each token id, average the embeddings over
# the sequence, then classify with a small dense head.
# The sequence length is declared through an explicit Input layer instead of
# Embedding's `input_length` argument, which newer Keras releases deprecate.
# The model is still built right away, so `model.summary()` works before fit.
model = tf.keras.Sequential([
        tf.keras.Input(shape=(250,), dtype="int32"),     # 250 = `maxlen` used when padding
        tf.keras.layers.Embedding(20000, 16),            # 20000 = tokenizer `num_words`
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(20, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),  # one probability per input text
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [43]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 250, 16)           320000    
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 20)                340       
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 320,361
Trainable params: 320,361
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Train for 10 epochs on the padded training texts, passing the validation
# split to Keras for monitoring; the returned object is kept in `history`.
history = model.fit(
    x=X_train_padded,
    y=y_train_array,
    epochs=10,
    validation_data=(X_valid_padded, y_valid_array),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Testing the model


In [71]:
# One hand-written review, wrapped in a list because the tokenizer expects a
# batch of texts.
sentence = [" this movie is writeen so bad. The actors actining was pathetic and story is a flop"]

# Encode it with the same tokenizer and padding settings as the training data.
sequence = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequence, maxlen=250, padding="post", truncating="post")

# The model ends in a sigmoid, so this is a single score between 0 and 1.
# NOTE(review): which class a score near 0 or 1 denotes depends on the label
# encoding in the CSV files — confirm against the data.
prediction = model.predict(padded)
print(prediction)

[[0.03303787]]
