In [39]:
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from tqdm import tqdm
import pandas as pd
import pickle

In [15]:
# Load the entire training file into memory, one list entry per line
# (each entry keeps its trailing newline).
with open('dataset/train.ft.txt', 'r', encoding='utf8') as f:
    txt = f.readlines()

In [16]:
# Every raw line has the form "<label> <review text>". Split each line ONCE on the
# first space (the original split every line twice) and build the frame in one go.
LABEL_MAP = {'__label__2': 1, '__label__1': 0}

pairs = [line.split(' ', maxsplit=1) for line in txt]
df = pd.DataFrame({
    'text': [pair[1] for pair in pairs],    # IndexError here means a line had no space
    'label': [pair[0] for pair in pairs],
})
# Unknown label strings (anything not in LABEL_MAP) become NaN.
df['label'] = df['label'].map(LABEL_MAP)

In [17]:
# Contiguous, non-overlapping positional row ranges:
# rows [0, 3.0M) -> train, [3.0M, 3.3M) -> validation, [3.3M, 3.6M) -> test.
TRAIN_END, VALID_END, TEST_END = 3_000_000, 3_300_000, 3_600_000

df_train = df.iloc[:TRAIN_END]
df_valid = df.iloc[TRAIN_END:VALID_END]
df_test = df.iloc[VALID_END:TEST_END]

In [18]:
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [19]:
# Display the training split (columns: text, label); the notebook renders a
# truncated view of the frame.
df_train

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1
...,...,...
2999995,"Very nice!: Nice tool, worth every penny! I us...",1
2999996,the switch is junk: I have read other reviews ...,0
2999997,"Great Tool: This is my first Dewalt, and I am ...",1
2999998,Don't expect much: The D26451 is a poor replac...,0


In [20]:
def custom_standardization(input_data):
  """Standardize text for the vectorizer by lowercasing it.

  Lowercasing is the only transformation applied to `input_data`;
  the result of `tf.strings.lower` is returned as-is.
  """
  return tf.strings.lower(input_data)

# Vocabulary cap for the vectorizer, and the fixed number of tokens it emits per text.
vocab_size, sequence_length = 10000, 100

# Turns raw strings into integer token ids, using the lowercase-only
# standardization defined above instead of the layer's default.
vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [21]:
# True  -> learn the vocabulary from the training texts and save it to disk.
# False -> reuse the vocabulary saved by a previous run.
VECTORIZE_LAYER_TRAIN = True
VOCAB_PATH = 'vocab.pkl'

if not VECTORIZE_LAYER_TRAIN:
    # NOTE: pickle.load can execute arbitrary code; only load a vocab file you produced yourself.
    with open(VOCAB_PATH, 'rb') as f:
        vocab = pickle.load(f)
    vectorize_layer.set_vocabulary(vocab)
else:
    vectorize_layer.adapt(df_train.text.values)
    with open(VOCAB_PATH, 'wb') as f:
        pickle.dump(vectorize_layer.get_vocabulary(), f)


In [22]:
# One trainable vector of size `embedding_dim` per token id in [0, vocab_size).
embedding_dim = 16
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
)

In [23]:
# Text classifier: raw string -> token ids -> embeddings -> LSTM -> two Conv1D
# stages -> global max pool -> a single un-activated output unit (a logit).
model = tf.keras.Sequential()
for stage in (
    vectorize_layer,
    embedding_layer,
    layers.LSTM(64, return_sequences=True),   # keep the full sequence for the convolutions
    layers.Conv1D(16, 3, activation='relu'),
    layers.MaxPooling1D(),
    layers.Conv1D(16, 3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(1),                          # no activation: output is a logit
):
    model.add(stage)

In [24]:
# The model's final Dense(1) has no activation, so it emits logits:
#  - the loss must be told so (from_logits=True);
#  - accuracy must threshold at 0.0 (logit 0 == probability 0.5). The plain
#    'accuracy' string resolves to binary accuracy with a 0.5 threshold, which
#    is wrong for logits and under-counts positive predictions.
# The metric keeps the name 'accuracy' so log/history keys are unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [25]:
# Toggle for early stopping during model.fit.
EARLY_STOPPING = True
# Patience must be smaller than the number of training epochs (5 in this notebook),
# otherwise the callback can never fire; the previous value of 3000 made it a no-op.
EARLY_STOPPING_PATIENCE = 2

if EARLY_STOPPING:
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            # Training loss is monitored because it is always available; switch to
            # 'val_loss' if validation data is supplied to model.fit.
            monitor='loss',
            patience=EARLY_STOPPING_PATIENCE,
            restore_best_weights=True,
        )
    ]
else:
    callbacks = []

In [33]:
# Train on the training split and evaluate on the validation split after every
# epoch, so val_loss / val_accuracy are reported alongside the training metrics.
# The returned History object is kept for later inspection or plotting.
with tf.device("/device:GPU:0"):
    history = model.fit(
        df_train.text, df_train.label,
        validation_data=(df_valid.text, df_valid.label),
        callbacks = callbacks,
        epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
# The model outputs one raw logit per review (final Dense(1) has no activation).
valid_logits = model.predict(df_valid.text)
# logit > 0  <=>  predicted probability > 0.5; vectorised instead of a Python loop,
# then flattened to a 1-D array of 0/1 class labels.
valid_preds = (valid_logits > 0).astype(int).ravel()

In [37]:
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth goes first.
# (Accuracy is symmetric, so the value is unchanged, but the order now matches the API.)
accuracy_score(df_valid.label, valid_preds)
# NOTE(review): presumably the score from an earlier run — confirm or remove.
#0.93881

0.9427033333333333