In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import models, layers, optimizers
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re
import nltk
nltk.download('punkt') # At first you have to download these nltk packages.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luisg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luisg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\luisg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [73]:
import os
print(os.listdir("../data"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


In [109]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts('../data/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('../data/test.ft.txt.bz2')

In [111]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_texts, train_labels, test_size=0.2, stratify=train_labels)

print("Train data:",  len(X_train), len(y_train))
print("Valid data:",  len(X_val), len(y_val))

Train data: 2880000 2880000
Valid data: 720000 720000


In [107]:
# tf.strings.lower('ONE HOT ENCODING').numpy()
nltk.word_tokenize('ONE HOT ENCODING')

['ONE', 'HOT', 'ENCODING']

In [116]:
stop_words = stopwords.words('english') # defining stop_words
stop_words.remove('not') # removing not from the stop_words list as it contains value in negative movies
lemmatizer = WordNetLemmatizer()

In [None]:
def data_preprocessing(review):

  # data cleaning
  review = re.sub(re.compile('<.*?>'), '', review) #removing html tags
  review =  re.sub('[^A-Za-z0-9]+', ' ', review) #taking only words
  
  # lowercase
  review = review.lower()
  
  # tokenization
  tokens = nltk.word_tokenize(review) # converts review to tokens
  
  # stop_words removal
  review = [word for word in tokens if word not in stop_words] #removing stop words
  
  # lemmatization
  review = [lemmatizer.lemmatize(word) for word in review]
  
  # join words in preprocessed review
  review = ' '.join(review)
  
  return review

In [120]:
prep_train_texts = [data_preprocessing(text) for text in X_train]

In [121]:
prep_valid_texts = [data_preprocessing(text) for text in X_val]

In [122]:
prep_test_texts = [data_preprocessing(text) for text in test_texts]

In [123]:
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(prep_train_texts)
tk_train_texts = tokenizer.texts_to_sequences(prep_train_texts)
tk_val_texts = tokenizer.texts_to_sequences(prep_valid_texts)
tk_test_texts = tokenizer.texts_to_sequences(prep_test_texts)

In [127]:
len(tk_test_texts[10])

18

In [128]:
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
tk_train_texts = pad_sequences(tk_train_texts, maxlen=MAX_LENGTH)
tk_val_texts = pad_sequences(tk_val_texts, maxlen=MAX_LENGTH)
tk_test_texts = pad_sequences(tk_test_texts, maxlen=MAX_LENGTH)

In [130]:
tk_test_texts.shape

(400000, 1015)

In [136]:
from keras.layers import BatchNormalization


def build_model():
    sequences = layers.Input(shape=(MAX_LENGTH,))
    embedded = layers.Embedding(MAX_FEATURES, 64)(sequences)
    x = layers.Conv1D(64, 3, activation='relu')(embedded)
    x = BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = BatchNormalization()(x)
    x = layers.MaxPool1D(5)(x)
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Flatten()(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model
    
model = build_model()

In [142]:
model.fit(
    tk_train_texts, 
    y_train, 
    batch_size=128,
    epochs=1,
    validation_data=(tk_val_texts, y_val), )

[1m14534/22500[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m11:34[0m 87ms/step - binary_accuracy: 0.8609 - loss: 0.3157

KeyboardInterrupt: 

In [143]:
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13775129602208402442
xla_global_id: -1
]


In [141]:
len(train_labels)

3600000

In [79]:
# train = {
#     'review': train_texts,
#     'label': train_labels
# }

# test = {
#     'review': test_texts,
#     'label': test_labels
# }

In [80]:
# df_train = pd.DataFrame(train)
# df_train.head()

In [81]:
# df_test = pd.DataFrame(test)
# df_test.head()

### Data preprocessing

In [115]:
# stop_words = stopwords.words('english') # defining stop_words
# stop_words.remove('not') # removing not from the stop_words list as it contains value in negative movies
# lemmatizer = WordNetLemmatizer()

In [83]:
# def data_preprocessing(review):
    
#   # data cleaning
#   review = re.sub(re.compile('<.*?>'), '', review) #removing html tags
#   review =  re.sub('[^A-Za-z0-9]+', ' ', review) #taking only words
  
#   # lowercase
#   review = review.lower()
  
#   # tokenization
#   tokens = nltk.word_tokenize(review) # converts review to tokens
  
#   # stop_words removal
#   review = [word for word in tokens if word not in stop_words] #removing stop words
  
#   # lemmatization
#   review = [lemmatizer.lemmatize(word) for word in review]
  
#   # join words in preprocessed review
#   review = ' '.join(review)
  
#   return review

In [84]:
# df_test['preprocessed_review'] = df_test['review'].apply(lambda review: data_preprocessing(review))
# df_test.head()

In [85]:
# df_train['preprocessed_review'] = df_train['review'].apply(lambda review: data_preprocessing(review))
# df_train.head()

In [86]:
# from sklearn.model_selection import train_test_split

# data = df_train.copy()
# y = data['preprocessed_review'].values.tolist()
# data.drop(['label', 'review'], axis=1, inplace=True)

# X_train, X_val, y_train, y_val = train_test_split(data, y, test_size=0.2, stratify=y)

# print("Train data:",  X_train.shape, y_train.shape)
# print("Valid data:",  X_val.shape, y_val.shape)

In [87]:
# from sklearn.model_selection import train_test_split

# train_txt = df_train['preprocessed_review'].values#.tolist()
# train_lbl = df_train['label'].values#.tolist()
# test_txt = df_test['preprocessed_review'].values#.tolist()
# test_lbl = df_test['label'].values#.tolist()

# print(len(train_txt))
# print(len(train_lbl))
# print(len(test_txt))
# print(len(test_lbl))

# X_train, X_val, y_train, y_val = train_test_split(train_txt, train_lbl, test_size=0.2, stratify=train_lbl)

# print("Train data:",  X_train.shape, y_train.shape)
# print("Valid data:",  X_val.shape, y_val.shape)

In [70]:
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
X_train = pad_sequences(X_train, maxlen=MAX_LENGTH)
X_val = pad_sequences(X_val, maxlen=MAX_LENGTH)
test_txt = pad_sequences(test_txt, maxlen=MAX_LENGTH)

ValueError: invalid literal for int() with base 10: 'invaluable simply put miss vera dream job cross dressing slapping pancake make thrift store rag exuding class glamour miss vera provides reader menagerie resource evolve quintessential ultra femme vi

In [71]:
MAX_LENGTH

1015