In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that
# files stored on Drive can be accessed from this notebook.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
# Switch the working directory to the IMDB project folder on Drive; the file
# names used in later cells carry no directory prefix and resolve against it.
%cd 'drive/My Drive/IMDB/'

/content/drive/My Drive/IMDB


In [1]:
import os
import pandas as pd
import numpy as np

from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding,Bidirectional,GRU

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import KFold

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import pickle

# Download the NLTK resources this notebook imports helpers for: the stopword
# corpus (nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')


Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
def preprocess_reviews(reviews):
  """Normalise raw review strings for tokenisation.

  Each review is processed as follows:
    1. lower-case the text and delete every match of REPLACE_NO_SPACE,
    2. replace every match of REPLACE_WITH_EOS with the marker " EOS ",
    3. replace every match of REPLACE_WITH_SPACE with a single space,
    4. split on single spaces and lemmatize each token twice: first with the
       lemmatizer's default part of speech, then as a verb ("v"),
    5. drop tokens contained in stop_words and re-join with single spaces.

  Relies on the module-level REPLACE_NO_SPACE, REPLACE_WITH_EOS,
  REPLACE_WITH_SPACE, lemmatizer and stop_words.

  Args:
    reviews: iterable of review strings.

  Returns:
    A new list with one cleaned string per input review, in input order.
  """
  cleaned = []

  for raw in reviews:
    # strip / replace special characters
    text = REPLACE_NO_SPACE.sub("", raw.lower())
    text = REPLACE_WITH_EOS.sub(" EOS ", text)
    text = REPLACE_WITH_SPACE.sub(" ", text)

    kept = []
    for token in text.split(" "):
      # lemmatize (default POS first, then as verb)
      lemma = lemmatizer.lemmatize(lemmatizer.lemmatize(token), "v")
      # remove stopwords
      if lemma not in stop_words:
        kept.append(lemma)

    cleaned.append(" ".join(kept))

  return cleaned

def split_seq(s, y, SEQ_LEN):
  """Cut one token sequence into fixed-length chunks and repeat its label.

  The sequence is sliced into consecutive pieces of at most SEQ_LEN tokens;
  `sequence.pad_sequences` (default settings) then brings every piece to
  exactly SEQ_LEN entries.

  An empty sequence (e.g. a review whose tokens are all outside the
  tokenizer vocabulary) produces a single all-padding chunk instead of zero
  chunks, so every review contributes at least one row. With zero rows the
  callers' per-review `pred[idx:idx+l].mean()` would average an empty slice
  and yield NaN.

  Args:
    s: list of integer token ids for one review.
    y: label of the review; repeated once per chunk.
    SEQ_LEN: chunk length.

  Returns:
    Tuple (s_extend, y_extend, l):
      s_extend: array of shape (l, SEQ_LEN) with the padded chunks,
      y_extend: array of shape (l, 1) holding `y` for each chunk,
      l: number of chunks (always >= 1).
  """
  chunks = [s[i:i+SEQ_LEN] for i in range(0, len(s), SEQ_LEN)]
  if not chunks:
    # no tokens at all -> keep one (fully padded) row for this review
    chunks = [[]]

  s_extend = sequence.pad_sequences(chunks, maxlen = SEQ_LEN)
  l = s_extend.shape[0]
  y_extend = np.repeat(y, l).reshape(-1,1)

  return(s_extend, y_extend, l)

In [0]:
# Globals read by preprocess_reviews: the English stopword list (as a set for
# fast membership tests) and a WordNet lemmatizer instance.
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

# Regular expressions used by preprocess_reviews. Raw string literals are used
# so that regex escapes such as \s or \[ are not treated as (invalid) Python
# string escapes.

# Punctuation that is deleted outright:  ; : ' , " ( ) [ ]
REPLACE_NO_SPACE = re.compile(r"[;:',\"()\[\]]")
# Sentence terminators that are replaced by an end-of-sentence marker:  . ! ?
REPLACE_WITH_EOS = re.compile(r"[.!?]")
# Replaced by a single space: a doubled HTML line break (<br /><br />),
# hyphens and slashes. A single <br /> is NOT matched as a whole; only its
# slash is replaced.
REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")


In [4]:
# Training data file. No directory prefix is prepended, so the path resolves
# against the current working directory.
#PATH = 'drive/My Drive/IMDB/'
FILENAME = "reviews_train.tsv"

FILEPATH =  FILENAME
# Bare expression: displays the resolved path as the cell output.
FILEPATH

'reviews_train.tsv'

In [8]:
# List the current working directory (sanity check that the data file is there).
!ls

CrossValidation.ipynb  reviews_train.tsv  Uebung4-NotWorkingAsIntended.py
requirements.txt       tokenizer.pickle


In [0]:
## Cross Validation
#
# 10-fold cross validation of the bidirectional GRU sentiment classifier.
# For every fold:
#   * the tokenizer is fitted on the training part only,
#   * each review is cut into SEQ_LEN-token chunks (split_seq); every chunk is
#     a training observation carrying the label of its review,
#   * a fresh model is trained,
#   * a validation review is classified by averaging the predictions of its
#     chunks and rounding the mean.
# The per-fold accuracies are collected in `accs`.

SEED = 42  # fixes the fold assignment so that runs are comparable

#input params
MAX_VOCAB = 10000
SEQ_LEN = 70

#model params
embedding_size= 100
h_size = 32

#train params
batch_size = 128
epochs = 1

df = pd.read_table(FILEPATH, header = None,names = ['id', 'y','text'])
kf = KFold(n_splits=10, shuffle = True, random_state = SEED)

# binary labels for the whole file (1 = 'pos'), computed once for all folds
y_all = (df.y.values == 'pos').astype('int')

accs = []

for j, (trn_idx, val_idx) in enumerate(kf.split(df)):

  ##training
  # KFold yields positional indices -> select rows by position
  reviews_trn = df['text'].iloc[trn_idx]
  y_trn = y_all[trn_idx]

  #preprocess and tokenize training data
  X_trn = preprocess_reviews(reviews_trn)

  tokenizer = Tokenizer(num_words = MAX_VOCAB)
  tokenizer.fit_on_texts(X_trn)
  X_trn = tokenizer.texts_to_sequences(X_trn)

  #add observations by dividing reviews at SEQ_LEN in sub sequences
  # Chunks are collected in lists and stacked once; growing an array with
  # np.vstack inside the loop would copy all previous rows on every iteration.
  # The empty int arrays fix the dtype and shape of the stacked result.
  seq_chunks = [np.empty([0,SEQ_LEN], int)]
  label_chunks = [np.empty([0,1], int)]

  for s, y in zip(X_trn, y_trn):
    s_ext, y_ext, _ = split_seq(s, y, SEQ_LEN)
    seq_chunks.append(s_ext)
    label_chunks.append(y_ext)

  long_sequences = np.vstack(seq_chunks)
  y_long = np.vstack(label_chunks)
  X_trn = long_sequences
  y_trn = y_long

  #construct model (a new, untrained model for every fold)
  model=Sequential()
  model.add(Embedding(MAX_VOCAB, embedding_size, input_length=SEQ_LEN))
  model.add(Bidirectional(GRU(h_size, input_shape = (SEQ_LEN,embedding_size))))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  #train model for one epoch
  model.fit(X_trn,y_trn, batch_size=batch_size, epochs=epochs)

  ##validation
  reviews_val = df['text'].iloc[val_idx]
  y_val = y_all[val_idx]

  X_val = preprocess_reviews(reviews_val)
  X_val = tokenizer.texts_to_sequences(X_val)

  #prepare validation data
  val_chunks = [np.empty([0,SEQ_LEN], int)]
  len_ar = []  # number of chunks per validation review

  for s in X_val:
    # The repeated-label output of split_seq is discarded here, so a dummy
    # label is passed instead of a value left over from the training loop.
    s_ext, _, n_chunks = split_seq(s, 0, SEQ_LEN)
    val_chunks.append(s_ext)
    len_ar.append(n_chunks)

  X_long = np.vstack(val_chunks)
  len_ar = np.array(len_ar)

  # make predictions: one probability per chunk, aggregated per review
  pred = model.predict(X_long)
  predictions = []
  idx = 0
  for n_chunks in len_ar:
    # mean probability over the chunks of one review, rounded to 0 / 1
    y_hat = pred[idx: (idx+n_chunks)].mean().round()
    predictions.append(y_hat)
    idx = idx + n_chunks

  predictions = np.array(predictions)
  acc = (predictions == y_val).mean()
  accs.append(acc)
  print("Cross Fold: " + str(j))
  print("Accuracy: " +str(acc))

Epoch 1/1
Cross Fold: 0
Accuracy: 0.9
Epoch 1/1
Cross Fold: 1
Accuracy: 0.8868
Epoch 1/1
Cross Fold: 2
Accuracy: 0.8928
Epoch 1/1
Cross Fold: 3
Accuracy: 0.8936
Epoch 1/1
Cross Fold: 4
Accuracy: 0.8888
Epoch 1/1
Cross Fold: 5
Accuracy: 0.892
Epoch 1/1
Cross Fold: 6
Accuracy: 0.88
Epoch 1/1
Cross Fold: 7
Accuracy: 0.8916
Epoch 1/1


In [0]:
# Summarise the cross-validation run in a one-column table:
# one row per fold followed by mean, standard deviation and median.

# keep at most the first 10 fold accuracies
accs = accs[0:10]

# The statistics are computed on a separate array of fold accuracies. Appending
# them to a list that aliases `accs` would feed the mean into the standard
# deviation and both into the median.
fold_accs = np.asarray(accs, dtype=float)
CV_res = list(accs) + [fold_accs.mean(), fold_accs.std(), np.median(fold_accs)]

# Labels follow the number of folds actually available, so the table can also
# be built when the cross validation was interrupted before fold 10.
row_labels = ["Fold" + str(i) for i in range(len(accs))] + ['Mean', 'Std', 'Median']
CV_res = pd.DataFrame(CV_res, columns = ['Accuracy'], index = row_labels)
CV_res

In [0]:
# Write the cross-validation summary table (CV_res) to a tab-separated file in
# the current working directory.
RESULTS_FILE = 'Cross_Validation_results.tsv'
RESULTS_PATH =  RESULTS_FILE
CV_res.to_csv(RESULTS_PATH,  sep = '\t')

In [0]:
## Train
#
# Fit the final model on the complete training file. The fitted tokenizer is
# pickled to TOKENIZER_NAME and the trained network is saved to MODEL_NAME so
# that both can be reloaded later.

TRAIN_FILE = 'reviews_train.tsv'

TRAIN_PATH =  TRAIN_FILE

MODEL_NAME = 'GRU_BIDIREC.h5'
TOKENIZER_NAME = 'tokenizer.pickle'

#input params
MAX_VOCAB = 10000
SEQ_LEN = 70

#model params
embedding_size= 100
h_size = 32 

#train params
batch_size = 128
epochs = 1

#read data
df = pd.read_table(TRAIN_PATH, header = None,names = ['id', 'y','text'])

##training
reviews_trn = df['text']
y_trn = (df.y.values == 'pos').astype('int')  # 1 = 'pos', 0 = anything else

#preprocess and tokenize training data
X_trn = preprocess_reviews(reviews_trn)

tokenizer = Tokenizer(num_words = MAX_VOCAB)
tokenizer.fit_on_texts(X_trn)
# save Tokenizer
with open(TOKENIZER_NAME, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

X_trn = tokenizer.texts_to_sequences(X_trn)

#add observations by dividing reviews at SEQ_LEN in sub sequences
# Chunks are collected in lists and stacked once; growing an array with
# np.vstack inside the loop would copy all previous rows on every iteration.
# The empty int arrays fix the dtype and shape of the stacked result.
seq_chunks = [np.empty([0,SEQ_LEN], int)]
label_chunks = [np.empty([0,1], int)]

for s, y in zip(X_trn, y_trn):
  s_ext, y_ext, _ = split_seq(s, y, SEQ_LEN)
  seq_chunks.append(s_ext)
  label_chunks.append(y_ext)

long_sequences = np.vstack(seq_chunks)
y_long = np.vstack(label_chunks)
X_trn = long_sequences
y_trn = y_long

#construct model
model=Sequential()
model.add(Embedding(MAX_VOCAB, embedding_size, input_length=SEQ_LEN))
model.add(Bidirectional(GRU(h_size, input_shape = (SEQ_LEN,embedding_size))))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#train model for one epoch
model.fit(X_trn,y_trn, batch_size=batch_size, epochs=epochs)

model.save(MODEL_NAME)  # creates a HDF5 File MODELNAME


Epoch 1/1


In [0]:
# Remove the in-memory model and tokenizer; the following prediction cell
# loads both again from the files written during training.
del model  # deletes the existing model
del tokenizer


In [0]:
##Make Predictions
#
# Load the saved tokenizer and model, classify every review in PREDICT_FILE
# and write one 'pos' / 'neg' label per review id to OUTPUT_FILE
# (tab-separated).

PREDICT_FILE = "reviews_train.tsv"

PREDICT_PATH =  PREDICT_FILE

MODEL_NAME = "GRU_BIDIREC.h5"
TOKENIZER_NAME = 'tokenizer.pickle'

OUTPUT_FILE =  "result_file.csv"

df = pd.read_table(PREDICT_PATH, header = None,names = ['id', 'y','text'])


#load tokenizer
# NOTE(review): pickle.load executes arbitrary code from the file. Only load a
# tokenizer file you created yourself (e.g. the one written by the training cell).
with open(TOKENIZER_NAME, 'rb') as handle:
    tokenizer = pickle.load(handle)
#load model
model = load_model(MODEL_NAME)

# Take the chunk length from the loaded model (second entry of its input shape)
# instead of relying on a SEQ_LEN variable left over from an earlier cell: the
# cell then also works on a fresh kernel and always matches the saved model.
SEQ_LEN = model.input_shape[1]

reviews_val = df['text']

X_val = preprocess_reviews(reviews_val)
X_val = tokenizer.texts_to_sequences(X_val)

#prepare validation data
# Chunks are collected in a list and stacked once; growing an array with
# np.vstack inside the loop would copy all previous rows on every iteration.
val_chunks = [np.empty([0,SEQ_LEN], int)]
len_ar = []  # number of chunks per review

for s in X_val:
    # the label argument is a dummy; the repeated labels are discarded
    s_ext, _, n_chunks = split_seq(s, 0, SEQ_LEN)
    val_chunks.append(s_ext)
    len_ar.append(n_chunks)

X_long = np.vstack(val_chunks)
len_ar = np.array(len_ar) 


# make predictions: one probability per chunk, aggregated per review
pred = model.predict(X_long)
predictions = []
idx = 0
for n_chunks in len_ar:
  # mean probability over the chunks of one review, rounded to 0 / 1
  y_hat = pred[idx: (idx+n_chunks)].mean().round()
  predictions.append(y_hat)
  idx = idx + n_chunks

predictions = np.array(predictions)
output = pd.DataFrame(predictions, index = df['id'], columns = ['y_hat'])
output = output.replace([1.0,0.0], ['pos', 'neg'])
output.to_csv(OUTPUT_FILE, sep = '\t')

In [0]:
# List the contents of 'drive/My Drive/data' (path relative to the current
# working directory).
!ls 'drive/My Drive/data'

Cross_Validation_results2.csv	 NotWorkingAsIntended.py  review_51_cut.npy
Cross_Validation_results.csv	 requirements.txt	  reviews_train.tsv
Cross_Validation_results.gsheet  result_file.csv
