In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import ast
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

#import sys
#sys.setrecursionlimit(10000)

In [2]:
#load the encoder, encode the reviews, and positive/negative review list
encoder=tfds.features.text.SubwordTextEncoder.load_from_file('SteamData/SteamReviewVocab')

    
generic = lambda x: ast.literal_eval(x)

starting_dataset=pd.read_csv('SteamData/FormattedReviewRatingList.csv', converters={'Rating': generic})
trimmed_review=starting_dataset.loc[:, 'Trimmed Review'].to_numpy()
ratings=starting_dataset.drop('Trimmed Review', axis=1).to_numpy(dtype='int32')


total_dataset=tf.data.experimental.make_csv_dataset(
    'SteamData/FormattedReviewRatingList.csv', batch_size=1, column_names=['Trimmed Review', 'Rating'], column_defaults=['string', 'int32'], 
    label_name='Rating', field_delim=',', use_quote_delim=True, header=True, num_epochs=None, shuffle=True,
    shuffle_buffer_size=len(ratings)
)


In [3]:
#encode the text
#result = total_dataset.map(lambda review_str, rating_int: encoder.encode(review_str))

encoded_reviews=[]
for i, j in enumerate(trimmed_review):
    encoded_reviews.append(encoder.encode(j))

#creating tensorflow datasets for training
def labeler(review, rating):
    return review, rating
#pairing the labels (good/bad game) with the encoded reviews
encoded_review_rating_list=[]
for i,j in enumerate(encoded_reviews):
    encoded_review_dataset = tf.data.Dataset.from_tensors(tf.cast(j, dtype='int64'))
    encoded_review_rating_list.append(encoded_review_dataset.map(lambda x: labeler(x,ratings[i])))


In [4]:
 #Combine the list of review:score sets into a single tensor dataset.
encoded_review_ratings = encoded_review_rating_list[0]
#test_var_tensor=tf.constant()
for single_dataset in encoded_review_rating_list[1:]:
    encoded_review_ratings=encoded_review_ratings.concatenate(single_dataset)

#Shuffle the datasets to avoid any biases.
buffer_size = len(encoded_reviews)
all_labeled_data = encoded_review_ratings.shuffle(
    buffer_size, reshuffle_each_iteration=False)

In [10]:
##Split the encoded words into training and test datasets, take size amount of data that goes into the training set
training_ratio=0.6
take_size= round(len(encoded_reviews)*training_ratio)
batch_size=30

#Organizing our training and validation data, the padded shapes are set to the longest review (as specified by None keywords)
train_data = encoded_review_ratings.take(take_size)
train_data = train_data.padded_batch(batch_size, padded_shapes=((None,), (1,)))

test_data = encoded_review_ratings.skip(take_size)
test_data = test_data.padded_batch(batch_size, padded_shapes=((None,), (1,)))

In [11]:
embedding_dim=10
learn_rate= 0.001
#determining layers of our neural network
model = keras.Sequential([
    layers.Embedding(encoder.vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(200, activation='relu'),
    layers.Dropout(rate=0.15),
    layers.Dense(100, activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(1)
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          33080     
_________________________________________________________________
global_average_pooling1d (Gl (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 200)               2200      
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5

In [None]:
#set the optimizer and loss equation here, then compile and run the model.
opt = tf.keras.optimizers.Adam(learning_rate=learn_rate)
loss_eqn=tf.keras.losses.BinaryCrossentropy(from_logits='True')

model.compile(optimizer=opt,
              loss=loss_eqn,
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=30,
    validation_data=test_data)

In [None]:
#plotting the accuracy and losses for our training and validation sets.
history_dict = history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss=history_dict['loss']
val_loss=history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(15,10))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.figure(figsize=(15,10))
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()

In [None]:
#Extract TSV files of the word embeddings, to be used with the embedding project http://projector.tensorflow.org/
import io

e = model.layers[0]
weights = e.get_weights()[0]

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(encoder.subwords):
    vec = weights[num+1] # skip 0, it's padding.
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()

In [None]:
for ex in encoded_review_ratings.take(1):
  print(ex)

In [6]:
encoded_review_ratings

<ConcatenateDataset shapes: ((None,), (1,)), types: (tf.int64, tf.int32)>

In [12]:
next_feature, next_label = next(iter(test_data))

print (next_feature, next_label)

tf.Tensor(
[[  67  381   31 ...    0    0    0]
 [3084   27    2 ...    0    0    0]
 [   3  113    9 ...    1    9  103]
 ...
 [ 101    1   38 ...    0    0    0]
 [   9    7    5 ...    0    0    0]
 [ 572    1   40 ...    0    0    0]], shape=(30, 452), dtype=int64) tf.Tensor(
[[1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]], shape=(30, 1), dtype=int32)
