In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

#import sys
#sys.setrecursionlimit(10000)

In [9]:
#load the encoder and trimmed review file
encoder=tfds.features.text.SubwordTextEncoder.load_from_file('SteamData/SteamReviewVocab')

with open('SteamData/SteamReviewTrimmed.txt', 'r') as file:
        review_text_trimmed=file.readlines()

#remove newline characters
for j,item in enumerate(review_text_trimmed):
    review_text_trimmed[j]=re.sub('\n','', item)
    
#encode the words 
encoded_reviews=[]
for i in review_text_trimmed:
    encoded_reviews.append(encoder.encode(i))

#convert boolean strings to int for reviews (1 means a positive review, 0 means negative)
review_data=pd.read_json(path_or_buf= 'SteamData/SteamReviews.json', orient='columns')
review_rating= review_data['voted_up'].copy()
rating_encoded=[]
for i in review_rating:
    rating_encoded.append(int(i))

In [10]:
#creating tensorflow datasets for training
def labeler(review, rating):
    return review, rating
#pairing the labels (good/bad game) with the encoded reviews
encoded_review_rating_list=[]
for i,j in enumerate(encoded_reviews):
    encoded_review_dataset = tf.data.Dataset.from_tensors(j)
    encoded_review_rating_list.append(encoded_review_dataset.map(lambda x: labeler(x,rating_encoded[i])))



In [11]:
# Combine the list of review:score sets into a single tensor dataset.
encoded_review_ratings = encoded_review_rating_list[0]
#test_var_tensor=tf.constant()
for single_dataset in encoded_review_rating_list[1:]:
    encoded_review_ratings=encoded_review_ratings.concatenate(single_dataset)

encoded_review_ratings=encoded_review_ratings
#Shuffle the datasets to avoid any biases.
buffer_size = len(encoded_reviews)
all_labeled_data = encoded_review_ratings.shuffle(
    buffer_size, reshuffle_each_iteration=False)

In [12]:
##Split the encoded words into training and test datasets, take size amount of data that goes into the training set
training_ratio=0.9
take_size= round(len(encoded_reviews)*training_ratio)
batch_size=30

#Organizing our training and validation data, the padded shapes are set to the longest review (as specified by None keywords)
train_data = all_labeled_data.take(take_size)
train_data = train_data.padded_batch(batch_size, padded_shapes=([None],()))

test_data = all_labeled_data.skip(take_size)
test_data = test_data.padded_batch(batch_size, padded_shapes=([None],()))

In [13]:
embedding_dim=10
learn_rate= 0.001
#determining layers of our neural network
model = keras.Sequential([
    layers.Embedding(encoder.vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(200, activation='relu'),
    layers.Dropout(rate=0.15),
    layers.Dense(100, activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(1)
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          213580    
_________________________________________________________________
global_average_pooling1d (Gl (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 200)               2200      
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5

In [None]:
#set the optimizer and loss equation here, then compile and run the model.
opt = tf.keras.optimizers.Adam(learning_rate=learn_rate)
loss_eqn=tf.keras.losses.BinaryCrossentropy(from_logits='True')

model.compile(optimizer=opt,
              loss=loss_eqn,
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=30,
    validation_data=test_data)

In [None]:
#plotting the accuracy and losses for our training and validation sets.
history_dict = history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss=history_dict['loss']
val_loss=history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(15,10))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.figure(figsize=(15,10))
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()

In [None]:
#Extract TSV files of the word embeddings, to be used with the embedding project http://projector.tensorflow.org/
import io

e = model.layers[0]
weights = e.get_weights()[0]

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(encoder.subwords):
    vec = weights[num+1] # skip 0, it's padding.
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()

In [57]:
for ex in all_labeled_data.take(5):
  print(ex)

(<tf.Tensor: shape=(122,), dtype=int32, numpy=
array([1278, 1990,    6,    4,  938,    9,   35,  693,    2,    5,   41,
          2,  398,  315,  106,  505,   13,  237,   79,   59,  535,   60,
         20,  997,  314, 1918,  133,  129,  217,  130, 1492,   17,  122,
         18,    6,    4,  100, 1720, 1207,  236,    1,    5,   24, 1913,
          1,   87,   43,  164,  102,    7,  489,   13,  126,  463,  105,
       2435, 1757,  463,  105, 1099,  532, 1099,  159,  236,    1,   24,
       1954,    5,  235,    4,  346,  315,  169, 1981,   87,   50,  443,
        129,    4,   16,    9,   13, 2579,   73, 1290,    7,    1,   12,
        294,   10,  436,  752,  800,   19,  491,   17,   32,  162,  556,
          2,   87,  321,    4,   16,    6, 1061,  182,  126,    3,   12,
         72,  311,  131,    4,    7,   12,  100,   36,   17,    6, 1535,
          7])>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(114,), dtype=int32, numpy=
array([   4,   27,   20,   89,   20,    2,

In [56]:
for ex in test_data.take(5):
  print(ex)

(<tf.Tensor: shape=(30, 350), dtype=int32, numpy=
array([[   3,   12,  104, ...,   33,   14,  317],
       [  27,  256,    0, ...,    0,    0,    0],
       [   3,   12,  147, ...,    0,    0,    0],
       ...,
       [   3,   45, 1980, ...,    0,    0,    0],
       [2888,   98,   49, ...,    0,    0,    0],
       [   6,    4,   18, ...,    0,    0,    0]])>, <tf.Tensor: shape=(30,), dtype=int32, numpy=
array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0])>)
(<tf.Tensor: shape=(30, 351), dtype=int32, numpy=
array([[ 166,  134,   28, ...,    0,    0,    0],
       [  50,  331,   80, ...,    0,    0,    0],
       [ 849,  544, 1882, ...,    0,    0,    0],
       ...,
       [ 174,  102,   19, ...,    0,    0,    0],
       [   7,   15,    5, ...,    0,    0,    0],
       [  19, 1192,  144, ...,    0,    0,    0]])>, <tf.Tensor: shape=(30,), dtype=int32, numpy=
array([0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,


In [61]:
sample_text, sample_labels = next(iter(test_data))

sample_text, sample_labels

(<tf.Tensor: shape=(30, 350), dtype=int32, numpy=
 array([[   3,   12,  104, ...,   33,   14,  317],
        [  27,  256,    0, ...,    0,    0,    0],
        [   3,   12,  147, ...,    0,    0,    0],
        ...,
        [   3,   45, 1980, ...,    0,    0,    0],
        [2888,   98,   49, ...,    0,    0,    0],
        [   6,    4,   18, ...,    0,    0,    0]])>,
 <tf.Tensor: shape=(30,), dtype=int32, numpy=
 array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 1, 1, 1, 0, 0])>)

In [16]:
sample_text, sample_labels = next(iter(encoded_review_ratings))

sample_text, sample_labels

RecursionError: maximum recursion depth exceeded while calling a Python object

In [15]:
encoded_review_ratings
#<ConcatenateDataset shapes: ((None,), ()), types: (tf.int32, tf.int32)>

<ConcatenateDataset shapes: ((None,), ()), types: (tf.int32, tf.int32)>

In [14]:
test_data
#<PaddedBatchDataset shapes: ((None, None), (None,)), types: (tf.int32, tf.int32)>

<PaddedBatchDataset shapes: ((None, None), (None,)), types: (tf.int32, tf.int32)>