In [5]:
import requests
import json
import numpy as np
import pandas as pd
import ast
import re
from collections import Counter


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

In [6]:
##Getting reviews dataset and saving it.
URL= 'https://store.steampowered.com/appreviews/285190'
cursor_string='*'
cursor_list=['']
review_data= pd.DataFrame()
parameters={'json': 1,'filter': 'all','language': 'english','day_range': 5000,'cursor': cursor_string,
            'review_type': 'all','purchase_type': 'all','num_per_page': 1}
#we access the reviews the first time in order to ascertain how many reviews there are to download
response= requests.get( url= URL, params=parameters)
raw_data=response.json()
summary=raw_data['query_summary']

#save the summary as a txt file
summaryString= json.dumps(summary)
with open('SteamData/SteamReviewSummary.txt', 'w') as file:
    file.write(summaryString)
    
#set the maximum number of reviews per request
parameters['num_per_page']=100
#calculate how many times to loop the get request, based on the total number of reviews in our query summary
request_iterations =1+int(summary['total_reviews']/parameters['num_per_page'])

for i in range(request_iterations):
    response= requests.get( url= URL, params=parameters)
    raw_data=response.json()
    raw_reviews=raw_data['reviews']
    review_data=review_data.append(pd.json_normalize(raw_reviews, max_level = 2))
    

    

    
    cursor_list+=cursor_string
    cursor_string=raw_data['cursor']
    parameters['cursor']= cursor_string
    
    #get the last remaining set of reviews (not included in the 100 per page iterations)
if summary['total_reviews']%parameters['num_per_page']!=0:
    parameters['num_per_page']=(summary['total_reviews']%parameters['num_per_page'])
    response= requests.get( url= URL, params=parameters)
    raw_data=response.json()
    raw_reviews=raw_data['reviews']
    review_data=review_data.append(pd.json_normalize(raw_reviews, max_level = 2))

review_data.drop_duplicates(keep='first', inplace=True)
review_data.reset_index(inplace=True)
review_data.to_json(path_or_buf= 'SteamData/SteamReviews.json')

In [7]:
##reload review data set and get summary
review_data=pd.read_json(path_or_buf= 'SteamData/SteamReviews.json', orient='columns')
with open('SteamData/SteamReviewSummary.txt', 'r') as file:
    review_summary= ast.literal_eval(file.read())

In [8]:
review_text=review_data['review'].copy()
review_rating= review_data['voted_up'].copy()
def decontracted(phrase):
    
    # specific
    phrase = re.sub(r'tl;dr', 'tldr', phrase)
    phrase = re.sub(r'won\'t', 'will not', phrase)
    phrase = re.sub(r'can\'t', 'can not', phrase)
    phrase = re.sub(r'\bdont\b', 'do not', phrase)
    phrase = re.sub(r'\bwont\b', 'will not', phrase)
    phrase = re.sub(r'\bcant\b', 'can not', phrase)
    phrase = re.sub(r'\bcannot\b', 'can not', phrase)
    phrase = re.sub(r'let\'s', 'let us', phrase)
    phrase = re.sub(r'w/', 'with', phrase)
    phrase = re.sub(r'w/o', 'without', phrase)
    phrase = re.sub(r'\bive\b', 'i have', phrase)
    phrase = re.sub(r'\blets\b', 'let us', phrase)

    # general
    phrase = re.sub(r'n\'t', ' not', phrase)
    phrase = re.sub(r'\'re', ' are', phrase)
    phrase = re.sub(r'\'s', ' is', phrase)
    phrase = re.sub(r'\'d', ' would', phrase)
    phrase = re.sub(r'\'ll', ' will', phrase)
    phrase = re.sub(r'\'t', ' not', phrase)
    phrase = re.sub(r'\'ve', ' have', phrase)
    phrase = re.sub(r'\'m', ' am', phrase)
    return phrase

#clean data: removing symbols/formatting characters, making all characters lower case, removing contractions

for i in range(len(review_text)): 
    review_text[i]=decontracted(review_text[i].lower())
    review_text[i]=re.sub(r'(http\S+|\[.*?\])|\\n|\W|(\$\S+)|\d|[^\x00-\x7A]', ' ', review_text[i])
    
#convert boolean strings to int for reviews (1 means a positive review, 0 means negative)
rating_encoded=[]
for i in review_rating:
    rating_encoded.append(int(i))

In [9]:
print(review_data.shape, review_summary)

(3748, 21) {'num_reviews': 1, 'review_score': 5, 'review_score_desc': 'Mixed', 'total_positive': 3966, 'total_negative': 5299, 'total_reviews': 9265}


In [10]:
#Converting the reviews to word vectors

#initialize a counter object
word_count =Counter()

#For each review in our file, we update our counter with any new words and increase the count of words already seen.
for review in review_text:
    word_count.update(review.split())
    
#identifying words only used once, removing most common words (like the, and, a)
top_word_list = word_count.most_common(5)
one_appearance_word_list = [w for w in word_count if word_count[w] == 1]
#len(one_appearance_word_list)
#len(word_count)

8010

In [11]:
#Removing useless words, 

#initializing a regex pattern for removing words
one_word_regex=''
for word in one_appearance_word_list:
    one_word_regex+='\\b'+ word + '\\b|'
    
for word, count in top_word_list:  
    one_word_regex+='\\b'+ word + '\\b|'
pattern=re.compile(one_word_regex.rstrip('|'))

In [12]:
review_text_trimmed=[]
#removing words that appear one time, or appear so often that they contain no information
for i in range(len(review_text)):
    review_text_trimmed.append(re.sub(pattern,'', review_text[i]))
    review_text_trimmed[i]=re.sub('\s{2,}',' ', review_text_trimmed[i])


In [13]:
print(review_text_trimmed[3724], '\n', review_text[3724])

honestly i have disagree with most negative reviews this game is great it is not exactly dow it is very different from but on its own merits it is solid warhammer game for one thing it is an rts again with base building proper armies which is welcome return form all three different armies have their particular playstyle they have done them justice as space marines drop dreadnought in middle fight or crush enemy infantry under your drop pod while you are at it sweet as eldar teleport your entire base away strike beautiful as orks waaagh it certainly feels like proper warhammer k this is helped by fantastic score by paul morgan recent solid voice acting let us face it really sucked in that department singleplayer campaign is worth entry price alone it has great story with some proper k moments that make you go fuck yeah as far as i am concerned last two missions this game are new definition epic give it chance you will not regret it proper k games are not dime dozen this is good one look

In [14]:
#create and save encoder for our reviews,
encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    review_text_trimmed, target_vocab_size=len(word_count))
encoder.save_to_file('SteamData/SteamReviewVocab')

In [15]:
encoder= encoder.load_from_file('SteamData/SteamReviewVocab')

#encode the words 
encoded_reviews=[]
for i in review_text_trimmed:
    encoded_reviews.append(encoder.encode(i))


In [16]:
#creating tensorflow datasets for training
def labeler(review, rating):
    return review, rating
#pairing the labels (good/bad game) with the encoded reviews
encoded_review_rating_list=[]
for i,j in enumerate(encoded_reviews):
    encoded_review_dataset = tf.data.Dataset.from_tensors(j)
    encoded_review_rating_list.append(encoded_review_dataset.map(lambda x: labeler(x,rating_encoded[i])))



In [21]:
# Combine the list of review:score sets into a single tensor dataset.
encoded_review_ratings = encoded_review_rating_list[0]
for labeled_dataset in encoded_review_rating_list[1:]:
    encoded_review_ratings=encoded_review_ratings.concatenate(labeled_dataset)

#Shuffle the datasets to avoid any biases.
buffer_size = len(encoded_reviews)
all_labeled_data = encoded_review_ratings.shuffle(
    buffer_size, reshuffle_each_iteration=False)

In [29]:
##Split the encoded words into training and test datasets, take size amount of data that goes into the training set
training_ratio=0.6
take_size= round(len(encoded_reviews)*training_ratio)
batch_size=20

train_data = encoded_review_ratings.take(take_size).shuffle(buffer_size)
train_data = train_data.padded_batch(batch_size, padded_shapes=([None],()))

test_data = encoded_review_ratings.skip(take_size)
test_data = test_data.padded_batch(batch_size, padded_shapes=([None],()))

In [25]:
embedding_dim=16

model = keras.Sequential([
    layers.Embedding(encoder.vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(100, activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(1)
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          155728    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               1700      
_________________________________________________________________
dense_5 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 51        
Total params: 162,529
Trainable params: 162,529
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=10,
    validation_data=test_data, validation_steps=5)

In [30]:
text_dummy, label_dummy = next(iter(test_data))

text_dummy[0], label_dummy[0]

RecursionError: maximum recursion depth exceeded while calling a Python object