In [2]:
import requests
import json
import numpy as np
import pandas as pd
import ast
import re
from collections import Counter


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

In [3]:
##Getting reviews dataset and saving it.
URL= 'https://store.steampowered.com/appreviews/285190'
cursor_string='*'
cursor_list=['']
review_data= pd.DataFrame()
parameters={'json': 1,'filter': 'all','language': 'english','day_range': 5000,'cursor': cursor_string,
            'review_type': 'all','purchase_type': 'all','num_per_page': 1}
#we access the reviews the first time in order to ascertain how many reviews there are to download
response= requests.get( url= URL, params=parameters)
raw_data=response.json()
summary=raw_data['query_summary']

#save the summary as a txt file
summaryString= json.dumps(summary)
with open('C:/Users/Malcolm/Untitled Folder/SteamData/SteamReviewSummary.txt', 'w') as file:
    file.write(summaryString)
    
#set the maximum number of reviews per request
parameters['num_per_page']=100
#calculate how many times to loop the get request, based on the total number of reviews in our query summary
request_iterations =1+int(summary['total_reviews']/parameters['num_per_page'])

for i in range(request_iterations):
    response= requests.get( url= URL, params=parameters)
    raw_data=response.json()
    raw_reviews=raw_data['reviews']
    review_data=review_data.append(pd.json_normalize(raw_reviews, max_level = 2))
    

    

    
    cursor_list+=cursor_string
    cursor_string=raw_data['cursor']
    parameters['cursor']= cursor_string
    
    #get the last remaining set of reviews (not included in the 100 per page iterations)
if summary['total_reviews']%parameters['num_per_page']!=0:
    parameters['num_per_page']=(summary['total_reviews']%parameters['num_per_page'])
    response= requests.get( url= URL, params=parameters)
    raw_data=response.json()
    raw_reviews=raw_data['reviews']
    review_data=review_data.append(pd.json_normalize(raw_reviews, max_level = 2))

review_data.drop_duplicates(keep='first', inplace=True)
review_data.reset_index(inplace=True)
review_data.to_json(path_or_buf= 'C:/Users/Malcolm/Untitled Folder/SteamData/SteamReviews.json')

In [4]:
##reload review data set and get summary
review_data=pd.read_json(path_or_buf= 'C:/Users/Malcolm/Untitled Folder/SteamData/SteamReviews.json', orient='columns')
with open('C:/Users/Malcolm/Untitled Folder/SteamData/SteamReviewSummary.txt', 'r') as file:
    review_summary= ast.literal_eval(file.read())

In [5]:
review_text=review_data['review'].copy()
review_rating= review_data['voted_up'].copy()
def decontracted(phrase):
    
    # specific
    phrase = re.sub(r'tl;dr', 'tldr', phrase)
    phrase = re.sub(r'won\'t', 'will not', phrase)
    phrase = re.sub(r'can\'t', 'can not', phrase)
    phrase = re.sub(r'\bdont\b', 'do not', phrase)
    phrase = re.sub(r'\bwont\b', 'will not', phrase)
    phrase = re.sub(r'\bcant\b', 'can not', phrase)
    phrase = re.sub(r'\bcannot\b', 'can not', phrase)
    phrase = re.sub(r'let\'s', 'let us', phrase)
    phrase = re.sub(r'w/', 'with', phrase)
    phrase = re.sub(r'w/o', 'without', phrase)
    phrase = re.sub(r'\bive\b', 'i have', phrase)
    phrase = re.sub(r'\blets\b', 'let us', phrase)

    # general
    phrase = re.sub(r'n\'t', ' not', phrase)
    phrase = re.sub(r'\'re', ' are', phrase)
    phrase = re.sub(r'\'s', ' is', phrase)
    phrase = re.sub(r'\'d', ' would', phrase)
    phrase = re.sub(r'\'ll', ' will', phrase)
    phrase = re.sub(r'\'t', ' not', phrase)
    phrase = re.sub(r'\'ve', ' have', phrase)
    phrase = re.sub(r'\'m', ' am', phrase)
    return phrase

#clean data: removing symbols/formatting characters, making all characters lower case, removing contractions

for i in range(len(review_text)): 
    review_text[i]=decontracted(review_text[i].lower())
    review_text[i]=re.sub(r'(http\S+|\[.*?\])|\\n|\W|(\$\S+)|\d|[^\x00-\x7A]', ' ', review_text[i])
    
#convert boolean strings to int for reviews (1 means a positive review, 0 means negative)
rating_encoded=[]
for i in review_rating:
    rating_encoded.append(int(i))

In [6]:
print(review_data.shape, review_summary)

(3750, 21) {'num_reviews': 1, 'review_score': 5, 'review_score_desc': 'Mixed', 'total_positive': 3966, 'total_negative': 5299, 'total_reviews': 9265}


In [7]:
#Converting the reviews to word vectors

#initialize a counter object
word_count =Counter()

#For each review in our file, we update our counter with any new words and increase the count of words already seen.
for review in review_text:
    word_count.update(review.split())
    
#identifying words only used once, removing most common words (like the, and, a)
top_word_list = word_count.most_common(5)
one_appearance_word_list = [w for w in word_count if word_count[w] == 1]
len(one_appearance_word_list)
#len(word_count)

7992

In [8]:
#Removing useless words, 

#initializing a regex pattern for removing words
one_word_regex=''
for word in one_appearance_word_list:
    one_word_regex+='\\b'+ word + '\\b|'
    
for word, count in top_word_list:  
    one_word_regex+='\\b'+ word + '\\b|'
pattern=re.compile(one_word_regex.rstrip('|'))

In [9]:
review_text_trimmed=[]
#removing words that appear one time, or appear so often that they contain no information
for i in range(len(review_text)):
    review_text_trimmed.append(re.sub(pattern,'', review_text[i]))
    review_text_trimmed[i]=re.sub('\s{2,}',' ', review_text_trimmed[i])

#

In [10]:
print(review_text_trimmed[3724], '\n', review_text[3724])

honestly i have disagree with most negative reviews this game is great it is not exactly dow it is very different from but on its own merits it is solid warhammer game for one thing it is an rts again with base building proper armies which is welcome return form all three different armies have their particular playstyle they have done them justice as space marines drop dreadnought in middle fight or crush enemy infantry under your drop pod while you are at it sweet as eldar teleport your entire base away strike beautiful as orks waaagh it certainly feels like proper warhammer k this is helped by fantastic score by paul morgan recent solid voice acting let us face it really sucked in that department singleplayer campaign is worth entry price alone it has great story with some proper k moments that make you go fuck yeah as far as i am concerned last two missions this game are new definition epic give it chance you will not regret it proper k games are not dime dozen this is good one look

In [11]:
#create and save encoder for our reviews,
encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    review_text_trimmed, target_vocab_size=len(word_count))
encoder.save_to_file('C:/Users/Malcolm/Untitled Folder/SteamData/SteamReviewVocab')

NotFoundError: Failed to create a NewWriteableFile: C:/Users/Malcolm/Untitled Folder/SteamData/SteamReviewVocab.subwords : Das System kann den angegebenen Pfad nicht finden.
; No such process

In [None]:
encoder= encoder.load_from_file('C:/Users/Malcolm/Untitled Folder/SteamData/SteamReviewVocab')

#encode the words 
encoded_reviews=[]
for i in review_text_trimmed:
    encoded_reviews.append(encoder.encode(i))


In [None]:
#creating tensorflow datasets for training
def labeler(review, rating):
    return review, rating

encoded_review_rating_list=[]
for i,j in enumerate(encoded_reviews):
    encoded_review_dataset = tf.data.Dataset.from_tensors(j)
    encoded_review_rating_list.append(encoded_review_dataset.map(lambda x: labeler(x,rating_encoded[i])))



In [None]:
# Combine these labeled datasets into a single dataset, and shuffle it.
encoded_review_ratings = encoded_review_rating_list[0]
for labeled_dataset in encoded_review_rating_list[1:]:
    encoded_review_ratings=encoded_review_ratings.concatenate(labeled_dataset)
    
buffer_size = len(encoded_reviews)
all_labeled_data = encoded_review_ratings.shuffle(
    buffer_size, reshuffle_each_iteration=False)

In [None]:
##Split the encoded words into training and test datasets, take size is the fraction that goes into the training set
training_ratio=0.6
take_size= round(len(encoded_reviews)*training_ratio)
batch_size=10

train_data = encoded_review_ratings.take(take_size).shuffle(buffer_size)
train_data = train_data.padded_batch(batch_size, padded_shapes=([None],()))

test_data = encoded_review_ratings.skip(take_size)
test_data = test_data.padded_batch(batch_size, padded_shapes=([None],()))

In [None]:
embedding_dim=16

model = keras.Sequential([
  layers.Embedding(encoder.vocab_size, embedding_dim),
  layers.GlobalAveragePooling1D(),
  layers.Dense(16, activation='relu'),
  layers.Dense(1)
])

model.summary()

In [18]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=10,
    validation_data=test_data, validation_steps=5)

Train for 225 steps, validate for 5 steps
Epoch 1/10
  0/225 [..............................] - ETA: 0s

RecursionError: maximum recursion depth exceeded while calling a Python object

In [218]:
#labeler(encoded_reviews[1],1)
#tf.data.Dataset.from_tensors(encoded_reviews[1])
#print(encoded_review_rating_list)
#tf.data.Dataset.from_tensors(rating_encoded[1])

<TensorDataset shapes: (80,), types: tf.int32>

In [245]:
encoded_review_rating_list[1:5]

[<MapDataset shapes: ((80,), ()), types: (tf.int32, tf.int32)>,
 <MapDataset shapes: ((850,), ()), types: (tf.int32, tf.int32)>,
 <MapDataset shapes: ((105,), ()), types: (tf.int32, tf.int32)>,
 <MapDataset shapes: ((594,), ()), types: (tf.int32, tf.int32)>]

In [277]:
encoder.vocab_size

10674

In [286]:
print(encoded_review_ratings)#.concatenate()

<ConcatenateDataset shapes: ((None,), ()), types: (tf.int32, tf.int32)>


In [303]:
dummy=encoded_review_rating_list[0].concatenate(encoded_review_rating_list[1])

for e in dummy.as_numpy_iterator():
    print (e)

(array([10450,     4,    20,   466,    73,   344,     1,     3,   173,
           3,    36,    47,   486,   739,    33,    51,  5849,   275,
          99,  1455,   614,     9,   202,   443,    18,    40,     5,
        1440,  1455,   614,    67,     4,  2801,    12,     4,  2801,
          67,    18,    79,     5,  1440,  1455,   614,    18,   298,
          18,    99,   188,   359,    19,    75,   367,    45,    10,
          60,     2,    71,   188,   359,    19,    75,  1524,     9,
          23,     5,  1468,  1215,    38]), 0)
(array([ 239,    7,    4,    1,   59,  726,   12,  424,  400,  312,    6,
        145,  202,  758,  229,   19,    3,    1,  325,    9,    3,  101,
         15,  542,   81,  743,  112,   83,  120,    4,   12,  197,  253,
        965,   27, 2135,  589, 4687,   91,  614,  279,   27, 3504,   91,
        342,  111,    4, 1280,   88,  177,   46,  107, 3134, 4234, 1408,
          2,   15,  228,  425,    9,  825, 2293,   42, 4347, 2779,    8,
        663,  643, 2758