In [1]:
import numpy as np
import pandas as pd
import ast
import re

from collections import Counter
import tensorflow_datasets as tfds

In [2]:
# Reload the review data set, then parse the stored summary text back into
# a Python object (the file holds a Python literal, hence ast.literal_eval).
review_data = pd.read_json('SteamData/SteamReviews.json', orient='columns')
with open('SteamData/SteamReviewSummary.txt', 'r') as summary_file:
    summary_literal = summary_file.read()
review_summary = ast.literal_eval(summary_literal)

In [3]:
# Count fully duplicated rows; any value above zero means duplicates are present.
pd.set_option('display.max_rows', 9300)
duplicate_mask = review_data.duplicated()  # defaults: compare all columns, keep='first'
duplicate_mask.sum()

0

In [4]:
# Take independent copies of the two columns we need so that later edits
# to them never write back into review_data.
review_text = review_data.loc[:, 'review'].copy()
review_rating = review_data.loc[:, 'voted_up'].copy()

# decontracted() expands contractions and also rewrites a few common typos/shorthands.
# The rules below are applied top to bottom, so the specific cases have to stay
# ahead of the generic suffix rules (e.g. "won't" before the generic "n't").
_DECONTRACTION_RULES = (
    # specific
    (r'\btl;dr\b', 'tldr'),
    (r'\bwon\'t\b', 'will not'),
    (r'\bcan\'t\b', 'can not'),
    (r'\bdont\b', 'do not'),
    (r'\bwont\b', 'will not'),
    (r'\bcant\b', 'can not'),
    (r'\bcannot\b', 'can not'),
    (r'\blet\'s\b', 'let us'),
    # "w/o" (and "w/out") must be handled before "w/"; otherwise the "w/" rule
    # fires first and "w/o" ends up as "witho". The look-behind keeps path
    # segments such as ".../w/..." inside URLs untouched.
    (r'(?<![\w/])w/o(?:ut)?\b', 'without'),
    # "/" is not a word character, so a trailing \b can never match "w/ friends".
    # Swallow any whitespace after the slash and emit exactly one space instead.
    (r'(?<![\w/])w/\s*', 'with '),
    (r'\bive\b', 'i have'),
    (r'\blets\b', 'let us'),

    # general
    # "&" is not a word character either, so r'\b&\b' missed "you & me" and glued
    # "rock&roll" into "rockandroll". URLs are matched first and returned
    # unchanged so that query strings ("?a=1&b=2") are not split apart.
    (r'(http\S+)|\s*&\s*', lambda match: match.group(1) or ' and '),
    (r'n\'t', ' not'),
    (r'\'re', ' are'),
    (r'\'s', ' is'),
    (r'\'d', ' would'),
    (r'\'ll', ' will'),
    (r'\'t', ' not'),
    (r'\'ve', ' have'),
    (r'\'m', ' am'),
)


def decontracted(phrase):
    """Expand English contractions and a few common shorthands in ``phrase``.

    Matching is case-sensitive and every pattern is written in lower case, so
    callers are expected to lower-case the text first.

    Parameters
    ----------
    phrase : str
        Text to normalise.

    Returns
    -------
    str
        ``phrase`` with contractions expanded ("can't" -> "can not",
        "i'm" -> "i am"), shorthands spelled out ("w/" -> "with",
        "w/o" -> "without", "&" -> "and") and a few apostrophe-less typos
        fixed ("dont" -> "do not").
    """
    # Typographic apostrophes would otherwise slip past every rule below.
    phrase = phrase.replace('\u2019', "'")
    for rule, replacement in _DECONTRACTION_RULES:
        phrase = re.sub(rule, replacement, phrase)
    return phrase


# Replace with a space: URLs, [bracketed] markup, literal "\n" sequences, "$",
# any token containing a digit (digits and the non-space characters around
# them), characters outside \x00-\x7A, and every remaining non-word character.
_NOISE_PATTERN = re.compile(r'(http\S+|\[.*?\])|\\n|\$|\S*\d\S*|[^\x00-\x7A]|\W')


def _clean_review(text):
    """Lower-case one review, expand its contractions and blank out the noise."""
    return _NOISE_PATTERN.sub(' ', decontracted(text.lower()))


# Series.map visits every element regardless of the index labels, whereas the
# previous `review_text[i]` loop over range(len(...)) was a label lookup that
# only works on a default 0..n-1 index (and is slow element by element).
review_text = review_text.map(_clean_review)

# Encode each voted_up flag as an int label.
rating_encoded = [int(flag) for flag in review_rating]

In [5]:
# Show the loaded frame's (rows, columns) shape next to the summary parsed from
# SteamReviewSummary.txt so the two can be compared by eye.
print(review_data.shape, review_summary)

(573, 21) {'num_reviews': 1, 'review_score': 5, 'review_score_desc': 'Mixed', 'total_positive': 3971, 'total_negative': 5302, 'total_reviews': 9273}


In [6]:
#Converting the reviews to word vectors

# Count how often every whitespace-separated word occurs across all reviews.
word_count = Counter()
for review in review_text:
    word_count.update(review.split())

# Words to drop: the 5 most common ones (like the, and, a) and the words that
# were used exactly once.
top_word_list = word_count.most_common(5)
one_appearance_word_list = [word for word, seen in word_count.items() if seen == 1]

# Build a single regex from all words to remove. Each word is escaped so it can
# only ever match literally, and the alternatives share one pair of \b anchors
# instead of repeating them per word. str.join avoids growing the pattern with
# repeated string concatenation.
words_to_remove = one_appearance_word_list + [word for word, _ in top_word_list]
one_word_regex = '|'.join(re.escape(word) for word in words_to_remove)

# Idea left disabled: also drop single characters (r'\b\w\b'), because some
# people t y p e  l i k e  t h i s for emphasis.

# With nothing to remove, fall back to "(?!)", a pattern that never matches.
pattern = re.compile(r'\b(?:' + one_word_regex + r')\b' if words_to_remove else r'(?!)')

In [7]:
# Remove the words matched by `pattern` (words that appear one time, or appear
# so often that they contain no information) from every review.
trimmed_all = [pattern.sub('', review) for review in review_text]
assert len(trimmed_all) == len(review_rating), "reviews and ratings are out of step"

# Drop reviews that ended up empty or whitespace-only, together with their labels.
# The survivors are collected into new lists: calling `del` on a list while
# enumerating it skips the element that follows each deletion, so consecutive
# empty reviews used to slip through. Labels are re-derived from review_rating
# (not from the previous rating_encoded) so re-running this cell cannot
# misalign reviews and labels.
kept = [
    (text, int(voted_up))
    for text, voted_up in zip(trimmed_all, review_rating)
    if text and not text.isspace()
]
review_text_trimmed = [text for text, _ in kept]
rating_encoded = [label for _, label in kept]
    

In [8]:
# Build a subword text encoder from the trimmed reviews and save it to disk.
SubwordEncoder = tfds.features.text.SubwordTextEncoder
encoder = SubwordEncoder.build_from_corpus(
    review_text_trimmed,
    target_vocab_size=len(word_count) + 1,
)
encoder.save_to_file('SteamData/SteamReviewVocab')


In [9]:
# Wrap the trimmed reviews and their labels in single-column frames.
trimmed_review_d = pd.Series(
    review_text_trimmed, dtype='string', name='Trimmed Review'
).to_frame()
rating_encoded_d = pd.Series(
    rating_encoded, dtype='int64', name='Rating'
).to_frame()

#encoded_review_d=encoded_review_d.fillna(0)
#encoded_review_d=encoded_review_d.applymap(lambda x: int(x))



In [10]:
#save our reviews and their positive/negative review label
# reset_index() returns a new frame, so the previous bare `my_df.reset_index()`
# call had no effect. Assign the result, with drop=True so the old index is not
# added as an extra column; the CSV is written with index=False either way.
my_df = pd.concat([trimmed_review_d, rating_encoded_d], axis=1).reset_index(drop=True)
my_df.to_csv('SteamData/FormattedReviewRatingList.csv', index=False, header=True)

In [11]:
# Preview only the first rows: with display.max_rows raised to 9300 earlier in
# the notebook, a bare `my_df` would print the entire frame.
my_df.head(10)

Unnamed: 0,Trimmed Review,Rating
0,love it,1
1,dawn war is such great game in terms grap...,1
2,e n d e s s t s,0
3,if i wanted play starcraft i would play star...,0
4,i am not one be negative about games but thi...,0
5,very fun game recommend game,1
6,thanks for great franchise despite some ni...,0
7,i started this franchise with dawn war i pe...,1
8,sad see what was previously an excellent rts ...,0
9,this is first time i have not completed game...,0
