# Pre-Processing
In this notebook we are going to clean steam data, game description and reviews.

## imports

In [1]:
import nltk

In [2]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [168]:
steam       = pd.read_csv(r"C:\Users\elaaf\Desktop\SDS\project_4_data\steam.csv")
description = pd.read_csv(r"C:\Users\elaaf\Desktop\SDS\project_4_data\steam_description_data.csv")
reviews     = pd.read_csv(r"C:\Users\elaaf\Desktop\SDS\project_4_data\steam_reviews.csv")

## 1. Steam reviews cleaning 

In [169]:
reviews.shape

(434891, 8)

In [170]:
reviews.columns

Index(['date_posted', 'funny', 'helpful', 'hour_played',
       'is_early_access_review', 'recommendation', 'review', 'title'],
      dtype='object')

In [171]:
reviews.head(1)

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns


In [172]:
reviews.groupby("title")["review"].count().sort_values()

title
Survivor Pass: Vikendi                                3
Sid Meier’s Civilization® VI: Rise and Fall           3
Sid Meier's Civilization® VI: Gathering Storm         3
Expansion - Hearts of Iron IV: Man the Guns           3
ACE COMBAT™ 7: SKIES UNKNOWN                         10
Tom Clancy's Rainbow Six® Siege                      10
Tannenberg                                           10
Subnautica: Below Zero                               10
Pathfinder: Kingmaker                                10
Overcooked! 2                                        10
NBA 2K19                                             10
My Time At Portia                                    10
Wargroove                                            10
Kenshi                                               10
GOD EATER 3                                          10
Moonlighter                                          10
Warhammer 40,000: Mechanicus                         10
Football Manager 2019                     

### We are going to combine reviews per game

In [173]:
games = list(reviews["title"].unique())

In [174]:
reviews_per_game = pd.DataFrame(columns=["game", "reviews"])
reviews_per_game["game"] = games
reviews_per_game.head(5)

Unnamed: 0,game,reviews
0,Expansion - Hearts of Iron IV: Man the Guns,
1,Dead by Daylight,
2,Wargroove,
3,Wallpaper Engine,
4,Factorio,


In [185]:
all_reviews = []
for game in games:
    review_per_game = reviews[reviews["title"] == game]["review"]
    temp = ''
    for r in review_per_game: #join drop long strings
        temp += str(r) + "\n"
    all_reviews.append(temp)

In [186]:
all_reviews[0]

"&gt Played as German Reich&gt Declare war on Belgium&gt Can't break Belgium so go through France&gt Capitulate France in order to get to Belgium&gt Get True Blitzkrieg achievementThis game is dad\nyes.\nVery good game although a bit overpriced in my opinion. I'd prefer playing the game with mods (historical accuracy so on) although the vanilla version is good aswell. 7/10\n"

In [187]:
reviews_per_game["reviews"] = all_reviews

In [188]:
reviews_per_game.head(5)

Unnamed: 0,game,reviews
0,Expansion - Hearts of Iron IV: Man the Guns,&gt Played as German Reich&gt Declare war on B...
1,Dead by Daylight,Out of all the reviews I wrote This one is pro...
2,Wargroove,It's like Advance Wars with a coat of Fire Emb...
3,Wallpaper Engine,Cool as hell!\nA wonderful additional to your ...
4,Factorio,Factorio is a puzzle game. The objective is su...


In [189]:
reviews_per_game.to_csv(r"..\Data\reviews_per_game.csv")

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elaaf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stopwords = stopwords.words('english')
stemmer = nltk.stem.PorterStemmer()

In [190]:
digits = ['0','1',
         '2','3',
         '4','5',
         '6','7',
         '8','9']

html_entites = ["&gt", "&lt"]

In [196]:
##input: sentence
##output: tokenized list
#purpose: clean (remove stopwords, punctuations, digits and lowercase) and split words
def token(sentence):
    
    
    sentence = sentence.replace('\d+'," ").replace('\n'," ").strip(" ")
    
    # 1. remove punct, digits, <> and lowercase
    for d in digits:
        sentence = sentence.replace(d,'')
    
    for symbol in html_entites:
        sentence = sentence.replace(symbol,'')
    
    for punct in string.punctuation:
        sentence = sentence.replace(punct,'').lower()

        
    words = sentence.split(' ')
    t = []
    
    # 2. remove stop words and stem
    for word in words:
        if (not word in stopwords) and (word!=''):
            # Stem words
            stemmed = stemmer.stem(word)
            t.append(stemmed)

    return t

### Check to see if the tokenizer works as expected

In [197]:
count = 0

for s in reviews_per_game.reviews:
    print(token(s))
    count +=1
    1/0

['play', 'german', 'reich', 'declar', 'war', 'belgium', 'cant', 'break', 'belgium', 'go', 'franc', 'capitul', 'franc', 'order', 'get', 'belgium', 'get', 'true', 'blitzkrieg', 'achievementthi', 'game', 'dad', 'ye', 'good', 'game', 'although', 'bit', 'overpr', 'opinion', 'id', 'prefer', 'play', 'game', 'mod', 'histor', 'accuraci', 'although', 'vanilla', 'version', 'good', 'aswel']


ZeroDivisionError: division by zero

## Vectorize with TF-IDF and tokenizer

In [198]:
vectorizer = TfidfVectorizer(tokenizer=token)

In [199]:
TF_IDF_matrix = vectorizer.fit_transform(rev['review'])

In [200]:
TF_IDF_matrix.shape

(434891, 268269)

In [201]:
import pickle
with open(r"../Data/reviews_TF_IDF_matrix.pickle", 'wb') as handle:
    pickle.dump(TF_IDF_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL) #HIGHEST_PROTOCOL for smallest size possible