# Creating a MongoDB Database
- Attempt to create a MongoDB database from review data

In [1]:
import pymongo

In [2]:
# Create a client for the MongoDB server at localhost, port 27017
connection = pymongo.MongoClient('mongodb://localhost:27017')

In [19]:
# Get a handle to the 'reviews_project' database and, within it, to the
# 'reviews' collection that the review documents are inserted into below
db = connection['reviews_project']
reviews = db['reviews']

In [13]:
# Here's the function that we use to collect reviews
import re
from os.path import abspath
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def get_reviews_for_game(file_path):
    '''
    Get list of reviews in a single game file.

    The file is read as a sequence of 2-line couplets. In each couplet,
    the second whitespace-separated token of the first line is parsed as
    the hours value (a float) and everything after the first space of
    the second line is taken as the review text. A couplet is skipped if
    either value cannot be extracted, if the review text is empty, or if
    the review is not detected as English. A trailing unpaired line is
    ignored.

    :param file_path: path to reviews file
    :type file_path: str
    :returns: list of dicts, each with the keys 'hours' (float) and
              'review' (str)
    :rtype: list
    '''

    reviews = []

    # Read through a context manager so that the file handle is closed
    # as soon as the lines are in memory (even if reading fails)
    with open(abspath(file_path)) as reviews_file:
        lines = reviews_file.readlines()

    # Step through the file one 2-line couplet at a time; stopping at
    # len(lines) - 1 guarantees that lines[i + 1] always exists
    for i in range(0, len(lines) - 1, 2):
        # Extract the hours value and the review text from the couplet
        try:
            h = float(lines[i].split()[1].strip())
            r = lines[i + 1].split(' ', 1)[1].strip()
        except (ValueError, IndexError):
            # Malformed couplet: missing token or non-numeric hours
            continue
        # Skip reviews that don't have any characters
        if not r:
            continue
        # Skip reviews if they cannot be recognized as English
        try:
            if detect(r) != 'en':
                continue
        except LangDetectException:
            continue
        # Now we append the 2-key dict to the end of reviews
        reviews.append(dict(hours=h,
                            review=r))
    return reviews

In [5]:
import os
!pwd

/home/mulhollandm2/reviews_project/reviewer_experience_prediction/util


In [6]:
# Move into the data directory; the relative path assumes the kernel's
# current working directory is a sibling of data/ (util/ in this run)
os.chdir('../data')

In [7]:
!pwd

/home/mulhollandm2/reviews_project/reviewer_experience_prediction/data


In [8]:
ls

[00m[00mArma_3.txt[00m                           [00mGrand_Theft_Auto_V.txt[00m
[00mCounter_Strike_Global_Offensive.txt[00m  [01;32m__init__.py[00m*
[00mCounter_Strike.txt[00m                   [01;34m__pycache__[00m/
[00mDota_2.txt[00m                           [00mSid_Meiers_Civilization_5.txt[00m
[00mFootball_Manager_2015.txt[00m            [00mTeam_Fortress_2.txt[00m
[00mGarrys_Mod.txt[00m                       [00mThe_Elder_Scrolls_V.txt[00m
[00mget_review_data_271590.txt[00m           [00mWarframe.txt[00m
[m

In [14]:
# Let's get reviews from a game that has only a little bit of data
rs = get_reviews_for_game('Football_Manager_2015.txt')

In [15]:
# Here's the first few reviews
rs[:3]

[{'hours': 237.9,
  'review': "ive bought this game for many years and think this might be my last. firstly im not keen on spending 30 a year on something that could be released as an update. the lack of competition in the manager genre has made SI very lazy, to the point where they make one or two small changes and market it as a new game. here are some of the points where my problems lie -ridiculous injurys, in my first season there was not one point where i didnt have at least six players injured -how many times in real football does a team make all its subs then a player gets injured and they finish the game with 10 men? well in FM about one in three games but amazingly never to the computer opponent -its almost impossible to generate any momentum anymore in FM, you will be playing well then suddenly lose for no reason -football has always been in the headlines for footballers astronomical wages, thank god its not like FM, youth players who have never made a first team appearance a

In [21]:
# Now, let's try to insert a review into the collection
# (insert_one() is used because Collection.insert() was deprecated in
# PyMongo 3.0 and removed in PyMongo 4.0)
r = rs[0]
result = reviews.insert_one(r)

In [24]:
# Success! It worked! I confirmed as much by actually checking in the mongo shell

In [25]:
# Now, let's drop the collection to get rid of that test review and do
# this for real
reviews.drop()

In [26]:
# Listing every document in the collection should now give an empty list
list(reviews.find())

[]

In [27]:
# Let's add all the reviews this time, but also we'll add another key for the
# game name and another one for the appid.
# Note: the dicts in rs are modified in place (the new keys are set on them
# directly), and each one is stored with insert_one() because
# Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0.
for r in rs:
    r['name'] = 'Football_Manager_2015.txt'
    r['appid'] = '295270'
    reviews.insert_one(r)

In [28]:
# Now, there should be a lot of stuff in the collection, so let's check it out
# (limit the cursor to 3 documents instead of fetching the whole collection
# and slicing the resulting list)
list(reviews.find().limit(3))

[{'_id': ObjectId('5538a86fc134cf71c3fffdf7'),
  'appid': '295270',
  'hours': 237.9,
  'name': 'Football_Manager_2015.txt',
  'review': "ive bought this game for many years and think this might be my last. firstly im not keen on spending 30 a year on something that could be released as an update. the lack of competition in the manager genre has made SI very lazy, to the point where they make one or two small changes and market it as a new game. here are some of the points where my problems lie -ridiculous injurys, in my first season there was not one point where i didnt have at least six players injured -how many times in real football does a team make all its subs then a player gets injured and they finish the game with 10 men? well in FM about one in three games but amazingly never to the computer opponent -its almost impossible to generate any momentum anymore in FM, you will be playing well then suddenly lose for no reason -football has always been in the headlines for footballers

In [30]:
# This is exciting!
# Let's see if we can find review entries with hours less than 50
# (limit the cursor to 5 documents rather than materializing every match
# and slicing the resulting list)
list(reviews.find({'hours': {'$lt': 50}}).limit(5))

[{'_id': ObjectId('5538a963c134cf71c3fffe04'),
  'appid': '295270',
  'hours': 37.8,
  'name': 'Football_Manager_2015.txt',
  'review': 'Well this is it, the last FM game I\'ll ever buy for full price. It\'s not a bad game on it\'s own, but charging AAA price for this yearly database update, and on top of that shamelessly charging 4.5 euros for the ingame editor is the last straw for me. At least give the pre-order buyers a ~40% discount and a free ingame editor, because your yearly rehash is highly overpriced right now, SI/SEGA. Honestly I don\'t know what I was expecting out of this year\'s game, it has been exactly the same for what, 5-6 years now? and that\'s only if you count the annoying interviews and the 3D engine, take those away, and it\'s the same exact game I played 12 years ago. That would not necessarily be a bad thing if they didn\'t ask for so much money. The things the game gets wrong, as SI would put it, include (but are not limited to): The economy is screwed up stil

In [31]:
# Next: let's add in reviews from other games