In [12]:
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as soup
import lxml
from bs4 import NavigableString
import re

from sqlalchemy import create_engine
import psycopg2

from config import db_password

# Goal: gather rating data for every game that has no data in the ratings_df
# from the exploratory analysis notebook. For each of those games we need the
# positive, neutral, and negative review counts for both critics and users,
# as well as the metascore and the user score.

# Presumably the list of games still missing ratings (inferred from the file
# name) -- TODO confirm.
# NOTE(review): no_rating_df is not referenced by any later cell visible in
# this notebook; confirm whether it is still needed.
no_rating_df = pd.read_csv("../Resources/games_to_get_ratings.csv")

In [13]:
!which chromedriver

/usr/local/bin/chromedriver


In [14]:
# Start a headless Chrome session through splinter, using the chromedriver
# binary reported by the `which chromedriver` cell above.
# NOTE(review): the scraping cell further down creates a new Browser and
# rebinds `browser`; nothing visible in this notebook calls quit() on this
# first instance.
browser = Browser("chrome", executable_path="/usr/local/bin/chromedriver", headless=True)

In [15]:
# Read the zipped sales table and keep a plain Python list of every title in it.
games_df = pd.read_csv("../Resources/vgsales.csv.zip")
list_of_games = games_df["Name"].tolist()

In [16]:
# Navigating to the correct page through the search bar with splinter is too slow, so the code
# is rewritten to build the URL directly with string interpolation.
# The console name has to be interpolated into the URL as well. Metacritic's URL structure is:
# "https://www.metacritic.com/game/{console}/{game}"

In [17]:
# Platform abbreviations (as they appear in the "Platform" column) mapped to the
# console segment that gets interpolated into the Metacritic URL.
# Each abbreviation appears exactly once: a repeated key in a dict literal is
# silently overwritten by its last occurrence, which hides typos.
game_conversions = {
    "PSV": "playstation-vita",
    "DC": "dreamcast",
    "WiiU": "wii-u",
    "GC": "gamecube",
    "N64": "nintendo-64",
    "XB": "xbox",
    "PSP": "psp",
    "PS4": "playstation-4",
    "PS": "playstation",
    "Wii": "wii",
    "PS3": "playstation-3",
    "PS2": "playstation-2",
    "GBA": "game-boy-advance",
    "DS": "ds",
    "X360": "xbox-360",
}

def convert_to_name(platform_abb):
    """Translate a platform abbreviation into its Metacritic URL segment.

    Looks ``platform_abb`` up in ``game_conversions``; an abbreviation that is
    not a key of that table yields the string "other".
    """
    return game_conversions.get(platform_abb, "other")
    

In [18]:
# Tag every row with the URL segment of its platform, then drop the rows that
# convert_to_name could not translate (they are tagged "other").
# The vast majority of the data belongs to the consoles listed on Metacritic,
# the "legacy consoles", so little is lost by this filter.
games_df["full_platform_name"] = games_df["Platform"].map(convert_to_name)
games_df = games_df[games_df["full_platform_name"].ne("other")]


In [19]:
# Inspect the titles that remain after the platform filter (displays the whole list).
games_df["Name"].tolist()

['Wii Sports',
 'Mario Kart Wii',
 'Wii Sports Resort',
 'New Super Mario Bros.',
 'Wii Play',
 'New Super Mario Bros. Wii',
 'Nintendogs',
 'Mario Kart DS',
 'Wii Fit',
 'Wii Fit Plus',
 'Kinect Adventures!',
 'Grand Theft Auto V',
 'Grand Theft Auto: San Andreas',
 'Brain Age: Train Your Brain in Minutes a Day',
 'Pokemon Diamond/Pokemon Pearl',
 'Grand Theft Auto V',
 'Grand Theft Auto: Vice City',
 'Pokemon Ruby/Pokemon Sapphire',
 'Pokemon Black/Pokemon White',
 'Brain Age 2: More Training in Minutes a Day',
 'Gran Turismo 3: A-Spec',
 'Call of Duty: Modern Warfare 3',
 'Call of Duty: Black Ops',
 'Call of Duty: Black Ops 3',
 'Call of Duty: Black Ops II',
 'Call of Duty: Black Ops II',
 'Call of Duty: Modern Warfare 2',
 'Call of Duty: Modern Warfare 3',
 'Grand Theft Auto III',
 'Super Smash Bros. Brawl',
 'Call of Duty: Black Ops',
 'Animal Crossing: Wild World',
 'Halo 3',
 'Grand Theft Auto V',
 'Pokemon HeartGold/Pokemon SoulSilver',
 'Super Mario 64',
 'Gran Turismo 4',
 'S

In [20]:
# This worked pretty well, but I will alter the code further to see if I can improve the ratio of successful to failed scrapes.

In [21]:
# Scrape Metacritic for every (platform, title) pair left in games_df.
#
# For each game the loop builds the page URL from the platform segment and a
# slug derived from the title, loads it in a headless browser, and extracts:
#   * metascore, user score and release date,
#   * positive / neutral / negative review counts for critics and for users,
#   * developer, number of players (when listed) and rating from the side panel.
# Games whose scores cannot be extracted are recorded in problem_games; games
# whose scores were found but whose side panel could not be parsed are recorded
# in abberant_side_detail_games. Only fully scraped games reach game_info_list.

# Characters that become a dash when a title is turned into a URL slug.
SLUG_SEPARATORS = re.compile(r"[.:;\s&]")
# CSS path to the positive / mixed / negative counts inside a distribution block.
REVIEW_COUNT_SELECTOR = "ol.score_counts.hover_none span.count"


def _slugify_title(title):
    """Turn a game title into the slug used in the Metacritic URL.

    Periods, colons, semicolons, whitespace and ampersands become dashes,
    apostrophes are dropped, the result is lower-cased, runs of dashes are
    collapsed to a single dash and trailing dashes are removed, e.g.
    "Grand Theft Auto: San Andreas" -> "grand-theft-auto-san-andreas" and
    "New Super Mario Bros." -> "new-super-mario-bros".

    Returns an empty string when nothing is left of the title.
    """
    slug = SLUG_SEPARATORS.sub("-", title).lower().replace("'", "")
    slug = re.sub(r"-{2,}", "-", slug)
    # rstrip also copes with an empty slug, where indexing [-1] would raise.
    return slug.rstrip("-")


def _review_counts(distribution):
    """Return the first three review counts found under ``distribution``.

    The counts are returned as text in page order, which the caller stores as
    positive, neutral and negative. Raises AttributeError when
    ``distribution`` is None and IndexError when fewer than three counts are
    present.
    """
    counts = [span.get_text() for span in distribution.select(REVIEW_COUNT_SELECTOR)]
    return counts[0], counts[1], counts[2]


def _scrape_scores(page):
    """Extract the scores, release date and review counts from a parsed page.

    Returns a dict with the keys metascore, user_score, release_date,
    positive/neutral/negative_critics and positive/neutral/negative_users.
    Raises (AttributeError or IndexError) as soon as one element is missing.
    """
    scores = {}
    scores["metascore"] = page.find("a", class_="metascore_anchor").get_text()

    side_panel = page.find("div", class_="details side_details")
    scores["user_score"] = side_panel.select("div.metascore_w")[0].get_text()

    product_data = page.find("div", class_="product_data")
    release_data = product_data.find("li", class_="summary_detail release_data")
    scores["release_date"] = release_data.find("span", class_="data").get_text()

    # The first distribution block on the page is read as the critic one.
    (
        scores["positive_critics"],
        scores["neutral_critics"],
        scores["negative_critics"],
    ) = _review_counts(page.find("div", class_="distribution_wrap"))

    user_module = page.find("div", class_="user_reviews_module")
    (
        scores["positive_users"],
        scores["neutral_users"],
        scores["negative_users"],
    ) = _review_counts(user_module.find("div", class_="distribution_wrap"))
    return scores


def _scrape_side_details(page):
    """Extract developer, number of players (if listed) and rating.

    The rating is normally the fifth list item of the second side-details
    block. When the third item cannot be read, or reads "On GameFAQs", the
    number of players is treated as not listed and the rating is taken from
    the fourth item instead. Raises when a required element is missing.
    """
    details = {}
    side_details = page.find_all("div", class_="details side_details")[1]
    items = side_details.select("ul.summary_details li")
    details["developer"] = items[0].find("span", class_="data").get_text()

    rating_index = 4
    try:
        number_players = items[2].find("span", class_="data").get_text()
    except Exception:
        number_players = None
        print("mutliplayer not listed")
    if number_players is None or number_players == "On GameFAQs":
        rating_index -= 1
    else:
        details["number_players"] = number_players

    details["rating"] = items[rating_index].find("span", class_="data").get_text()
    return details


browser = Browser("chrome", executable_path="/usr/local/bin/chromedriver", headless=True)
# Never incremented in this cell; len(problem_games) gives the failure count.
problem_games_counter = 0
problem_games = []
abberant_side_detail_games = []
game_info_list = []
counter = 1
abberant_side_details_counter = 0

# Pair every title with the URL segment of its platform.
zipped_list = list(zip(games_df["full_platform_name"].tolist(), games_df["Name"].tolist()))

try:
    for platform, name in zipped_list:
        # A missing title (NaN) or one made only of separators has no usable slug.
        edited_game_title = _slugify_title(name) if isinstance(name, str) else ""
        if not edited_game_title:
            problem_games.append(name)
            print("THIS IS A PROBLEM CHILD")
            continue
        print(f"\nGAME TITLE: {edited_game_title}")
        browser.visit(f"https://www.metacritic.com/game/{platform}/{edited_game_title}")

        my_soup = soup(browser.html, "lxml")
        game_scores = {"Name": name, "Platform": platform}

        # `except Exception` keeps the scrape best-effort without swallowing
        # KeyboardInterrupt / SystemExit the way a bare `except:` does.
        try:
            game_scores.update(_scrape_scores(my_soup))
        except Exception:
            problem_games.append(name)
            print("THIS IS A PROBLEM CHILD")
            continue

        try:
            game_scores.update(_scrape_side_details(my_soup))
        except Exception:
            abberant_side_detail_games.append(name)
            print("SUCCESS but no side details")
            abberant_side_details_counter += 1
            continue

        game_info_list.append(game_scores)
        print("SUCCESSFUL")
        print(f"This is the {counter}th game in the list.")
        print(abberant_side_details_counter)
        counter += 1
finally:
    # Always release the Chrome/chromedriver processes, even when interrupted.
    browser.quit()


scraping_df = pd.DataFrame(game_info_list)
scraping_df.to_csv("../Resources/final_scrape.csv")

problem_ratings = pd.DataFrame(problem_games)
problem_ratings.to_csv("../Resources/final_problems.csv")


# SQLAlchemy 1.4+ only accepts the "postgresql" dialect name; "postgres://" is rejected.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/games_db"
engine = create_engine(db_string)
# NOTE(review): the original wrote `ratings_df`, which is not defined anywhere
# in this notebook (NameError). The frame produced by this cell is written
# instead -- confirm this is the data intended for the "ratings" table.
scraping_df.to_sql(name="ratings", con=engine, if_exists="replace")
        


GAME TITLE: wii-sports
SUCCESSFUL
This is the 1th game in the list.
0

GAME TITLE: mario-kart-wii
SUCCESSFUL
This is the 2th game in the list.
0

GAME TITLE: wii-sports-resort
SUCCESSFUL
This is the 3th game in the list.
0

GAME TITLE: new-super-mario-bros
SUCCESSFUL
This is the 4th game in the list.
0

GAME TITLE: wii-play
SUCCESSFUL
This is the 5th game in the list.
0

GAME TITLE: new-super-mario-bros-wii
SUCCESSFUL
This is the 6th game in the list.
0

GAME TITLE: nintendogs
THIS IS A PROBLEM CHILD

GAME TITLE: mario-kart-ds
SUCCESSFUL
This is the 7th game in the list.
0

GAME TITLE: wii-fit
SUCCESSFUL
This is the 8th game in the list.
0

GAME TITLE: wii-fit-plus
SUCCESSFUL
This is the 9th game in the list.
0

GAME TITLE: kinect-adventures!
SUCCESSFUL
This is the 10th game in the list.
0

GAME TITLE: grand-theft-auto-v
SUCCESSFUL
This is the 11th game in the list.
0

GAME TITLE: grand-theft-auto-san-andreas
SUCCESSFUL
This is the 12th game in the list.
0

GAME TITLE: brain-age-train

SUCCESSFUL
This is the 91th game in the list.
0

GAME TITLE: pokémon-emerald-version
THIS IS A PROBLEM CHILD

GAME TITLE: kingdom-hearts
SUCCESSFUL
This is the 92th game in the list.
0

GAME TITLE: halo-3-odst
SUCCESSFUL
This is the 93th game in the list.
0

GAME TITLE: red-dead-redemption
SUCCESSFUL
This is the 94th game in the list.
0

GAME TITLE: super-mario-sunshine
SUCCESSFUL
This is the 95th game in the list.
0

GAME TITLE: driver
SUCCESSFUL
This is the 96th game in the list.
0

GAME TITLE: kinect-sports
SUCCESSFUL
This is the 97th game in the list.
0

GAME TITLE: gears-of-war-3
SUCCESSFUL
This is the 98th game in the list.
0

GAME TITLE: gears-of-war
SUCCESSFUL
This is the 99th game in the list.
0

GAME TITLE: metal-gear-solid-2-sons-of-liberty
SUCCESSFUL
This is the 100th game in the list.
0

GAME TITLE: metal-gear-solid-4-guns-of-the-patriots
SUCCESSFUL
This is the 101th game in the list.
0

GAME TITLE: metal-gear-solid
SUCCESSFUL
This is the 102th game in the list.
0

GAME TI

KeyboardInterrupt: 

In [83]:
# Inspect the records collected by the scraping loop (one dict per fully scraped game).
game_info_list

[{'Name': 'Wii Sports',
  'metascore': '\n76\n',
  'user_score': '8.0',
  'release_date': 'Nov 19, 2006',
  'positive_critics': '31',
  'neutral_critics': '20',
  'negative_critics': '0',
  'positive_users': '105',
  'neutral_users': '40',
  'negative_users': '3',
  'developer': '\n                    Nintendo                ',
  'number_players': 'No Online Multiplayer',
  'rating': 'E'},
 {'Name': 'Mario Kart Wii',
  'metascore': '\n82\n',
  'user_score': '8.4',
  'release_date': 'Apr 27, 2008',
  'positive_critics': '65',
  'neutral_critics': '7',
  'negative_critics': '1',
  'positive_users': '235',
  'neutral_users': '62',
  'negative_users': '22',
  'developer': '\n                    Nintendo                ',
  'number_players': 'Up to 12 ',
  'rating': 'E'},
 {'Name': 'Wii Sports Resort',
  'metascore': '\n80\n',
  'user_score': '8.1',
  'release_date': 'Jul 26, 2009',
  'positive_critics': '60',
  'neutral_critics': '13',
  'negative_critics': '0',
  'positive_users': '62',
 

In [2]:
# Re-save the scraped records; this repeats the save step at the end of the scraping cell.
# Needs `pd` and `game_info_list` in the kernel: the recorded run of this cell
# raised NameError because `pd` was not defined when it was executed.
scraping_df = pd.DataFrame(game_info_list)
scraping_df.to_csv("../Resources/final_scrape.csv")

NameError: name 'pd' is not defined

In [None]:
# get developer, genre, number_players, rating, release_date

In [96]:
# Number of titles recorded as failures by the scraping loop.
len(problem_games)

5662

In [1]:
# Re-save the list of failed titles; this repeats the save step at the end of the scraping cell.
# Needs `pd` and `problem_games` in the kernel: the recorded run of this cell
# raised NameError because `pd` was not defined when it was executed.
problem_ratings = pd.DataFrame(problem_games)
problem_ratings.to_csv("../Resources/final_problems.csv")

NameError: name 'pd' is not defined

In [None]:
#'& III ! Pokémon

In [85]:
# Number of titles whose scores were scraped but whose side-details block could not be parsed.
len(abberant_side_detail_games)

0