### Importing necessary libraries

In [1]:
import os
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, text

# Resolve where the raw and enriched CSV files live.
# If a `directory` prefix already exists in the kernel namespace it is used;
# otherwise the repo-relative default folder below is used instead.
try:
    write_path = f"{directory}consoles_raw.csv"
    read_path = f"{directory}consoles_enriched.csv"
except NameError:
    # Only "directory is not defined" is an expected failure here; any other
    # error should surface instead of being silently swallowed.
    write_path = "../../../../01_aux_files/results/Video_Game_Sales/Web Scrapping & Pandas/consoles_raw.csv"
    read_path = "../../../../01_aux_files/results/Video_Game_Sales/Web Scrapping & Pandas/consoles_enriched.csv"

<div align = 'center'>
<h1>Consoles Data Scrape</h1>
</div>

In [2]:
# Download the VGChartz hardware totals page and copy its table, row by row,
# into a raw DataFrame (`consoles_chart`). The header row is still the first
# data row at this point.
url = "https://www.vgchartz.com/charts/platform_totals/Hardware.php/"
data = []

# A timeout keeps the cell from hanging indefinitely if the site never answers.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Fail with a clear message if the page layout changed, instead of an
    # opaque "'NoneType' object has no attribute 'find'".
    chart_body = soup.find('div', {'id': 'chart_body'})
    table = chart_body.find('table') if chart_body is not None else None
    if table is None:
        raise ValueError(f"Could not find the hardware table (div#chart_body > table) in {url}")

    # One list of stripped cell texts per table row (header and data cells alike)
    for row in table.find_all('tr'):
        cells = row.find_all(['th', 'td'])
        data.append([cell.text.strip() for cell in cells])
    print("Consoles Page Succesfully Scrapped!")

    consoles_chart = pd.DataFrame(data)
else:
    # The original message referenced an undefined name, which raised NameError
    # instead of reporting the failed request.
    raise ValueError(f'Page {url} Request Failed. Status Code: {response.status_code}')

Consoles Page Succesfully Scrapped!


### Some format changes

In [3]:
# Promote the first scraped row to column headers, then drop it from the data.
consoles_chart.columns = consoles_chart.iloc[0]  ## First row as header
# .copy() detaches the result from the original frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
consoles_chart = consoles_chart.iloc[1:].copy()  ## Discard first row (now header)

### Data Cleaning

In [4]:
# Clean the scraped table:
#   * "Platform" holds "<name> (<abbreviation>)" -> split into two columns
#   * "Global" holds sales expressed in millions -> convert to whole units
consoles_chart = consoles_chart.rename(
    columns={"Platform": "Console_Name", "Global": "Global_WorldWide_Sales"}
)

# Text before the first "(" is the name, text after it is the abbreviation.
# regex=False makes ")" a literal on every pandas version.
platform_parts = consoles_chart["Console_Name"].str.split("(")
console_abbreviation = platform_parts.str.get(1).str.replace(")", "", regex=False).str.strip()
console_name = platform_parts.str.get(0).str.replace(")", "", regex=False).str.strip().str.upper()

# Keep float64 precision (no float32 downcast) and round before casting:
# plain astype(int) truncates, so a product like 569999.9999999999 would lose a unit.
sales_in_millions = pd.to_numeric(consoles_chart["Global_WorldWide_Sales"])
global_sales_units = (sales_in_millions * 1_000_000).round().astype(int)

# Build the final frame in one step (no in-place mutation of a sliced frame)
# and keep only the necessary columns.
consoles_chart = consoles_chart.assign(
    Console_Abbreviation=console_abbreviation,
    Console_Name=console_name,
    Global_WorldWide_Sales=global_sales_units,
)[["Console_Abbreviation", "Console_Name", "Global_WorldWide_Sales"]].copy()

In [5]:
# Display the cleaned table as the cell output for a visual sanity check.
consoles_chart

Unnamed: 0,Console_Abbreviation,Console_Name,Global_WorldWide_Sales
1,PS2,PLAYSTATION 2,158700000
2,DS,NINTENDO DS,154020000
3,NS,NINTENDO SWITCH,132910000
4,GB,GAME BOY,118690000
5,PS4,PLAYSTATION 4,117170000
...,...,...,...
79,HS,HYPERSCAN,10000
80,AV,ADVENTURE VISION,10000
81,PVS,PALMTEX PORTABLE VIDEOGAME SYSTEM,10000
82,Pa,PANDORA,10000


In [6]:
# Persist the cleaned scrape to `write_path` (pandas also writes the DataFrame
# index as the first, unnamed CSV column by default).
consoles_chart.to_csv(write_path)

### After scraping this data, we enrich it with ChatGPT's help, adding more columns:
### Developer, Release_Date, Origin_Country, Generation, Type, Media_Type,
### Graphics, Online_Play, Predecessor, Successor

#### Final result:

In [7]:
# Load the enriched consoles file: comma-separated, first line is the header.
consoles_df = pd.read_csv(
    read_path,
    header=0,
    sep=',',
)
consoles_df

Unnamed: 0,Console_Abbreviation,Console_Name,Developer,Release_Date,Origin_Country,Generation,Type,Media_Type,Graphics,Online_Play,Predecessor,Successor
0,PS2,PLAYSTATION 2,Sony,2000-03-04,Japan,Sixth,Home Console,Physical/Digital,Various,Yes,PS1,PS3
1,DS,NINTENDO DS,Nintendo,2004-11-21,Japan,Seventh,Handheld,Physical/Digital,2D/3D,Yes,GBA,3DS
2,NS,NINTENDO SWITCH,Nintendo,2017-03-03,Japan,Eighth,Hybrid,Physical/Digital,HD,Yes,Wii U,
3,GB,GAME BOY,Nintendo,1989-04-21,Japan,Fourth,Handheld,Physical,8-bit,No,,
4,PS4,PLAYSTATION 4,Sony,2013-11-15,Japan,Eighth,Home Console,Physical/Digital,HD,Yes,PS3,PS5
...,...,...,...,...,...,...,...,...,...,...,...,...
78,VIS,TANDY VIDEO INFORMATION SYSTEM,Tandy,1977-12-03,USA,First,Pong Console,Cartridge,8-bit,No,,
79,HS,HYPERSCAN,Mattel,2006-10-23,USA,Seventh,Home Console,CD-ROM,32-bit,Yes,,
80,AV,ADVENTURE VISION,Fairchild,1976-01-01,USA,First,Handheld,Cartridge,8-bit,No,,
81,PVS,PALMTEX PORTABLE VIDEOGAME SYSTEM,Palmtex,1982-01-01,USA,Second,Handheld,Cartridge,8-bit,No,,


### Final Transformations

In [8]:
# Final normalisation before loading into the database:
#   * upper-case every text column
#   * parse Release_Date into plain date objects
#   * replace missing / 'UNKNOWN' values with a single sentinel
NOT_SPECIFIED = 'NOT SPECIFIED'


def _upper_if_text(column):
    """Upper-case a text column; return any non-text column unchanged.

    Calling `.str` on a numeric column (for example a column that is entirely
    empty and was therefore read as float) raises AttributeError, so such
    columns are passed through untouched.
    """
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    return column.str.upper() if is_text else column


consoles_df = consoles_df.apply(_upper_if_text)
consoles_df["Release_Date"] = pd.to_datetime(consoles_df["Release_Date"]).dt.date

# Predecessor and Successor now share the same sentinel for both 'UNKNOWN' and
# missing values (Predecessor previously mapped 'UNKNOWN' to 'N/S').
for lineage_column in ("Predecessor", "Successor"):
    consoles_df[lineage_column] = (
        consoles_df[lineage_column].replace('UNKNOWN', NOT_SPECIFIED).fillna(NOT_SPECIFIED)
    )
consoles_df["Online_Play"] = consoles_df["Online_Play"].fillna(NOT_SPECIFIED)
consoles_df

Unnamed: 0,Console_Abbreviation,Console_Name,Developer,Release_Date,Origin_Country,Generation,Type,Media_Type,Graphics,Online_Play,Predecessor,Successor
0,PS2,PLAYSTATION 2,SONY,2000-03-04,JAPAN,SIXTH,HOME CONSOLE,PHYSICAL/DIGITAL,VARIOUS,YES,PS1,PS3
1,DS,NINTENDO DS,NINTENDO,2004-11-21,JAPAN,SEVENTH,HANDHELD,PHYSICAL/DIGITAL,2D/3D,YES,GBA,3DS
2,NS,NINTENDO SWITCH,NINTENDO,2017-03-03,JAPAN,EIGHTH,HYBRID,PHYSICAL/DIGITAL,HD,YES,WII U,NOT SPECIFIED
3,GB,GAME BOY,NINTENDO,1989-04-21,JAPAN,FOURTH,HANDHELD,PHYSICAL,8-BIT,NO,NOT SPECIFIED,NOT SPECIFIED
4,PS4,PLAYSTATION 4,SONY,2013-11-15,JAPAN,EIGHTH,HOME CONSOLE,PHYSICAL/DIGITAL,HD,YES,PS3,PS5
...,...,...,...,...,...,...,...,...,...,...,...,...
78,VIS,TANDY VIDEO INFORMATION SYSTEM,TANDY,1977-12-03,USA,FIRST,PONG CONSOLE,CARTRIDGE,8-BIT,NO,NOT SPECIFIED,NOT SPECIFIED
79,HS,HYPERSCAN,MATTEL,2006-10-23,USA,SEVENTH,HOME CONSOLE,CD-ROM,32-BIT,YES,NOT SPECIFIED,NOT SPECIFIED
80,AV,ADVENTURE VISION,FAIRCHILD,1976-01-01,USA,FIRST,HANDHELD,CARTRIDGE,8-BIT,NO,NOT SPECIFIED,NOT SPECIFIED
81,PVS,PALMTEX PORTABLE VIDEOGAME SYSTEM,PALMTEX,1982-01-01,USA,SECOND,HANDHELD,CARTRIDGE,8-BIT,NO,NOT SPECIFIED,NOT SPECIFIED


In [9]:
# Record how many rows are about to be loaded; the DB cell compares this
# against the row count found in the destination table.
source_rows = consoles_df.shape[0]
consoles_df.dtypes

Console_Abbreviation    object
Console_Name            object
Developer               object
Release_Date            object
Origin_Country          object
Generation              object
Type                    object
Media_Type              object
Graphics                object
Online_Play             object
Predecessor             object
Successor               object
dtype: object

## Saving Final Results Directly to our database

#### We use our INTERMEDIATE schema, which stores the raw data before inserting / updating existing records in our DW (data warehouse)

In [10]:
# Load `consoles_df` into the INTERMEDIATE staging table on SQL Server and
# verify that the destination row count matches the source.
table_schema = 'INTERMEDIATE'
table_name = 'Dim_Consoles_Information'

# We won't save credentials on github! They are read from the DW_USER /
# DW_PASSWORD / DW_SERVER environment variables; the placeholders below are
# only used when a variable is not set.
user_name = os.environ.get('DW_USER', 'XXXXX')
password = os.environ.get('DW_PASSWORD', 'XXXXX')
dw_server = os.environ.get('DW_SERVER', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
db_name = 'VideoGames_Sales'

# User and password are URL-encoded so characters such as '@', ':' or '/'
# cannot break the connection URL.
connection_string = (
    f'mssql+pyodbc://{quote_plus(user_name)}:{quote_plus(password)}@{dw_server}/'
    f'{db_name}?driver=ODBC+Driver+17+for+SQL+Server'
)
# Connecting with SQL Server
engine = create_engine(connection_string, fast_executemany=True)

try:
    start_time = time.time()

    # Truncate, insert and verify inside ONE transaction: it is committed only
    # if every step succeeds. On any failure (including a row-count mismatch)
    # it is rolled back, so the table is never left empty or half-loaded.
    with engine.begin() as conn:
        ## Before inserting new records, we must truncate the intermediate table
        ## to avoid duplicated data
        conn.execute(text(f"TRUNCATE TABLE {table_schema}.{table_name}"))
        print(f"{table_schema}.{table_name} Tables Successfully truncated! :D")

        ## Inserting new data (same connection, hence same transaction)
        consoles_df.to_sql(name=table_name, schema=table_schema, con=conn, index=False, if_exists='append')

        ## Checking and comparing inserted rows
        rows_affected = conn.execute(text(f"select count(1) from {table_schema}.{table_name}")).scalar()

        print(f"Source File rows: {source_rows}")
        print(f"Inserted rows: {rows_affected}")

        if source_rows != rows_affected:
            raise ValueError(f"Error! There is a difference of {source_rows - rows_affected} between source and destination")

    end_time = time.time()
    elapsed_time = (end_time - start_time) / 60  ## Minutes
    print(f"DB Insertion Finished in {elapsed_time} minutes ! :D")
except Exception as e:
    # Chain explicitly so the original traceback stays attached to the job error.
    raise ValueError(f"Job Failed with the following message! :( : {str(e)}") from e
finally:
    ## Properly closing connections, on success and on failure alike
    engine.dispose()

SRC and INTERMEDIATE Tables Successfully truncated! :D
Source File rows: 83
Inserted rows: 83
DB Insertion Finished in 0.0008335312207539877 minutes ! :D
