# Web Scrape Video Games
> <a href=#imports>Import Libraries</a>        

> <a href=#webscrabing>Web Scraping</a>

> <a href=#merge>Merging Data</a>

> <a href=#EDA>Explratory Data Analysis (EDA)</a>



# <a name=imports> Import Libraries</a> 


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# <a name=webscrabing>Web Scraping</a>

In [2]:
url = "https://www.imdb.com/search/title/?title_type=video_game"

response = requests.get(url)
response.status_code
page = response.text
soup = BeautifulSoup(page, "lxml")

In [3]:
def clean_num(s):
    """cleans text scraped from website
    
    Parameters:
    s (String): raw text

    Returns:
    String: cleaned digit text

    """ 

    if s is None:
        return s
    return "".join(re.findall('\d*\.?\d+',s))

def get_data(block, tag, class_text):
    """fetch data from tag and its associated class
    
    Parameters:
    block (int): Book item 
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: data text fetched

    """ 
    try:
        return block.find(tag, class_ = class_text).text.replace('(','').replace(')','')
    except AttributeError:
        return ""
    return

def get_data_a_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).a.text
    except AttributeError:
        return ""
    return
def get_data_strong_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).strong.text
    except AttributeError:
        return ""
    return
def get_data_descr(i, tag, class_text):    
    try:
        return i.find_all(tag, class_ = class_text)[1].text.strip()
    except AttributeError:
        return ""
    return


def get_data_votes(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).text.split()[1].replace(',','')
    except AttributeError:
        return ""
    return

In [4]:
    
Vgame_list = []

for i in (soup.find_all('div','lister-item mode-advanced')):
    Name = get_data_a_tag(i, "h3","lister-item-header")
    year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
    Gtype = get_data(i, "p", "text-muted").strip()
    rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
    votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
    headers = ['Name', 'year', 'game_type', 'rate','votes']
    Vgame_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
    Vgame_list.append(Vgame_dict)   

In [5]:
vedioGame = pd.DataFrame(Vgame_list) #transpose
vedioGame

Unnamed: 0,Name,year,game_type,rate,votes
0,Marvel's Guardians of the Galaxy,2021,"Action, Adventure, Comedy",9.1,949.0
1,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28888.0
2,Grand Theft Auto: San Andreas,2004,"Action, Crime",9.4,37749.0
3,Grand Theft Auto V,2013,"Action, Crime, Drama",9.5,55657.0
4,Cyberpunk 2077,2020,"Action, Crime, Drama",7.9,5948.0
5,Call of Duty: Vanguard,2021,"Action, Adventure, History",6.3,505.0
6,The Last of Us: Part II,2020,"banned\n|\n\nAction, Adventure, Drama",8.3,26194.0
7,Halo Infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,
8,Far Cry 6,2021,"Action, Adventure",8.1,818.0
9,Death Stranding,2019,"Action, Adventure, Drama",8.9,6992.0


In [6]:
def get_movie_dict(link):

    #Create full url to scrape
    url =link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
  
    
    for i in (soup.find_all('div','lister-item mode-advanced')):        
        Name = get_data_a_tag(i, "h3","lister-item-header")
        year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
        Gtype = get_data(i, "p", "text-muted").strip()
        rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
        votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
        headers = ['Name', 'year', 'game_type', 'rate','votes']
        #Create game vedio dictionary and return
        movie_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
        Vgame_list.append(movie_dict) 

    return movie_dict

In [7]:
get_movie_dict("https://www.imdb.com/search/title/?title_type=video_game&start=51&ref_=adv_nxt")

{'Name': 'Call of Duty: WWII',
 'year': '2017',
 'game_type': 'Action, Adventure, Drama',
 'rate': '7.7',
 'votes': '5600'}

In [8]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=101&ref_=adv_nxt')

{'Name': 'NieR: Automata',
 'year': '2017',
 'game_type': 'Action, Adventure, Drama',
 'rate': '8.9',
 'votes': '2451'}

In [9]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=151&ref_=adv_nxt')

{'Name': 'Metro Exodus',
 'year': '2019',
 'game_type': 'Action, Adventure, Horror',
 'rate': '8.2',
 'votes': '1966'}

In [10]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=201&ref_=adv_nxt')

{'Name': 'Half-Life: Alyx',
 'year': '2020',
 'game_type': 'Action, Adventure, Horror',
 'rate': '9.5',
 'votes': '561'}

In [11]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=251&ref_=adv_nxt')

{'Name': 'Titanfall 2',
 'year': '2016',
 'game_type': 'Action, Adventure, Sci-Fi',
 'rate': '8.5',
 'votes': '3254'}

In [12]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=301&ref_=adv_nxt')

{'Name': 'Command & Conquer: Red Alert 2',
 'year': '2000',
 'game_type': 'Action, Sci-Fi, War',
 'rate': '8.6',
 'votes': '2902'}

In [13]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=351&ref_=adv_nxt')

{'Name': 'Battlefield: Hardline',
 'year': '2015',
 'game_type': 'Action, Crime',
 'rate': '6.5',
 'votes': '2363'}

In [14]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=401&ref_=adv_nxt')

{'Name': 'Wolverine',
 'year': 'Video',
 'game_type': 'Action, Adventure, Drama            \n|\nAnnounced',
 'rate': '',
 'votes': ''}

In [15]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=451&ref_=adv_nxt')

{'Name': "Lego Marvel's Avengers",
 'year': '2016',
 'game_type': 'Action, Adventure, Fantasy',
 'rate': '7.9',
 'votes': '1127'}

In [16]:
vedioGamePages = pd.DataFrame(Vgame_list) #transpose
vedioGamePages

Unnamed: 0,Name,year,game_type,rate,votes
0,Marvel's Guardians of the Galaxy,2021,"Action, Adventure, Comedy",9.1,949
1,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28888
2,Grand Theft Auto: San Andreas,2004,"Action, Crime",9.4,37749
3,Grand Theft Auto V,2013,"Action, Crime, Drama",9.5,55657
4,Cyberpunk 2077,2020,"Action, Crime, Drama",7.9,5948
...,...,...,...,...,...
495,The Long Dark,2014,"Adventure, Drama, Mystery",8.1,384
496,Inside,2016,"Adventure, Horror, Mystery",8.8,3054
497,Lego Batman 3: Beyond Gotham,2014,"Action, Adventure, Comedy",7.6,1437
498,Fainaru fantajî XII,2006,"Action, Adventure, Crime",8.2,3271


In [17]:
vedioGamePages.to_csv('videogame_webscraping.csv')

In [18]:
vedioGamePages.Name=vedioGamePages.Name.str.lower()
vedioGamePages.head()

Unnamed: 0,Name,year,game_type,rate,votes
0,marvel's guardians of the galaxy,2021,"Action, Adventure, Comedy",9.1,949
1,red dead redemption ii,2018,"Action, Adventure, Crime",9.7,28888
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749
3,grand theft auto v,2013,"Action, Crime, Drama",9.5,55657
4,cyberpunk 2077,2020,"Action, Crime, Drama",7.9,5948


In [19]:
vedioGamePages

Unnamed: 0,Name,year,game_type,rate,votes
0,marvel's guardians of the galaxy,2021,"Action, Adventure, Comedy",9.1,949
1,red dead redemption ii,2018,"Action, Adventure, Crime",9.7,28888
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749
3,grand theft auto v,2013,"Action, Crime, Drama",9.5,55657
4,cyberpunk 2077,2020,"Action, Crime, Drama",7.9,5948
...,...,...,...,...,...
495,the long dark,2014,"Adventure, Drama, Mystery",8.1,384
496,inside,2016,"Adventure, Horror, Mystery",8.8,3054
497,lego batman 3: beyond gotham,2014,"Action, Adventure, Comedy",7.6,1437
498,fainaru fantajî xii,2006,"Action, Adventure, Crime",8.2,3271


In [20]:
df=pd.read_csv('vgsales.csv') #data from kaggle 

In [21]:
df.Name=df.Name.str.lower()
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,wii sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,super mario bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,mario kart wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,wii sports resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,pokemon red/pokemon blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [22]:
df.drop([ 'Year', 'Genre','Publisher'], axis=1, inplace=True)

# <a name=merge>Merging Data</a> 

In [23]:
mer_vidgame=vedioGamePages.merge(df, on='Name')

In [24]:
mer_vidgame.head()

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,18,PS2,9.43,0.4,0.41,10.57,20.81
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,875,XB,1.26,0.61,0.0,0.09,1.95
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,2122,PC,0.0,0.92,0.0,0.05,0.98
3,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,9829,X360,0.08,0.03,0.0,0.01,0.12
4,grand theft auto v,2013,"Action, Crime, Drama",9.5,55657,17,PS3,7.01,9.27,0.97,4.14,21.4


In [25]:
df1=pd.DataFrame(mer_vidgame)

# <a name=EDA>Explratory data analysis(EDA)</a>

In [26]:
df1.shape

(647, 12)

In [27]:
df1['game_type'].tolist()

['Action, Crime',
 'Action, Crime',
 'Action, Crime',
 'Action, Crime',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Drama',
 'Action, Adventure, Drama',
 'Horror, Mystery, Thriller',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Drama',
 'banned\n|\n\nAction, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Comedy',
 'Action, Adventure, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Act

In [28]:
df1['game_type']=df1['game_type'].str.replace('banned\n|\n\n','')
df1['game_type']=df1['game_type'].str.replace('\n|\nAnnounced','')
df1['game_type']=df1['game_type'].str.replace('|','')
df1['game_type']=df1['game_type'].str.replace('Announced','')
df1['game_type']=df1['game_type'].str.replace('130 min\n|\n\n','')

  df1['game_type']=df1['game_type'].str.replace('banned\n|\n\n','')
  df1['game_type']=df1['game_type'].str.replace('\n|\nAnnounced','')
  df1['game_type']=df1['game_type'].str.replace('|','')
  df1['game_type']=df1['game_type'].str.replace('130 min\n|\n\n','')


In [29]:
df1

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,18,PS2,9.43,0.40,0.41,10.57,20.81
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,875,XB,1.26,0.61,0.00,0.09,1.95
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,2122,PC,0.00,0.92,0.00,0.05,0.98
3,grand theft auto: san andreas,2004,"Action, Crime",9.4,37749,9829,X360,0.08,0.03,0.00,0.01,0.12
4,grand theft auto v,2013,"Action, Crime, Drama",9.5,55657,17,PS3,7.01,9.27,0.97,4.14,21.40
...,...,...,...,...,...,...,...,...,...,...,...,...
642,lego marvel's avengers,2016,"Action, Adventure, Fantasy",7.9,1127,5597,XOne,0.18,0.11,0.00,0.03,0.32
643,lego marvel's avengers,2016,"Action, Adventure, Fantasy",7.9,1127,5698,PS3,0.10,0.17,0.00,0.05,0.32
644,lego marvel's avengers,2016,"Action, Adventure, Fantasy",7.9,1127,6957,WiiU,0.12,0.09,0.00,0.02,0.23
645,lego marvel's avengers,2016,"Action, Adventure, Fantasy",7.9,1127,6974,PSV,0.07,0.11,0.00,0.05,0.23


In [30]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 647 entries, 0 to 646
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          647 non-null    object 
 1   year          647 non-null    object 
 2   game_type     647 non-null    object 
 3   rate          647 non-null    object 
 4   votes         647 non-null    object 
 5   Rank          647 non-null    int64  
 6   Platform      647 non-null    object 
 7   NA_Sales      647 non-null    float64
 8   EU_Sales      647 non-null    float64
 9   JP_Sales      647 non-null    float64
 10  Other_Sales   647 non-null    float64
 11  Global_Sales  647 non-null    float64
dtypes: float64(5), int64(1), object(6)
memory usage: 65.7+ KB


In [31]:
df1['year'] = pd.to_numeric(df1['year'], errors='coerce')

In [32]:
df1['rate'] = pd.to_numeric(df1['rate'], errors='coerce')

In [33]:
df1['votes'] = pd.to_numeric(df1['votes'], errors='coerce')

In [34]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 647 entries, 0 to 646
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          647 non-null    object 
 1   year          641 non-null    float64
 2   game_type     647 non-null    object 
 3   rate          641 non-null    float64
 4   votes         641 non-null    float64
 5   Rank          647 non-null    int64  
 6   Platform      647 non-null    object 
 7   NA_Sales      647 non-null    float64
 8   EU_Sales      647 non-null    float64
 9   JP_Sales      647 non-null    float64
 10  Other_Sales   647 non-null    float64
 11  Global_Sales  647 non-null    float64
dtypes: float64(8), int64(1), object(3)
memory usage: 65.7+ KB


In [35]:
df1.describe()

Unnamed: 0,year,rate,votes,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,641.0,641.0,641.0,647.0,647.0,647.0,647.0,647.0,647.0
mean,2010.051482,8.198908,7827.794072,3850.709428,0.977187,0.642736,0.053756,0.233091,1.906986
std,4.743301,0.761812,8464.62252,3980.215834,1.510633,0.967026,0.160178,0.546981,2.800818
min,1993.0,4.4,76.0,17.0,0.0,0.0,0.0,0.0,0.01
25%,2007.0,7.7,2363.0,721.5,0.13,0.08,0.0,0.02,0.32
50%,2011.0,8.2,5142.0,2390.0,0.42,0.29,0.0,0.08,0.87
75%,2013.0,8.7,9790.0,5681.0,1.135,0.8,0.04,0.245,2.235
max,2023.0,9.7,55858.0,16375.0,9.67,9.27,2.02,10.57,21.4


In [36]:
df1

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37749.0,18,PS2,9.43,0.40,0.41,10.57,20.81
1,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37749.0,875,XB,1.26,0.61,0.00,0.09,1.95
2,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37749.0,2122,PC,0.00,0.92,0.00,0.05,0.98
3,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37749.0,9829,X360,0.08,0.03,0.00,0.01,0.12
4,grand theft auto v,2013.0,"Action, Crime, Drama",9.5,55657.0,17,PS3,7.01,9.27,0.97,4.14,21.40
...,...,...,...,...,...,...,...,...,...,...,...,...
642,lego marvel's avengers,2016.0,"Action, Adventure, Fantasy",7.9,1127.0,5597,XOne,0.18,0.11,0.00,0.03,0.32
643,lego marvel's avengers,2016.0,"Action, Adventure, Fantasy",7.9,1127.0,5698,PS3,0.10,0.17,0.00,0.05,0.32
644,lego marvel's avengers,2016.0,"Action, Adventure, Fantasy",7.9,1127.0,6957,WiiU,0.12,0.09,0.00,0.02,0.23
645,lego marvel's avengers,2016.0,"Action, Adventure, Fantasy",7.9,1127.0,6974,PSV,0.07,0.11,0.00,0.05,0.23


In [37]:
df_transformed =pd.DataFrame()

df_transformed["NA_Sales_log"] = np.log(df1["NA_Sales"]) 
df_transformed["EU_Sales_log"] = np.log(df1["EU_Sales"]) 
df_transformed["JP_Sales_log"] = np.log(df1["JP_Sales"]) 
df_transformed["Other_Sales_log"] = np.log(df1["Other_Sales"])
df_transformed[["rank", "rate"]] = df1[["Rank", "rate"]]
df_transformed["Gobal_Sales_log"] = np.log(df1["Global_Sales"])
                                      

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [38]:
df_transformed.head()

Unnamed: 0,NA_Sales_log,EU_Sales_log,JP_Sales_log,Other_Sales_log,rank,rate,Gobal_Sales_log
0,2.243896,-0.916291,-0.891598,2.35802,18,9.4,3.035434
1,0.231112,-0.494296,-inf,-2.407946,875,9.4,0.667829
2,-inf,-0.083382,-inf,-2.995732,2122,9.4,-0.020203
3,-2.525729,-3.506558,-inf,-4.60517,9829,9.4,-2.120264
4,1.947338,2.226783,-0.030459,1.420696,17,9.5,3.063391


In [39]:
df_transformed.to_csv("log_transformed_data.csv", sep = ",")

In [40]:
df_transformed.corr()

Unnamed: 0,NA_Sales_log,EU_Sales_log,JP_Sales_log,Other_Sales_log,rank,rate,Gobal_Sales_log
NA_Sales_log,1.0,0.681405,0.378179,0.754538,-0.848198,0.195715,0.911807
EU_Sales_log,0.681405,1.0,0.453435,0.854941,-0.81506,0.214056,0.881984
JP_Sales_log,0.378179,0.453435,1.0,0.513463,-0.24688,0.29829,0.425985
Other_Sales_log,0.754538,0.854941,0.513463,1.0,-0.827257,0.25965,0.908834
rank,-0.848198,-0.81506,-0.24688,-0.827257,1.0,-0.134883,-0.950138
rate,0.195715,0.214056,0.29829,0.25965,-0.134883,1.0,0.187205
Gobal_Sales_log,0.911807,0.881984,0.425985,0.908834,-0.950138,0.187205,1.0
