In [1]:
import csv
import datetime
import random
import re
import time
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

warnings.filterwarnings('ignore')

# Crawling data using BeautifulSoup package

In [2]:
# Browser-like User-Agent sent with every request, and one shared HTTP session
# that the helper functions reuse.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/108.0.0.0 Safari/537.36'
    ),
}
session = requests.Session()

In [3]:
def get_proxies(max_attempts=5, timeout=10):
    """Scrape free-proxy-list.net and return the proxies whose seventh cell is 'no'.

    Args:
        max_attempts: How many times to request the listing page before giving up.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of 'http://<ip>:<port>' strings built from the first two cells of
        every qualifying table row. The list is empty when the expected table
        is not present in the page.

    Raises:
        RuntimeError: If the page did not answer with status 200 within
            ``max_attempts`` tries.
    """
    url = 'https://free-proxy-list.net/'
    r = None
    for attempt in range(1, max_attempts + 1):
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException:
            r = None
        if r is not None and r.status_code == 200:
            break
        if attempt < max_attempts:
            # Linear back-off so a struggling server is not hammered in a tight loop.
            time.sleep(attempt)
    else:
        raise RuntimeError(
            'Could not fetch {} after {} attempts'.format(url, max_attempts)
        )

    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.find('table', class_='table table-striped table-bordered')
    body = table.find('tbody') if table is not None else None
    if body is None:
        return []

    proxy = []
    for tr in body.find_all('tr'):
        tds = tr.find_all('td')
        # NOTE(review): cell 6 is presumably the site's "Https" column - confirm.
        # Rows with fewer cells are skipped instead of raising IndexError.
        if len(tds) > 6 and tds[6].text == 'no':
            proxy.append('http://' + tds[0].text + ':' + tds[1].text)
    return proxy

In [4]:
def read_url(url, proxy, timeout=10):
    """Download ``url`` with the shared session via a random proxy and parse the HTML.

    Args:
        url: Address of the page to download.
        proxy: Non-empty sequence of proxy URLs (e.g. the list from get_proxies()).
        timeout: Per-request timeout in seconds.

    Returns:
        A BeautifulSoup tree of the response body.

    Raises:
        IndexError: If ``proxy`` is empty.
    """
    # random.choice draws uniformly from the whole list whatever its length; the
    # previous fixed 0..50 index failed on shorter lists and ignored later entries.
    chosen = random.choice(proxy)
    # NOTE(review): only the 'http' scheme is mapped, so requests will not route
    # https:// URLs through the proxy - confirm whether an 'https' entry is wanted.
    session.proxies = {'http': chosen}
    r = session.get(url, headers=header, timeout=timeout)
    return BeautifulSoup(r.text, 'html.parser')

In [5]:
def month_converter(month):
    """Map a three-letter English month abbreviation ('Jan' ... 'Dec') to 1-12.

    Raises:
        ValueError: If ``month`` is not one of the twelve abbreviations.
    """
    abbreviations = [
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    ]
    zero_based = abbreviations.index(month)
    return zero_based + 1

In [None]:
MAX_PAGES = 200        # number of listing pages to crawl
MAX_ATTEMPTS = 5       # tries per page before the page is skipped
REQUEST_TIMEOUT = 10   # seconds

# CSS class strings of the score badge, one per colour band.
META_SCORE_CLASSES = ['metascore_w large game positive',
                      'metascore_w large game mixed',
                      'metascore_w large game negative']
USER_SCORE_CLASSES = ['metascore_w user large game positive',
                      'metascore_w user large game mixed',
                      'metascore_w user large game negative']


def _fetch_page(url):
    """Download ``url`` with a bounded number of retries; return its soup or None."""
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            r = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            r = None
        if r is not None and r.status_code == 200:
            return BeautifulSoup(r.text, 'html.parser')
        if attempt < MAX_ATTEMPTS:
            time.sleep(attempt)  # linear back-off between attempts
    return None


def _first_score(record, css_classes):
    """Return the first numeric score found under ``css_classes``, or None."""
    for css_class in css_classes:
        tag = record.find('div', class_=css_class)
        if tag is None:
            continue
        try:
            return float(tag.text.strip())
        except ValueError:
            # Badge text is not a number; try the next class.
            continue
    return None


today = datetime.date.today()

with open('data.csv', "w", newline='') as fp:
    wr = csv.writer(fp, dialect='excel')
    wr.writerow(['Number', 'Platform', 'Date', 'Passed Day', 'Meta Score', 'User Score'])

    for n in range(MAX_PAGES):
        url = 'https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?page={}'.format(n)
        # Each page is downloaded exactly once. A second fetch whose result is
        # immediately overwritten would only double the traffic.
        soup = _fetch_page(url)
        if soup is None:
            print('Skipping page {}: no successful response after {} attempts'.format(n, MAX_ATTEMPTS))
            continue

        for record in soup.find_all('td', class_='clamp-summary-wrap'):
            number = int(re.findall(r'\d+', record.find('span', class_='title numbered').text)[0])
            details_soup = record.find('div', class_='clamp-details').find_all('span')
            platform = details_soup[1].text.strip()

            # The date text looks like "<Month> <day>, <year>"; only the first
            # three letters of the month are used for the lookup.
            rec_date = details_soup[2].text.strip().replace(',', '').split(' ')
            year = int(rec_date[2])
            month = month_converter(rec_date[0][:3])
            day = int(rec_date[1])
            rec_date = datetime.date(year, month, day)

            pass_days = (today - rec_date).days

            # A missing score is written as an empty cell (None) instead of
            # silently reusing the value parsed for the previous record.
            meta_score = _first_score(record, META_SCORE_CLASSES)
            user_score = _first_score(record, USER_SCORE_CLASSES)

            wr.writerow([number, platform, rec_date, pass_days, meta_score, user_score])

# Pre-processing

*Remove rows with missing values*

In [None]:
# Load the scraped rows, mask any literal empty strings to NaN, drop every row
# that has a missing value and renumber the index from 0.
df = pd.read_csv('data.csv')
df = df[df != ''].dropna().reset_index(drop=True)
df.head()

*Remove column 'Number'*

In [None]:
# Drop the ranking column. Rebinding (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once 'Number' is already gone.
df = df.drop(columns=['Number'], errors='ignore')
df.head()

# EDA

In [None]:
# Derive the release year from the 'Date' column.
df['Year'] = pd.to_datetime(df['Date']).dt.year

*Number of years and platforms*

In [None]:
# Count the distinct release years and platforms present in the data.
n_years = len(df['Year'].unique())
n_platforms = len(df['Platform'].unique())
print('Number of Years:', n_years)
print('Number of Platforms:', n_platforms)

*Statistical Features*

In [None]:
# Summary statistics of the numeric columns.
df.describe()

## Top Average Meta Scores by Year

In [None]:
# Average every numeric column per release year. numeric_only=True is required
# on pandas >= 2.0, where mean() no longer silently drops the string columns
# ('Platform', 'Date') and raises TypeError instead.
dfx_year = df.groupby('Year').mean(numeric_only=True)

# Ten years with the highest mean Meta Score, plotted in ascending year order.
df10 = dfx_year.nlargest(10, 'Meta Score')
df10[['Meta Score']].sort_index().plot.bar(figsize=(10, 5))

## Top Average User Scores by Year

In [None]:
# Ten years with the highest mean User Score, regrouped by year for the bar chart.
df10 = dfx_year.nlargest(n=10, columns='User Score')
user_by_year = df10[['User Score']].groupby(['Year']).mean()
user_by_year.plot.bar(figsize=(10, 5))

## Scatter plot of average Meta Score vs. average User Score by year

In [None]:
# One point per year: mean Meta Score on the x-axis, mean User Score on the y-axis.
dfx_year.plot.scatter(x='Meta Score', y='User Score', c='DarkBlue')

## Top Average Meta Scores by Platform

In [None]:
# Average every numeric column per platform. numeric_only=True is required on
# pandas >= 2.0, where mean() raises TypeError on the string 'Date' column
# instead of silently dropping it.
dfx_platform = df.groupby('Platform').mean(numeric_only=True)

# Ten platforms with the highest mean Meta Score, plotted in alphabetical order.
df10 = dfx_platform.nlargest(10, 'Meta Score')
df10[['Meta Score']].sort_index().plot.bar(figsize=(10, 5))

## Top Average User Scores by Platform

In [None]:
# Ten platforms with the highest mean User Score, regrouped by platform for the bar chart.
df10 = dfx_platform.nlargest(n=10, columns='User Score')
user_by_platform = df10[['User Score']].groupby(['Platform']).mean()
user_by_platform.plot.bar(figsize=(10, 5))

## Scatter plot of average Meta Score vs. average User Score by platform

In [None]:
# One point per platform: mean Meta Score on the x-axis, mean User Score on the y-axis.
dfx_platform.plot.scatter(x='Meta Score', y='User Score', c='DarkBlue')

*Remove string columns*

In [None]:
# Keep only the numeric columns, in modelling order. Selecting them explicitly
# already leaves out 'Platform' and 'Date'; .copy() makes the result an
# independent frame, so adding columns to it never touches (or warns about) df.
df_numerical = df[['Passed Day', 'Year', 'Meta Score', 'User Score']].copy()
df_numerical.head()

# Regression Models
## Correlation Analysis

In [None]:
# Pairwise correlations of the numeric columns, shown as a colour-graded table
# with two decimals. Styler.set_precision() was removed in pandas 2.0;
# Styler.format(precision=...) is its replacement.
corr = df_numerical.corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

In [None]:
# Features and target: User Score is predicted from the other three columns.
X = df_numerical[['Passed Day', 'Year', 'Meta Score']]
y = df_numerical['User Score']

# Train regression models. A fixed random_state makes the forest (and every
# number derived from it) identical on each Restart & Run All.
# NOTE(review): both models are fitted on all rows; there is no hold-out set.
RF = RandomForestRegressor(random_state=42)
LR = LinearRegression()
RF.fit(X, y)
LR.fit(X, y)

In [None]:
# Predict User Score for the same rows the models were trained on (X),
# using the trained regression models
RF_pred = RF.predict(X)
LR_pred = LR.predict(X)

In [None]:
# Show each model's predictions for the training rows side by side, one marker
# per row of X. The x-axis is just the row position, so its ticks are hidden.
fig, axs = plt.subplots(1, 2, figsize=(16, 9))

panels = [
    (axs[0], RF_pred, "b^", "RandomForestRegressor"),
    (axs[1], LR_pred, "ys", "LinearRegression"),
]
for ax, pred, fmt, label in panels:
    ax.plot(pred, fmt, label=label)
    ax.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
    ax.set_ylabel("predicted")
    ax.set_xlabel("training samples")
    ax.legend(loc="best")
    # Each panel shows a single model; no average is plotted.
    ax.set_title("{} predictions".format(label))

plt.show()

## Calculate RMSE values

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Actual User Score next to each model's prediction for the same row.
pred_PTS = pd.DataFrame({'User Score': y, 'RF_User_Score': RF_pred, 'LR_User_Score': LR_pred})

# RMSE = sqrt(MSE). Taking the root explicitly works on every scikit-learn
# release; the squared=False shortcut was deprecated in 1.4 and removed in 1.6.
methods = ['RandomForestRegressor', 'LinearRegression']
prediction_columns = ['RF_User_Score', 'LR_User_Score']
rmse = [
    float(np.sqrt(mean_squared_error(pred_PTS['User Score'], pred_PTS[column])))
    for column in prediction_columns
]
RMSE = pd.DataFrame({'Method': methods, 'RMSE': rmse})
RMSE

In [None]:
# Bar chart comparing the RMSE of the two models.
RMSE.plot.bar(x = 'Method', y = 'RMSE', figsize=(10, 5))

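*From the table and graph above, we can see that the Random Forest Regressor performs best; its RMSE is less than 1. Note that both RMSE values are computed on the same rows the models were trained on, so they measure goodness of fit rather than accuracy on unseen data.
Now let's predict the User Score for all rows.*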

# Predict the User Score using other features

In [None]:
# Attach the random-forest prediction for every row as a new column and preview it.
RF_pred = RF.predict(X)
df_numerical = df_numerical.assign(pred_User_Score=RF_pred)
df_numerical.head(10)