In [87]:
import requests
from bs4 import BeautifulSoup
from warnings import warn

In [88]:
import time

#explicitly calling the urls since there are only two pages, otherwise we might want to iterate
urls = ['http://www.metacritic.com/browse/albums/publication/score?num_items=100&page=0','http://www.metacritic.com/browse/albums/publication/score?num_items=100&page=1']
publications_list = []

start = time.time()
for u in urls:
    page = requests.get(u, headers={'User-Agent': 'Mozilla/5.0'})
    if page.status_code != 200:
        warn("Status code for " + u + "was " + u.status_code())
    else:
        print("processing " + u + "...")
    soup = BeautifulSoup(page.text, 'html.parser')
    publications = soup.findAll('div',attrs={'class':'product_wrap'})
    publications_list.extend(publications)

end = time.time()
print("execution time: " + str(end - start) + "s")

processing http://www.metacritic.com/browse/albums/publication/score?num_items=100&page=0...
processing http://www.metacritic.com/browse/albums/publication/score?num_items=100&page=1...
execution time: 3.680203914642334s


In [89]:
names = []
review_counts = []
review_averages = []

def get_num(x):
    return int(''.join(ele for ele in x if ele.isdigit()))

start = time.time()
for p in publications_list:
    #finding the title div element containing publication name and total # reviews
    title = p.find('div',attrs={'class':'title'})
    
    #name
    publication_name = title.find("a").get_text()
    names.append(publication_name)
    
    #total album review count
    publication_album_count = title.find('span',attrs={'class':'count'}).get_text()
    publication_album_count = get_num(publication_album_count)
    review_counts.append(publication_album_count)
    
    #average album review score    
    publication_average_score = p.select("span[class*=average_score]")[0].get_text()
    review_averages.append(int(publication_average_score))

end = time.time()
print("execution time: " + str(end - start) + "s")

execution time: 0.04835081100463867s


In [90]:
import pandas as pd

In [187]:
# creating our macro-level data frame
macro_df = pd.DataFrame({
    'publication': names,
    'total reviews': review_counts,
    'average score': review_averages
})

print(macro_df.info())
macro_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 3 columns):
average score    129 non-null int64
publication      129 non-null object
total reviews    129 non-null int64
dtypes: int64(2), object(1)
memory usage: 3.1+ KB
None


Unnamed: 0,average score,publication,total reviews
0,91,Expert Witness (MSN Music),232
1,86,MSN Consumer Guide (Robert Christgau),178
2,86,Alternative Press (Record Of The Week),5
3,85,Okayplayer,148
4,83,Ink Blot Magazine,85
5,82,Village Voice (Consumer Guide),223
6,81,Absolute Punk (Staff reviews),811
7,80,Country Weekly,157
8,79,Metal Hammer (UK),25
9,78,Mixmag,428


In [139]:
import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

In [183]:
import numpy as np
print("Average Album Review")
np.average(macro_df['average score'], weights=macro_df['total reviews'])

print("Summary Statistics: Average Publication Scores")
print(macro_df['average score'].describe())
print("median score: " + str(macro_df['average score'].median()))

print("Summary Statistics: Average Publication Review Count")
print(macro_df['total reviews'].describe())

print(str(macro_df['total reviews'].sum()))

Average Album Review
Summary Statistics: Average Publication Scores
count    129.000000
mean      72.348837
std        4.642957
min       60.000000
25%       70.000000
50%       72.000000
75%       74.000000
max       91.000000
Name: average score, dtype: float64
median score: 72.0
Summary Statistics: Average Publication Review Count
count      129.000000
mean      1460.534884
std       2066.077337
min          3.000000
25%        158.000000
50%        712.000000
75%       1822.000000
max      12779.000000
Name: total reviews, dtype: float64
188409


In [182]:
data1 = go.Histogram(x = macro_df['average score'],)
layout1 = go.Layout(
    title= 'Average Scores Across Publications',
    hovermode= 'closest',
    xaxis = dict(
        title= 'Average Score',
    ),
    yaxis = dict(
        title = 'Number of Publications'
    )
)

fig1 = dict(data=[data1], layout=layout1)
iplot(fig1, filename='basic histogram')

data2 = go.Histogram(x=macro_df['total reviews'])
layout2 = go.Layout(
    title= 'Average Scores Across Publications',
    hovermode= 'closest',
    xaxis = dict(
        title= 'Number of Reviews',
    ),
    yaxis = dict(
        title = 'Number of Publications'
    )
)

fig2_1 = dict(data=[data2], layout=layout2)
iplot(fig2, filename='basic histogram')

In [180]:
N = 500

data3 = go.Scatter(
    x = macro_df['total reviews'],
    y = macro_df['average score'],
    text = macro_df['publication'],
    name = 'Above',
    mode = 'markers',
    marker = dict(
        size = 5,
        color = 'rgba(152, 0, 0, .8)',
        line = dict(
            width = 2,
            color = 'rgb(0, 0, 0)'
        )
    )
)

layout3= go.Layout(
    title= 'Average Rating vs. Number of Reviews By Publication',
    hovermode= 'closest',
    xaxis= dict(
        title= 'Number of Reviews',
        #type = 'log',
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Average Rating (100 max)',
        gridwidth= 2,
    ),
    showlegend= False
)

z = np.polyfit(macro_df['total reviews'],macro_df['average score'],1)
p = np.poly1d(z) 

print("Linear Regression: " + str(p))

fig3 = dict(data=[data3], layout=layout3)
iplot(fig3, filename='styled-scatter')

Linear Regression:  
-0.000494 x + 73.07


In [196]:
macro_df['paginations'] = macro_df['total reviews']
macro_df.loc[:,'paginations'] *= (0.01)
macro_df['paginations'] = np.ceil(macro_df['paginations'])
macro_df

Unnamed: 0,average score,publication,total reviews,paginations
0,91,Expert Witness (MSN Music),232,3.0
1,86,MSN Consumer Guide (Robert Christgau),178,2.0
2,86,Alternative Press (Record Of The Week),5,1.0
3,85,Okayplayer,148,2.0
4,83,Ink Blot Magazine,85,1.0
5,82,Village Voice (Consumer Guide),223,3.0
6,81,Absolute Punk (Staff reviews),811,9.0
7,80,Country Weekly,157,2.0
8,79,Metal Hammer (UK),25,1.0
9,78,Mixmag,428,5.0


In [199]:
print("Total links to follow via publication pages:")
macro_df['paginations'].sum()

Total links to follow via publication pages:


1951.0