In [4]:
#This is Python's requests library.
import requests
#The variable 'url' will contain the page's url.
url = 'https://www.allsides.com/media-bias/media-bias-ratings'
#The variable 'r' will contain all requests made to the url.
r = requests.get(url)
#Let's print a fraction of the HTML characters that will appear.
print(r.content[:100])

b'<!DOCTYPE html>\n<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->\n<!--[if lte'


In [5]:
#BeautifulSoup is the Python library that pulls data from HTML and XML files.
from bs4 import BeautifulSoup
#Next, we'll make a variable called soup. The url's content goes inside the brackets. Then, we call the content of the variable 'r' and specify the type of parser to use.
soup = BeautifulSoup(r.content, 'html.parser')

In [6]:
rows = soup.select('tbody tr')

In [7]:
print(rows)

[<tr class="odd views-row-first">
<td class="views-field views-field-title source-title">
<a href="/news-source/abc-news-media-bias">ABC News</a> </td>
<td class="views-field views-field-field-bias-image">
<a href="/media-bias/left-center"><img alt="Political News Media Bias Rating: Lean Left" height="24" src="https://www.allsides.com/sites/default/files/styles/bias144x24/public/bias-leaning-left.png?itok=mtG3ToEN" title="Political News Media Bias Rating: Lean Left" typeof="foaf:Image" width="144"/></a> </td>
<td class="views-field views-field-nothing-1 what-do-you-think">
<div class="agree-disagree-widget"><div class="rate-widget-4 rate-widget clear-block rate-average rate-widget-yesno rate-f13adc4a6caee57d894f9d7ef192d7f3 rate-node-76-4-1" id="rate-node-76-4-1">
<div class="item-list"><ul><li class="first"><a class="rate-button rate-btn" href="/media-bias/media-bias-ratings?rate=zmyDLixxBj_ytZsb_p9D32b0gPWtDEaeOSu_YIdkTIc" id="rate-button-3" rel="nofollow" title="agree">agree</a></li

In [9]:
#Get the row whose number is 0.
row = rows[0]
#Create a variable called 'name' that calls the the variable row with option '.select_one'. 
#Use .text to get all the text in an element.
name = row.select_one('.source-title').text.strip()
print(name)

ABC News


In [10]:
#Select the anchor in order to get the href that contains the link. This is different than selecting an element because you have to use the attribute.
allsides_page = row.select_one('.source-title a')['href']
allsides_page = 'https://www.allsides.com' + allsides_page

print(allsides_page)

https://www.allsides.com/news-source/abc-news-media-bias


In [13]:
#Let's get the rating in words by selecting the link.
#Create a variable called bias to put the link.
bias = row.select_one('.views-field-field-bias-image a')['href']
bias = bias.split('/')[-1]

print(bias)

left-center


In [17]:
#Create a variable called 'agree' and select one row with the class 'agree'.
agree = row.select_one('.agree').text
#The result must be turned into an integer (whole number).
agree = int(agree)
#Create a variable called 'disagree' and select the row with the class 'disagree'.
disagree = row.select_one('.disagree').text
#Turn result into intger.
disagree = int(disagree)
#Create a variable that figures out ratio of agree/disagree ratings
agree_ratio = agree/disagree
#Print results
print(f"Agree: {agree}, Disagree: {disagree}, Ratio {agree_ratio:.2f}")

Agree: 11215, Disagree: 8214, Ratio 1.37


In [18]:
#Create a function that replicates the table's process of getting the community response ("somewhat agree", somewhat disagree).
def get_agreeance_text(ratio):
    if ratio > 3: return "absolutely agrees"
    elif 2 < ratio <= 3: return "strongly agrees"
    elif 1.5 < ratio <= 2: return "agrees"
    elif 1 < ratio <= 1.5: return "somewhat agrees"
    elif ratio == 1: return "neutral"
    elif 0.67 < ratio < 1: return "somewhat disagrees"
    elif 0.5 < ratio <= 0.67: return "disagrees"
    elif 0.33 < ratio <= 0.5: return "strongly disagrees"
    elif ratio <= 0.33: return "absolutely disagrees"
    else: return None
#Print the result for a ratio of 2.5.
print(get_agreeance_text(2.5))

strongly agrees


In [19]:
#Create a loop that takes data from every row on the first page.
data = []
for row in rows:
    d = dict()
    d['name'] = row.select_one('.source-title').text.strip()
    d['allsides_page'] = 'https://www.allsides.com' + row.select_one('.source-title a')['href']
    d['bias'] = row.select_one('.views-field-field-bias-image a')['href'].split('/')[-1]
    d['agree'] = int(row.select_one('.agree').text)
    d['disagree'] = int(row.select_one('.disagree').text)
    d['agree_ratio'] = d['agree'] / d['disagree']
    d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])
    
    data.append(d)   

In [20]:
#Print the data list to see the dictionary of key info in each row.
print(data[0])

{'name': 'ABC News', 'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias', 'bias': 'left-center', 'agree': 11215, 'disagree': 8214, 'agree_ratio': 1.3653518383248113, 'agreeance_text': 'somewhat agrees'}


In [21]:
#Create a variable called 'pages' that include urls of all 3 pages.
pages = [
     'https://www.allsides.com/media-bias/media-bias-ratings',
    'https://www.allsides.com/media-bias/media-bias-ratings?page=1',
    'https://www.allsides.com/media-bias/media-bias-ratings?page=2'
]

In [22]:
#Import Python library to allow 10-second pause between requests.
from time import sleep
data = []
#Create variables for loop.
for page in pages:
    r = requests.get(page)
    soup = BeautifulSoup(r.content, 'html.parser')
    rows = soup.select('tbody tr')

    for row in rows:
        d = dict()

        d['name'] = row.select_one('.source-title').text.strip()
        d['allsides_page'] = 'https://www.allsides.com' + row.select_one('.source-title a')['href']
        d['bias'] = row.select_one('.views-field-field-bias-image a')['href'].split('/')[-1]
        d['agree'] = int(row.select_one('.agree').text)
        d['disagree'] = int(row.select_one('.disagree').text)
        d['agree_ratio'] = d['agree'] / d['disagree']
        d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])

        data.append(d)
    
    sleep(10)

In [24]:
#Install this library to give us progress bar for status of requests.
!pip3 install tqdm

[33mYou are using pip version 18.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [25]:
for d in tqdm_notebook(data):
    r = requests.get(d['allsides_page'])
    soup = BeautifulSoup(r.content, 'html.parser')
    
    try:
        website = soup.select_one('.www')['href']
        d['website'] = website
    except TypeError:
        pass
    
    sleep(10)

NameError: name 'tqdm_notebook' is not defined

In [26]:
#Let's save our data as JSON.
import json

with open('allsides.json', 'w') as f:
    json.dump(data, f)

In [27]:
#Let's do some data analysis. Which bias ratings does the community completely agree with?
#Create a variable that includes all the strongest ratings.
abs_agree = [d for d in data if d ['agreeance_text'] == 'absolutely agrees']
print(f"{'Outlet':<20} {'Bias':<20}")
print("-" * 30)

for d in abs_agree:
    print(f"{d['name']:<20} {d['bias']:<20}")

Outlet               Bias                
------------------------------
CNN - Editorial      left                
Fox News Editorial   right               
Jacobin              left                
Mother Jones         left                
New York Times - Opinion left                
The Nation           left                
The New Yorker       left                
ThinkProgress        left                


In [28]:
#Let's use Pandas to put the JSON data into a DataFrame.
import pandas as pd 
df = pd.read_json(open('allsides.json', 'r'))
df.set_index('name', inplace=True)
df.head()

Unnamed: 0_level_0,agree,agree_ratio,agreeance_text,allsides_page,bias,disagree
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABC News,11215,1.365352,somewhat agrees,https://www.allsides.com/news-source/abc-news-...,left-center,8214
Al Jazeera,3249,0.649021,disagrees,https://www.allsides.com/news-source/al-jazeer...,center,5006
AlterNet,2079,2.907692,strongly agrees,https://www.allsides.com/news-source/alternet,left,715
American Spectator,5765,2.363674,strongly agrees,https://www.allsides.com/news-source/american-...,right,2439
Associated Press,6159,1.424376,somewhat agrees,https://www.allsides.com/news-source/associate...,center,4324


In [29]:
#Let's find the ratings for which the community strongly disagrees.
df[df['agreeance_text'] == 'strongly disagrees']

Unnamed: 0_level_0,agree,agree_ratio,agreeance_text,allsides_page,bias,disagree
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
The Economist,1194,0.497707,strongly disagrees,https://www.allsides.com/news-source/economist,left-center,2399


In [30]:
#Time for a visualization of the community reactions with the most votes.
df['total_votes'] = df['agree'] + df['disagree']
df.sort_values('total_votes', ascending=False, inplace=True)

df.head(10)

Unnamed: 0_level_0,agree,agree_ratio,agreeance_text,allsides_page,bias,disagree,total_votes
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CNN (Web News),25513,0.970039,somewhat disagrees,https://www.allsides.com/news-source/cnn-media...,left-center,26301,51814
Fox News,20030,0.675411,somewhat disagrees,https://www.allsides.com/news-source/fox-news-...,right-center,29656,49686
New York Times - News,14175,0.600203,disagrees,https://www.allsides.com/news-source/new-york-...,left-center,23617,37792
Washington Post,22944,1.622975,agrees,https://www.allsides.com/news-source/washingto...,left-center,14137,37081
HuffPost,17812,0.929597,somewhat disagrees,https://www.allsides.com/news-source/huffpost-...,left,19161,36973
Politico,12783,0.628126,disagrees,https://www.allsides.com/news-source/politico-...,left-center,20351,33134
NPR News,18112,1.383546,somewhat agrees,https://www.allsides.com/news-source/npr-media...,center,13091,31203
Washington Times,20265,1.982101,agrees,https://www.allsides.com/news-source/washingto...,right-center,10224,30489
Wall Street Journal - News,11393,0.666959,disagrees,https://www.allsides.com/news-source/wall-stre...,center,17082,28475
Townhall,8550,0.639013,disagrees,https://www.allsides.com/news-source/townhall-...,right,13380,21930


In [35]:
#Let's create a second dataframe that only includes data we want to plot.
df2 = df.head(25).copy()

df2.head()

Unnamed: 0_level_0,agree,agree_ratio,agreeance_text,allsides_page,bias,disagree,total_votes
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CNN (Web News),25513,0.970039,somewhat disagrees,https://www.allsides.com/news-source/cnn-media...,left-center,26301,51814
Fox News,20030,0.675411,somewhat disagrees,https://www.allsides.com/news-source/fox-news-...,right-center,29656,49686
New York Times - News,14175,0.600203,disagrees,https://www.allsides.com/news-source/new-york-...,left-center,23617,37792
Washington Post,22944,1.622975,agrees,https://www.allsides.com/news-source/washingto...,left-center,14137,37081
HuffPost,17812,0.929597,somewhat disagrees,https://www.allsides.com/news-source/huffpost-...,left,19161,36973


In [34]:
#Let's create a stacked bar chart.

TypeError: list indices must be integers or slices, not str