Import modules from the web app

In [1]:
import sys, os.path

# Directory holding the web app's modules: <working dir>/../main/modules.
# Every component goes through os.path.join so the separators are correct on
# any platform, rather than hand-concatenating a "/"-separated suffix.
modules_dir = os.path.join(os.path.abspath(''), "..", "main", "modules")

# Re-running this cell must not keep appending the same entry to sys.path.
if modules_dir not in sys.path:
  sys.path.append(modules_dir)

import scraping

Retrieve HTML text from website

In [2]:
# Fetch the page through the web app's scraping helper.
response = scraping.request("bbc.co.uk")

# Show the status code, then the first 100 characters of the body, one per line.
print(response.status_code, response.text[:100], sep="\n")

200
<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" 


Process the HTML text into tokens

In [3]:
# Hand the raw HTML to the web app's processing helper to get the tokens.
tokens = scraping.process(response.text)

# Show how many tokens came back, then the first ten, one per line.
print(len(tokens), tokens[:10], sep="\n")

79
['Germany fears thousands got saline, not vaccine', 'Olympian given new medal after first got bitten', 'Olympian Adam Peaty joins the all-star Strictly 2021 line-up', 'Steal yourself a movie night tonight... iPlayer', 'You might have missed', "'Oh, my god!' Victoria Derbyshire's on-air news nightmare", 'Looking back on 40 years of Indiana Jones. Audio', 'Best of BBC iPlayer', "Why the Tourette's queen of Twitch hasn't been banned", 'Decades-old lesson found on hidden blackboard']


Score each of the tokens using...
- [AFINN](https://github.com/fnielsen/afinn) = wordlist-based approach
- [VADER](https://github.com/cjhutto/vaderSentiment) = lexicon- and rule-based approach

In [4]:
from afinn import Afinn

afinn = Afinn()
scored_tokens = []

for token in tokens:
  # afinn.score() adds up the scores of the individual words in the phrase,
  # so divide by the number of words to make phrases of different lengths
  # comparable.
  word_count = len(token.split())
  # An empty or whitespace-only token has no words; give it a neutral 0.0
  # instead of raising ZeroDivisionError.
  afinn_score = afinn.score(token) / word_count if word_count else 0.0
  scored_tokens.append({"token": token, "afinn_score": afinn_score})

print(len(scored_tokens), scored_tokens[0:10])

79 [{'token': 'Germany fears thousands got saline, not vaccine', 'afinn_score': 0.0}, {'token': 'Olympian given new medal after first got bitten', 'afinn_score': 0.375}, {'token': 'Olympian Adam Peaty joins the all-star Strictly 2021 line-up', 'afinn_score': 0.0}, {'token': 'Steal yourself a movie night tonight... iPlayer', 'afinn_score': -0.2857142857142857}, {'token': 'You might have missed', 'afinn_score': -0.5}, {'token': "'Oh, my god!' Victoria Derbyshire's on-air news nightmare", 'afinn_score': 0.125}, {'token': 'Looking back on 40 years of Indiana Jones. Audio', 'afinn_score': 0.0}, {'token': 'Best of BBC iPlayer', 'afinn_score': 0.75}, {'token': "Why the Tourette's queen of Twitch hasn't been banned", 'afinn_score': -0.2222222222222222}, {'token': 'Decades-old lesson found on hidden blackboard', 'afinn_score': 0.0}]


Analyze data with `pandas`

In [5]:
import pandas as pd

# Tabulate the scored records: one row per token, columns taken from the dict keys.
token_df = pd.DataFrame(data=scored_tokens)

# Plain-text dump of the frame.
print(token_df)

                                                token  afinn_score
0     Germany fears thousands got saline, not vaccine     0.000000
1     Olympian given new medal after first got bitten     0.375000
2   Olympian Adam Peaty joins the all-star Strictl...     0.000000
3     Steal yourself a movie night tonight... iPlayer    -0.285714
4                               You might have missed    -0.500000
..                                                ...          ...
74     One mystery, four suspects, many lies. iPlayer    -0.142857
75       England face India on day two of second Test     0.000000
76  Swimmer taking on 'coldest swim on Earth' to h...     0.181818
77  See the latest results, including Lotto, EuroM...     0.000000
78  Britney Spears' father to step down as conserv...     0.000000

[79 rows x 2 columns]
