Import modules from web app

In [6]:
import sys, os.path

# Directory holding the web app's modules, resolved relative to the current
# working directory. Every component goes through os.path.join so no
# separator is hard-coded.
modules_dir = os.path.join(os.path.abspath(""), "..", "main", "modules")

# Guard the append so re-running this cell does not pile duplicate entries
# onto sys.path.
if modules_dir not in sys.path:
  sys.path.append(modules_dir)

import scraping

Retrieve HTML text from website

In [7]:
# Request "bbc.co.uk" through the web app's scraping module, then show the
# response's status code followed by the first 100 characters of its text.
response = scraping.request("bbc.co.uk")
print(response.status_code, response.text[:100], sep="\n")

200
<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" 


Process the HTML text into tokens

In [8]:
# Run the fetched HTML text through the web app's processing step, then
# report how many tokens came back and preview the first ten.
tokens = scraping.process(response.text)
print(len(tokens), tokens[:10], sep="\n")

78
['Superstar violinist Nicola Benedetti delights the Proms', 'Three of the strangest organs in the animal kingdom', "'Naive' to set Covid restrictions end date - Mallon", "Robin's sexuality 'missing piece' of Batman story", "'We drove from Ireland to Australia in a camper van'", 'Kutcher and Kunis spark bathing debate', "'One signing could decide the title race, but it's not Harry Kane'", "Haye returns to fight 'overconfident billionaire buddy'", 'The ghosts are back for more supernatural shenanigans. IPlayer-Video', 'What are the fan tokens given to Messi by PSG?']


Score each of the tokens using...
- [AFINN](https://github.com/fnielsen/afinn) = wordlist-based approach
- [VADER](https://github.com/cjhutto/vaderSentiment) = lexicon and rule-based approach

In [9]:
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every token with both sentiment analysers and collect the results
# as a list of {"token", "afinn_score", "vader_score"} records.
afinn = Afinn()
vader = SentimentIntensityAnalyzer()
scored_tokens = []

for token in tokens:
  # afinn.score() adds up the scores of the individual words, so longer
  # phrases get larger magnitudes. Dividing by the number of words gives a
  # per-word average that is comparable across phrases.
  word_count = len(token.split())
  # An empty or whitespace-only token has no words: score it as neutral
  # rather than raising ZeroDivisionError.
  afinn_score = afinn.score(token) / word_count if word_count else 0.0

  # polarity_scores() returns a dict; only its "compound" entry is kept.
  vader_score = vader.polarity_scores(token)["compound"]

  scored_tokens.append({"token": token, "afinn_score": afinn_score, "vader_score": vader_score})

print(len(scored_tokens), scored_tokens[0:10])

78 [{'token': 'Superstar violinist Nicola Benedetti delights the Proms', 'afinn_score': 0.42857142857142855, 'vader_score': 0.4588}, {'token': 'Three of the strangest organs in the animal kingdom', 'afinn_score': 0.0, 'vader_score': 0.0}, {'token': "'Naive' to set Covid restrictions end date - Mallon", 'afinn_score': -0.2222222222222222, 'vader_score': -0.2732}, {'token': "Robin's sexuality 'missing piece' of Batman story", 'afinn_score': -0.2857142857142857, 'vader_score': -0.296}, {'token': "'We drove from Ireland to Australia in a camper van'", 'afinn_score': 0.0, 'vader_score': 0.0}, {'token': 'Kutcher and Kunis spark bathing debate', 'afinn_score': 0.16666666666666666, 'vader_score': 0.2263}, {'token': "'One signing could decide the title race, but it's not Harry Kane'", 'afinn_score': 0.0, 'vader_score': 0.0}, {'token': "Haye returns to fight 'overconfident billionaire buddy'", 'afinn_score': -0.14285714285714285, 'vader_score': -0.3818}, {'token': 'The ghosts are back for more s

Analyze data with `pandas`

In [19]:
import pandas as pd

# Load the scored-token records into a DataFrame and print summary
# statistics for its score columns.
token_df = pd.DataFrame(data=scored_tokens)

score_summary = token_df.describe()
print(score_summary)

       afinn_score  vader_score
count    78.000000    78.000000
mean     -0.010354    -0.030365
std       0.261022     0.373667
min      -0.714286    -0.831600
25%      -0.142857    -0.274925
50%       0.000000     0.000000
75%       0.000000     0.214200
max       0.750000     0.778300
