In [1]:
import re
import pandas as pd
import spacy
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the news dataset from bbc_news.csv (path is relative to the notebook's
# working directory).
bbc_data = pd.read_csv("bbc_news.csv")

# Replace missing headlines with "" *before* casting to str: astype(str) on its
# own converts NaN into the literal text "nan", which the later preprocessing
# and tagging steps would treat as a real word.
titles = bbc_data["title"].fillna("").astype(str)

bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


## Text Preprocessing

The text is cleaned by:
- Lowercasing
- Removing punctuation
- Removing stopwords
- Lemmatization


In [3]:
# Make sure the NLTK corpora used below are installed, so this cell also works
# after "Restart Kernel -> Run All" on a machine that has never downloaded them.
for _resource, _package in (
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
):
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package, quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Any character that is neither a word character nor whitespace.
_PUNCT_RE = re.compile(r"[^\w\s]")
# The same character class, but only where it leads or trails a token.
_EDGE_PUNCT_RE = re.compile(r"^[^\w\s]+|[^\w\s]+$")


def clean_text(text):
    """Normalise one headline for token counting.

    Steps, applied per whitespace-separated token of the lowercased text:

    1. Strip punctuation (tokens that consist only of punctuation vanish).
    2. Drop the token if it is an English stopword.
    3. Lemmatize what is left with ``WordNetLemmatizer.lemmatize`` (called
       without a part-of-speech argument, so the lemmatizer's default applies).

    The stopword check is done twice: once on the token with only its
    surrounding punctuation trimmed, and once on the fully stripped token.
    The first check is what lets apostrophe entries of the stopword list
    (e.g. "don't", "you're") match; stripping punctuation first would turn
    them into "dont" / "youre", which are not in the list and would survive.

    Args:
        text: The headline as a ``str``.

    Returns:
        The remaining lemmatized tokens joined by single spaces; an empty
        string if nothing remains.
    """
    kept = []
    for raw in text.lower().split():
        trimmed = _EDGE_PUNCT_RE.sub("", raw)
        bare = _PUNCT_RE.sub("", trimmed)
        if not bare:
            # Token was punctuation only (e.g. a free-standing dash).
            continue
        if trimmed in stop_words or bare in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(bare))
    return " ".join(kept)


titles_clean = titles.apply(clean_text)
titles_clean.head()


0                                          refuse work
1    liz truss brief world reacts uk political turmoil
2       rationing energy nothing new offgrid community
3         hunt superyachts sanctioned russian oligarch
4             platinum jubilee 70 year queen 70 second
Name: title, dtype: object

In [4]:
# Load the small English spaCy pipeline once, then push every cleaned headline
# through it in a single batched pass, keeping the resulting Doc objects.
# NOTE(review): the pipeline is fed lowercased, stopword-stripped text; the
# counts further down (e.g. "ukraine" tagged VERB, "2022" as CARDINAL) suggest
# tagging quality suffers from that - consider parsing the raw `titles`.
nlp = spacy.load("en_core_web_sm")
docs = [parsed for parsed in nlp.pipe(titles_clean)]

In [5]:
# One record per alphabetic token: its text plus the part-of-speech label
# (`pos_`) that spaCy assigned. Non-alphabetic tokens such as numbers are
# skipped.
pos_records = [
    {"token": token.text, "pos_tag": token.pos_}
    for doc in docs
    for token in doc
    if token.is_alpha
]

pos_df = pd.DataFrame(pos_records)
pos_df.head()

Unnamed: 0,token,pos_tag
0,refuse,VERB
1,work,NOUN
2,liz,PROPN
3,truss,PROPN
4,brief,ADJ


In [6]:
# Frequency of every (token, POS tag) pair, most frequent pairs first.
pos_pair_sizes = pos_df.groupby(["token", "pos_tag"]).size()
pos_counts = pos_pair_sizes.reset_index(name="count").sort_values(
    "count", ascending=False
)

pos_counts.head(10)

Unnamed: 0,token,pos_tag,count
781,cup,PROPN,45
1068,england,PROPN,43
3817,world,PROPN,40
3582,uk,PROPN,37
2947,say,VERB,36
3713,war,NOUN,35
3587,ukraine,VERB,34
2276,new,ADJ,31
2901,russian,ADJ,22
817,day,NOUN,21


In [7]:
# One record per named-entity span found by spaCy: the span's text together
# with its entity label (`label_`).
ner_records = [
    {"token": ent.text, "ner_tag": ent.label_}
    for doc in docs
    for ent in doc.ents
]

ner_df = pd.DataFrame(ner_records)
ner_df.head()

Unnamed: 0,token,ner_tag
0,russian,NORP
1,70 year,DATE
2,70 second,TIME
3,bull,ORG
4,1,CARDINAL


In [8]:
# Frequency of every (entity text, entity label) pair, most frequent first.
ner_pair_sizes = ner_df.groupby(["token", "ner_tag"]).size()
ner_counts = ner_pair_sizes.reset_index(name="count").sort_values(
    "count", ascending=False
)

ner_counts.head(10)

Unnamed: 0,token,ner_tag,count
640,ukraine,GPE,32
34,2022,CARDINAL,28
547,russian,NORP,25
635,uk,GPE,20
270,first,ORDINAL,14
35,2022,DATE,12
280,france,GPE,11
546,russia,GPE,10
617,tory,NORP,9
398,liverpool,GPE,8
