In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib.pyplot as plt

## Load Data

In [3]:
bbc_data = pd.read_csv("bbc_news.csv")

In [4]:
bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [5]:
bbc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [7]:
titles = pd.DataFrame(bbc_data['title'])

In [15]:
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


## Clean Data


In [26]:
# lowercase
titles['lowercase'] = titles['title'].str.lower()
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,token_raw,tokens_clean,tokens_clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refusework,"[Can, I, refuse, to, work, ?]",[refusework],[refusework]
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liztrussbriefworldreactsukpoliticalturmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...",[liztrussbriefworldreactsukpoliticalturmoil],[liztrussbriefworldreactsukpoliticalturmoil]
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationingenergynothingnewoffgridcommunity,"[Rationing, energy, is, nothing, new, for, off...",[rationingenergynothingnewoffgridcommunity],[rationingenergynothingnewoffgridcommunity]
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,huntsuperyachtssanctionedrussianoligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...",[huntsuperyachtssanctionedrussianoligarchs],[huntsuperyachtssanctionedrussianoligarchs]
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinumjubilee70yearsqueen70seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...",[platinumjubilee70yearsqueen70seconds],[platinumjubilee70yearsqueen70seconds]


In [27]:
# stop word removal
en_stopwords = stopwords.words('english')
titles['no_stopwords'] = titles['lowercase'].apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,token_raw,tokens_clean,tokens_clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refusework,"[Can, I, refuse, to, work, ?]",[refusework],[refusework]
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liztrussbriefworldreactsukpoliticalturmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...",[liztrussbriefworldreactsukpoliticalturmoil],[liztrussbriefworldreactsukpoliticalturmoil]
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationingenergynothingnewoffgridcommunity,"[Rationing, energy, is, nothing, new, for, off...",[rationingenergynothingnewoffgridcommunity],[rationingenergynothingnewoffgridcommunity]
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,huntsuperyachtssanctionedrussianoligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...",[huntsuperyachtssanctionedrussianoligarchs],[huntsuperyachtssanctionedrussianoligarchs]
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinumjubilee70yearsqueen70seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...",[platinumjubilee70yearsqueen70seconds],[platinumjubilee70yearsqueen70seconds]


In [28]:
# punctuation removal
titles['no_stopwords_no_punct'] = titles.apply(lambda x: re.sub(r"([^\w\w])", "", x['no_stopwords']), axis=1)
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,token_raw,tokens_clean,tokens_clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refusework,"[Can, I, refuse, to, work, ?]",[refusework],[refusework]
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liztrussbriefworldreactsukpoliticalturmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...",[liztrussbriefworldreactsukpoliticalturmoil],[liztrussbriefworldreactsukpoliticalturmoil]
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationingenergynothingnewoffgridcommunity,"[Rationing, energy, is, nothing, new, for, off...",[rationingenergynothingnewoffgridcommunity],[rationingenergynothingnewoffgridcommunity]
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,huntsuperyachtssanctionedrussianoligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...",[huntsuperyachtssanctionedrussianoligarchs],[huntsuperyachtssanctionedrussianoligarchs]
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinumjubilee70yearsqueen70seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...",[platinumjubilee70yearsqueen70seconds],[platinumjubilee70yearsqueen70seconds]


In [32]:
# tokenize
titles['tokens_raw'] = titles.apply(lambda x: word_tokenize(x['title']), axis=1)
titles['tokens_clean'] = titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punct']), axis=1)
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,token_raw,tokens_clean,tokens_clean_lemmatized,tokens_raw
0,Can I refuse to work?,can i refuse to work?,refuse work?,refusework,"[Can, I, refuse, to, work, ?]",[refusework],[refusework],"[Can, I, refuse, to, work, ?]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liztrussbriefworldreactsukpoliticalturmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...",[liztrussbriefworldreactsukpoliticalturmoil],[liztrussbriefworldreactsukpoliticalturmoil],"['Liz, Truss, the, Brief, ?, ', World, reacts,..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationingenergynothingnewoffgridcommunity,"[Rationing, energy, is, nothing, new, for, off...",[rationingenergynothingnewoffgridcommunity],[rationingenergynothingnewoffgridcommunity],"[Rationing, energy, is, nothing, new, for, off..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,huntsuperyachtssanctionedrussianoligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...",[huntsuperyachtssanctionedrussianoligarchs],[huntsuperyachtssanctionedrussianoligarchs],"[The, hunt, for, superyachts, of, sanctioned, ..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinumjubilee70yearsqueen70seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...",[platinumjubilee70yearsqueen70seconds],[platinumjubilee70yearsqueen70seconds],"[Platinum, Jubilee, :, 70, years, of, the, Que..."


In [33]:
#lemmatize
lemmatizer = WordNetLemmatizer()
titles["tokens_clean_lemmatized"] = titles["tokens_clean"].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]) 
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,token_raw,tokens_clean,tokens_clean_lemmatized,tokens_raw
0,Can I refuse to work?,can i refuse to work?,refuse work?,refusework,"[Can, I, refuse, to, work, ?]",[refusework],[refusework],"[Can, I, refuse, to, work, ?]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liztrussbriefworldreactsukpoliticalturmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...",[liztrussbriefworldreactsukpoliticalturmoil],[liztrussbriefworldreactsukpoliticalturmoil],"['Liz, Truss, the, Brief, ?, ', World, reacts,..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationingenergynothingnewoffgridcommunity,"[Rationing, energy, is, nothing, new, for, off...",[rationingenergynothingnewoffgridcommunity],[rationingenergynothingnewoffgridcommunity],"[Rationing, energy, is, nothing, new, for, off..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,huntsuperyachtssanctionedrussianoligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...",[huntsuperyachtssanctionedrussianoligarchs],[huntsuperyachtssanctionedrussianoligarchs],"[The, hunt, for, superyachts, of, sanctioned, ..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinumjubilee70yearsqueen70seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...",[platinumjubilee70yearsqueen70seconds],[platinumjubilee70yearsqueen70seconds],"[Platinum, Jubilee, :, 70, years, of, the, Que..."


In [34]:
# create lists for just our tokens
tokens_raw_list = sum(titles['tokens_raw'], []) #unpack our lists into a single list
tokens_clean_list = sum(titles['tokens_clean_lemmatized'], [])

## POS Tagging

### Raw


In [35]:
nlp = spacy.load('en_core_web_sm')

In [66]:
spacy_doc = nlp(' '.join(tokens_raw_list))
spacy_doc



In [69]:
pos_df = pd.DataFrame(columns = ['token', 'post_tag'])
pos_df.head()

Unnamed: 0,token,post_tag


In [39]:
for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records(
    [{'token':token.text, 'pos_tag' : token.pos_}])], ignore_index=True)

In [44]:
pos_df_counts = pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts',  ascending=False)
pos_df_counts.head(10)
# the raw data as expected has bunch of punctuations and stopwords

Unnamed: 0,token,pos_tag,counts
95,:,PUNCT,543
8,',PUNCT,300
2897,in,ADP,187
4082,to,PART,175
3268,of,ADP,172
22,-,PUNCT,166
4043,the,DET,163
1856,and,CCONJ,147
15,'s,PART,143
97,?,PUNCT,130


In [46]:
# looking into most common nouns in dataset
nouns = pos_df_counts[pos_df_counts.pos_tag == 'NOUN'][0:10]
nouns

Unnamed: 0,token,pos_tag,counts
4267,war,NOUN,35
3552,record,NOUN,15
3416,police,NOUN,14
4316,win,NOUN,14
4356,year,NOUN,14
3061,living,NOUN,13
4009,tax,NOUN,13
3368,people,NOUN,12
2326,day,NOUN,12
2031,boss,NOUN,11


In [47]:
# looking into most common verb in dataset
verbs = pos_df_counts[pos_df_counts.pos_tag == 'VERB'][0:10]
verbs

Unnamed: 0,token,pos_tag,counts
3687,says,VERB,30
9,',VERB,14
2670,found,VERB,13
4317,win,VERB,12
4324,wins,VERB,10
2713,get,VERB,9
2388,dies,VERB,9
3990,take,VERB,8
2982,killed,VERB,8
3745,set,VERB,8


### Clean

In [72]:
spacy_doc_clean = nlp(' '.join(tokens_clean_list))
spacy_doc_clean



In [68]:
pos_df_clean = pd.DataFrame(columns = ['token', 'pos_tag'])
pos_df_clean.head()

Unnamed: 0,token,pos_tag


In [73]:
for token in spacy_doc_clean:
    pos_df_clean = pd.concat([pos_df_clean, pd.DataFrame.from_records(
    [{'token':token.text, 'pos_tag' : token.pos_}])], ignore_index=True)

In [74]:
pos_df_counts_clean = pos_df_clean.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts',  ascending=False)
pos_df_counts_clean.head()

Unnamed: 0,token,pos_tag,counts
895,ukweatherlongcoldsnaplast,ADJ,2
532,muchcouldmortgagerisetrycalculator,PROPN,2
176,counciltaxmuchrising,VERB,2
965,worldathleticschampionshipsgbskeelyhodgkinsonn...,PROPN,1
13,alabamahuntmissingprisoninmateguard,NOUN,1


### NER

In [78]:
ner_df = pd.DataFrame(columns=['token', 'ner_tag'])

for token in spacy_doc.ents:
    if pd.isna(token.label_) is False:
        ner_df = pd.concat([ner_df, pd.DataFrame.from_records(
            [{'token': token.text, 'ner_tag': token.label_}])], ignore_index=True)
        

In [79]:
ner_df.head()

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP


In [84]:
 # most common name entity in the dataset
ner_df_counts = ner_df.groupby(['token', 'ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [86]:
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19
1031,World Cup 2022,EVENT,18
1058,first,ORDINAL,13
918,The Papers,WORK_OF_ART,13
378,France,GPE,12
226,China,GPE,11


In [88]:
# most popular people in dataset
people = ner_df_counts[ner_df_counts.ner_tag == "PERSON"][:10]
people

Unnamed: 0,token,ner_tag,counts
257,Covid,PERSON,9
757,Putin,PERSON,8
760,Queen,PERSON,8
563,Liz Truss,PERSON,6
169,Boris Johnson,PERSON,6
788,Rishi Sunak,PERSON,5
325,Emma Raducanu,PERSON,4
762,Quiz,PERSON,4
581,Macron,PERSON,4
515,Jurgen Klopp,PERSON,4
