### Imports

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

tqdm.pandas()
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chikro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/chikro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load articles

In [3]:
article_df = pd.read_csv("bbc_full.csv").astype(str)
display(article_df.head())

Unnamed: 0,title,published_date,authors,description,section,content,link,top_image
0,Anthony Martial: Man Utd boss Jose Mourinho te...,2017-01-01,,Manchester United forward Anthony Martial shou...,,Last updated on From the section Football Man...,http://www.bbc.co.uk/sport/football/38480502,https://ichef.bbci.co.uk/onesport/cps/624/cpsp...
1,Wayde van Niekerk relives Rio Olympics 400m go...,2017-01-01,,South Africas Wayde van Niekerk relives his hi...,,South Africas Wayde van Niekerk relives his hi...,http://www.bbc.co.uk/sport/athletics/38418992,https://ichef.bbci.co.uk/news/1024/cpsprodpb/0...
2,"Roger Federer can win another Grand Slam, says...",2017-01-01,,Roger Federer can return from a sixmonth injur...,,Roger Federer can return from six months out a...,http://www.bbc.co.uk/sport/tennis/38390891,https://ichef.bbci.co.uk/onesport/cps/624/cpsp...
3,TV and radio stars we lost in 2016 - BBC News,2017-01-01,https://www.facebook.com/bbcnews,A look back at some of the faces and voices fr...,Entertainment & Arts,Magician Paul Daniels died in March aged 77 af...,http://www.bbc.co.uk/news/entertainment-arts-3...,https://ichef.bbci.co.uk/news/976/cpsprodpb/B8...
4,New Year Honours 2017: Andy Murray 'honoured' ...,2017-01-01,,Tennis star Sir Andy Murray says he still feel...,,Tennis star Sir Andy Murray said he still feel...,http://www.bbc.co.uk/news/uk-38481089,https://ichef.bbci.co.uk/news/1024/branded_new...


In [None]:
import numpy as np 
np.mean(list(map(lambda x: len(x), article_df["content"])))

np.float64(609.2649334196873)

### Preprocess

In [50]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # # 1. Lowercase the text
    # text = text.lower()
    
    # # 2. Remove stopwords
    # text = " ".join(word for word in text.split() if word not in stop_words)
    
    # # 3. Remove non-alphanumeric characters
    # text = " ".join(word for word in text.split() if word.isalpha())
    
    # # 4. Lemmatization
    # text = " ".join(lemmatizer.lemmatize(word) for word in text.split())

    text = " ".join(lemmatizer.lemmatize(word) for word in text.lower().split() if word.isalpha() and word not in stop_words)
    
    return text

# Apply preprocessing to the specified columns
article_df['title_cleaned'] = article_df['title'].progress_apply(preprocess_text)
article_df['description_cleaned'] = article_df['description'].progress_apply(preprocess_text)
article_df['content_cleaned'] = article_df['content'].progress_apply(preprocess_text)

display(article_df.head())

100%|██████████| 140507/140507 [00:05<00:00, 26513.27it/s]
100%|██████████| 140507/140507 [00:06<00:00, 21468.25it/s]
100%|██████████| 140507/140507 [03:23<00:00, 689.55it/s]


Unnamed: 0,title,published_date,authors,description,section,content,link,top_image,title_cleaned,description_cleaned,content_cleaned
0,Anthony Martial: Man Utd boss Jose Mourinho te...,2017-01-01,,Manchester United forward Anthony Martial shou...,,Last updated on From the section Football Man...,http://www.bbc.co.uk/sport/football/38480502,https://ichef.bbci.co.uk/onesport/cps/624/cpsp...,anthony man utd bos jose mourinho tell forward...,manchester united forward anthony martial list...,last updated section football manchester unite...
1,Wayde van Niekerk relives Rio Olympics 400m go...,2017-01-01,,South Africas Wayde van Niekerk relives his hi...,,South Africas Wayde van Niekerk relives his hi...,http://www.bbc.co.uk/sport/athletics/38418992,https://ichef.bbci.co.uk/news/1024/cpsprodpb/0...,wayde van niekerk relives rio olympics gold bb...,south africa wayde van niekerk relives histori...,south africa wayde van niekerk relives histori...
2,"Roger Federer can win another Grand Slam, says...",2017-01-01,,Roger Federer can return from a sixmonth injur...,,Roger Federer can return from six months out a...,http://www.bbc.co.uk/sport/tennis/38390891,https://ichef.bbci.co.uk/onesport/cps/624/cpsp...,roger federer win another grand say former coa...,roger federer return sixmonth injury absence w...,roger federer return six month win another gra...
3,TV and radio stars we lost in 2016 - BBC News,2017-01-01,https://www.facebook.com/bbcnews,A look back at some of the faces and voices fr...,Entertainment & Arts,Magician Paul Daniels died in March aged 77 af...,http://www.bbc.co.uk/news/entertainment-arts-3...,https://ichef.bbci.co.uk/news/976/cpsprodpb/B8...,tv radio star lost bbc news,look back face voice tv radio lost,magician paul daniel died march aged diagnosed...
4,New Year Honours 2017: Andy Murray 'honoured' ...,2017-01-01,,Tennis star Sir Andy Murray says he still feel...,,Tennis star Sir Andy Murray said he still feel...,http://www.bbc.co.uk/news/uk-38481089,https://ichef.bbci.co.uk/news/1024/branded_new...,new year honour andy murray knighthood bbc news,tennis star sir andy murray say still feel lik...,tennis star sir andy murray said still feel li...


In [51]:
print(f"{len(article_df['section'].unique())} categories found")
display(sorted(article_df["section"].unique()))

273 categories found


['Aerospace & Defence',
 'Africa',
 'Africa Sport',
 'American Football',
 'Arsenal',
 'Asia',
 'Asia-Pacific',
 'Aston Villa',
 'Athletics',
 'Australia',
 'BBC InDepth',
 'BBC News Services',
 'BBC Trending',
 'BBC Verify',
 'Badminton',
 'Baseball',
 'Basketball',
 'Bath',
 'Beds, Herts & Bucks',
 'Berkshire',
 'Birmingham & Black Country',
 'Blackpool',
 'Bolton',
 'Bournemouth',
 'Boxing',
 'Bradford',
 'Brentford',
 'Brexit',
 'Brighton',
 'Bristol',
 'British & Irish Lions',
 'Business',
 'CBBC Newsround',
 'Cambridgeshire',
 'Canoeing',
 'Cardiff',
 'Celtic',
 'Champions League',
 'Championship',
 'Chelsea',
 'China',
 'China blog',
 'Climate',
 'Commonwealth Games',
 'Companies',
 'Cornwall',
 'Counties',
 'County Cricket',
 'Coventry & Warwickshire',
 'Cricket',
 'Croatia',
 'Culture',
 'Cumbria',
 'Cycling',
 'Darts',
 'Derby',
 'Devon',
 'Disability',
 'Disability Sport',
 'Diving',
 'Dorset',
 'Dragons',
 'EFL Cup',
 'EU Referendum',
 'Edinburgh, Fife & East',
 'Edinburgh,

### Articles filtering

In [52]:
tech_categories = [
    'BBC Trending',
    'BBC Verify',
    'Business',
    'Companies',
    'Climate',
    'Education & Family',
    'Features',
    'Health',
    'Inside Europe Blog',
    'Long Reads',
    'Reality Check',
    'Science & Environment',
    'Sustainability',
    'Technology',
    'UK Politics',
    'US & Canada',
    'World',
    'Your Money'
]

article_df["section"] = article_df["section"].progress_apply(lambda x: x if x in tech_categories else "nan")
article_df = article_df[article_df["section"] != "nan"]
display(article_df)

100%|██████████| 140507/140507 [00:00<00:00, 920271.64it/s]


Unnamed: 0,title,published_date,authors,description,section,content,link,top_image,title_cleaned,description_cleaned,content_cleaned
14,How a dead gorilla became the meme of 2016 - B...,2017-01-01,https://www.facebook.com/bbcnews,After Harambe was shot in a sad incident in Ci...,BBC Trending,His was the face which launched a thousand mem...,http://www.bbc.co.uk/news/blogs-trending-38383126,https://ichef.bbci.co.uk/news/976/cpsprodpb/14...,dead gorilla became meme bbc news,harambe shot sad incident cincinnati lived mil...,face launched thousand meme harambe gorilla ca...
16,'Hollywood' sign changed to 'Hollyweed' in new...,2017-01-01,https://www.facebook.com/bbcnews,A prankster changes the worldfamous Hollywood ...,US & Canada,The iconic sign was changed overnight on New Y...,http://www.bbc.co.uk/news/world-us-canada-3848...,https://ichef.bbci.co.uk/news/976/cpsprodpb/89...,sign changed new year prank bbc news,prankster change worldfamous hollywood sign re...,iconic sign changed overnight new year eve res...
17,Billions of pounds that you fail to claim - BB...,2017-01-01,https://www.facebook.com/bbcnews,Huge payouts from benefits and compensation t...,Business,A bumper lottery draw was organised following ...,http://www.bbc.co.uk/news/business-38318435,https://ichef.bbci.co.uk/news/976/cpsprodpb/A4...,billion pound fail claim bbc news,huge payouts benefit compensation pension lott...,bumper lottery draw organised following team g...
36,The psychological secrets to successful resolu...,2017-01-01,https://www.facebook.com/bbcnews,There are psychological tricks which can help ...,Health,Its important to have achievable goals After ...,http://www.bbc.co.uk/news/health-38470263,https://ichef.bbci.co.uk/news/976/cpsprodpb/83...,psychological secret successful resolution bbc...,psychological trick help people achieve stick ...,important achievable goal excess festive seaso...
47,What next for Paul Nuttall's UKIP? - BBC News,2017-01-02,https://www.facebook.com/bbcnews,After a tumultuous 2016 and a Brexit victory w...,UK Politics,Paul Nuttall left is hoping to push Labour har...,http://www.bbc.co.uk/news/uk-politics-38408815,https://ichef.bbci.co.uk/news/976/cpsprodpb/6B...,next paul bbc news,tumultuous brexit victory ukips challenge year...,paul nuttall left hoping push labour hard ukip...
...,...,...,...,...,...,...,...,...,...,...,...
140460,We took on Google and forced them to pay out £...,2024-10-27,https://www.facebook.com/bbcnews,Shivaun Raff and her husband Adam describe the...,Technology,The couple who took on Google and cost the tec...,http://www.bbc.co.uk/news/articles/cjr431lr72jo,https://ichef.bbci.co.uk/ace/standard/3840/cps...,took google forced pay bbc news,shivaun raff husband adam describe long court ...,couple took google cost tech giant shivaun raf...
140461,Katty Kay: What’s really behind America’s men ...,2024-10-27,https://www.facebook.com/bbcnews,With polls showing a stark gender divide the r...,US & Canada,Donald Trump enjoys a huge lead among men whil...,http://www.bbc.co.uk/news/articles/cjr430gry81o,https://ichef.bbci.co.uk/ace/standard/976/cpsp...,katty really behind men v woman election bbc news,poll showing stark gender divide race shaping ...,donald trump enjoys huge lead among men woman ...
140465,No new freeports to come in Budget after 'comm...,2024-10-27,https://www.facebook.com/bbcnews,The government mistakenly briefed that five ne...,Business,No new freeports in Budget after comms cockup ...,http://www.bbc.co.uk/news/articles/c74887vj8p4o,https://ichef.bbci.co.uk/ace/standard/820/cpsp...,new freeports come budget bbc news,government mistakenly briefed five new lowtax ...,new freeports budget comms cockup chancellor u...
140473,Child Trust Fund: 'My £250 investment is now w...,2024-10-27,https://www.facebook.com/bbcnews,Max Prince was shocked to discover that surpri...,Business,My £250 Child Trust Fund is now worth only £12...,http://www.bbc.co.uk/news/articles/ckg7j83drd5o,https://ichef.bbci.co.uk/ace/standard/3840/cps...,child trust investment worth bbc news,max prince shocked discover surprise fee left ...,child trust fund worth max shortly child trust...


### Save preprocessed data

In [53]:
columns = ["published_date","section",'title_cleaned',"description_cleaned","content_cleaned"]
processed_data = article_df[columns]
processed_data.columns = ["date","section","title","description","content"]
processed_data["date"] = pd.to_datetime(processed_data["date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data["date"] = pd.to_datetime(processed_data["date"])


In [54]:
display(processed_data.head())

Unnamed: 0,date,section,title,description,content
14,2017-01-01,BBC Trending,dead gorilla became meme bbc news,harambe shot sad incident cincinnati lived mil...,face launched thousand meme harambe gorilla ca...
16,2017-01-01,US & Canada,sign changed new year prank bbc news,prankster change worldfamous hollywood sign re...,iconic sign changed overnight new year eve res...
17,2017-01-01,Business,billion pound fail claim bbc news,huge payouts benefit compensation pension lott...,bumper lottery draw organised following team g...
36,2017-01-01,Health,psychological secret successful resolution bbc...,psychological trick help people achieve stick ...,important achievable goal excess festive seaso...
47,2017-01-02,UK Politics,next paul bbc news,tumultuous brexit victory ukips challenge year...,paul nuttall left hoping push labour hard ukip...


In [None]:
processed_data.to_csv("bbc_preprocessed.csv", index=False)

: 