In [3]:
import html
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the required NLTK data first: stopwords.words() reads the corpus
# immediately and raises LookupError when the data has not been downloaded yet.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize stopwords and lemmatizer (only after the data is available)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read the raw news CSV from the local data folder and preview the first rows
df = pd.read_csv(filepath_or_buffer="../data/News_Aggregator_Kitwe Data Collection.csv")
df.head(n=5)

Unnamed: 0,Source,Category,Headline,Link,Description,Date,Author
0,Daily Mail Zambia,"Headlines, News",K8 million CBU hall building starts,https://www.daily-mail.co.zm/2024/10/15/k8-mil...,MWILA NTAMBI Kitwe COPPERBELT University (CBU)...,"Tue, 15 Oct 2024 06:41:52 +0000",Website Editor
1,Daily Mail Zambia,"Headlines, News","Exercise aimed at recruiting 300,000 members i...",https://www.daily-mail.co.zm/2024/10/15/exerci...,MELODY MUPETA Kitwe THE United Party for Natio...,"Tue, 15 Oct 2024 06:17:07 +0000",Website Editor
2,Daily Mail Zambia,"Headlines, News",K8 million CBU hall building starts,https://www.daily-mail.co.zm/2024/10/15/k8-mil...,MWILA NTAMBI Kitwe COPPERBELT University (CBU)...,"Tue, 15 Oct 2024 06:41:52 +0000",Website Editor
3,Daily Mail Zambia,"Headlines, News","Exercise aimed at recruiting 300,000 members i...",https://www.daily-mail.co.zm/2024/10/15/exerci...,MELODY MUPETA Kitwe THE United Party for Natio...,"Tue, 15 Oct 2024 06:17:07 +0000",Website Editor
4,Daily Mail Zambia,"Headlines, News",K8 million CBU hall building starts,https://www.daily-mail.co.zm/2024/10/15/k8-mil...,MWILA NTAMBI Kitwe COPPERBELT University (CBU)...,"Tue, 15 Oct 2024 06:41:52 +0000",Website Editor


# 1. Data cleaning

In [5]:
# Summarize the raw frame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14344 entries, 0 to 14343
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Source       14344 non-null  object
 1   Category     13870 non-null  object
 2   Headline     14343 non-null  object
 3   Link         14344 non-null  object
 4   Description  14330 non-null  object
 5   Date         14344 non-null  object
 6   Author       14343 non-null  object
dtypes: object(7)
memory usage: 784.6+ KB


## 1.1 Remove Duplicates

In [6]:
# Drop rows that are exact duplicates across every column, then report both row counts
df_cleaned = df.drop_duplicates()
print(
    f"original data had {df.shape[0]} rows.",
    f"cleaned data has {df_cleaned.shape[0]} rows.",
    sep="\n",
)

original data had 14344 rows.
cleaned data has 12346 rows.


In [7]:
# Re-inspect dtypes and non-null counts on the de-duplicated frame
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12346 entries, 0 to 14343
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Source       12346 non-null  object
 1   Category     11872 non-null  object
 2   Headline     12345 non-null  object
 3   Link         12346 non-null  object
 4   Description  12332 non-null  object
 5   Date         12346 non-null  object
 6   Author       12345 non-null  object
dtypes: object(7)
memory usage: 771.6+ KB


## 1.2 Handle missing values

In [8]:
# Count the missing values in each column
print(df_cleaned.isnull().sum())

Source           0
Category       474
Headline         1
Link             0
Description     14
Date             0
Author           1
dtype: int64


In [9]:
# Keep only the rows in which every column is populated, then report the remaining row count
df_cleaned = df_cleaned.dropna(axis=0, how='any')
print(f"cleaned data has {df_cleaned.shape[0]} rows.")

cleaned data has 11858 rows.


In [10]:
# Confirm that no missing values are left after the dropna() step
df_cleaned.isnull().sum()

Source         0
Category       0
Headline       0
Link           0
Description    0
Date           0
Author         0
dtype: int64

## 1.3 Convert the date to Datetime Format

In [11]:
# Inspect the dtype of every column before converting 'Date'
df_cleaned.dtypes

Source         object
Category       object
Headline       object
Link           object
Description    object
Date           object
Author         object
dtype: object

In [12]:
# Parse the 'Date' strings into timezone-aware timestamps.
# utc=True normalizes every value to UTC, so rows that carry different UTC
# offsets still produce one datetime64[ns, UTC] column instead of raising or
# falling back to object dtype.
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], utc=True)

In [13]:
# Verify the column dtypes after the 'Date' conversion
df_cleaned.dtypes

Source                      object
Category                    object
Headline                    object
Link                        object
Description                 object
Date           datetime64[ns, UTC]
Author                      object
dtype: object

# 2. Data preprocessing

In [15]:
 # Download the 'punkt_tab' resource, presumably needed by word_tokenize in the
 # preprocessing cell below -- TODO confirm against the installed NLTK version.
 # NOTE(review): nltk is already imported in the first cell, so this import is redundant.
 import nltk
 nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [30]:
def preprocess_text(text):
    """Clean one raw description and return its lemmatized tokens.

    Steps, in order: strip HTML tags, decode HTML entities, remove URLs,
    remove punctuation, lowercase, tokenize, then drop English stop words
    and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup, entities and links.

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens with stop words removed. An empty
        list is returned when ``text`` is not a string (e.g. NaN).
    """
    # Missing values reach .apply() as float NaN; treat them as "no tokens"
    # instead of letting re.sub raise TypeError.
    if not isinstance(text, str):
        return []
    # Remove HTML tags FIRST. If URLs are removed first, r'http\S+' also eats
    # the '">Word' that follows an href value, which breaks the tag and makes
    # the tag pattern delete the visible link text as well.
    # A space is substituted so words separated only by a tag do not merge.
    text = re.sub(r'<.*?>', ' ', text)
    # Decode entities such as '&#8230;' or '&amp;' so they become real
    # characters (removed as punctuation below) rather than tokens like '8230'.
    text = html.unescape(text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Lowercase text
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize the remaining tokens
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return cleaned_tokens

In [31]:
# Tokenize and clean every description, storing the token lists in a new column
df_cleaned['Description_cl'] = df_cleaned['Description'].map(preprocess_text)

In [32]:
# Compare the raw and the preprocessed description for one sample row (index label 14341)
sample_row = df_cleaned.loc[14341]
print(f"original text\n{sample_row['Description']}")
print(f"preprocessed text:\n{sample_row['Description_cl']}")

original text
<p>By Michael Kaluba Government has directed Zambia Police Traffic Officers to ensure that no trucks are allowed to park at undesignated spots on the road while truck parks remain empty, including those currently parked along the road towards Kasumbalesa Border. According to Phoenix, transport and Logistics Minister Frank Tayali said even trucks that are currently [&#8230;]</p>
<p>The post <a href="https://zambiareports.news/2022/06/08/traffic-police-ordered-to-rid-roads-of-insanity/">TRAFFIC POLICE ORDERED TO RID ROADS OF INSANITY</a> appeared first on <a href="https://zambiareports.news">Zambia Reports</a>.</p>
preprocessed text:
['michael', 'kaluba', 'government', 'directed', 'zambia', 'police', 'traffic', 'officer', 'ensure', 'truck', 'allowed', 'park', 'undesignated', 'spot', 'road', 'truck', 'park', 'remain', 'empty', 'including', 'currently', 'parked', 'along', 'road', 'towards', 'kasumbalesa', 'border', 'according', 'phoenix', 'transport', 'logistics', 'minister',

In [35]:
# Persist the cleaned frame as a CSV file, leaving the index out of the output
df_cleaned.to_csv(path_or_buf='../data/cleaned_data.csv', index=False)