In [79]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download the required NLTK data BEFORE it is used: stopwords.words() raises
# LookupError on a machine where the 'stopwords' corpus is not installed yet.
# 'punkt_tab' is included because this notebook also fetches it before tokenizing.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer (shared by the preprocessing function)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [80]:
# Read the raw dataset from a local CSV file (path relative to the working directory)
dataset_path = "../data/Final_dataset2.csv"
df = pd.read_csv(dataset_path)
# Preview the first rows
df.head()

Unnamed: 0,Source,Headlines,Category,Link,Date,Author,Target_final,description
0,daily mail zambia,million cbu hall building start,education,https://www.daily-mail.co.zm/2024/10/15/k8-mil...,2024-10-15 06:41:52+00:00,Website Editor,0,mwila ntambi kitwe copperbelt university cbu k...
1,daily mail zambia,exercise aim recruit member province,politics,https://www.daily-mail.co.zm/2024/10/15/exerci...,2024-10-15 06:17:07+00:00,Website Editor,0,melody mupeta kitwe united party national deve...
2,flava fm,charcoal trader chimwemwe township appeal gove...,local news,https://flavaradioandtv.com/charcoal-traders-i...,2024-05-13 08:00:53+00:00,Newsroom,1,group charcoal trader cmml area kitwe chimwemw...
3,flava fm,ecologist warn urbanisation dire impact biodiv...,development,https://flavaradioandtv.com/ecologist-warns-of...,2024-03-26 08:33:05+00:00,Newsroom,1,pecologist mutende simwanza raise alarm detrim...
4,flava fm,copperbelt minister urge mopani settle debt ki...,politics,https://flavaradioandtv.com/copperbelt-ministe...,2024-03-14 10:18:04+00:00,Newsroom,1,pcopperbelt minister elisha matambo issue mopa...


# 1. Data cleaning

In [81]:
# Display column data types, non-null counts and memory usage of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Source        10811 non-null  object
 1   Headlines     10811 non-null  object
 2   Category      10811 non-null  object
 3   Link          10811 non-null  object
 4   Date          10811 non-null  object
 5   Author        10811 non-null  object
 6   Target_final  10811 non-null  int64 
 7   description   10811 non-null  object
dtypes: int64(1), object(7)
memory usage: 675.8+ KB


## 1.1 Remove Duplicates

In [82]:
# Remove rows that are duplicated across all columns.
# .copy() makes df_cleaned an independent frame: later cells add/overwrite columns
# on it, which otherwise triggers SettingWithCopyWarning once rows are actually dropped.
df_cleaned = df.drop_duplicates().copy()
print(f"original data had {df.shape[0]} rows.")
print(f"cleaned data has {df_cleaned.shape[0]} rows.")

original data had 10811 rows.
cleaned data has 10811 rows.


In [83]:
# Inspect the de-duplicated frame: column dtypes and non-null counts
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10811 entries, 0 to 10810
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Source        10811 non-null  object
 1   Headlines     10811 non-null  object
 2   Category      10811 non-null  object
 3   Link          10811 non-null  object
 4   Date          10811 non-null  object
 5   Author        10811 non-null  object
 6   Target_final  10811 non-null  int64 
 7   description   10811 non-null  object
dtypes: int64(1), object(7)
memory usage: 675.8+ KB


## 1.2 Handle missing values

In [84]:
# Check for missing values: number of nulls per column
print(df_cleaned.isnull().sum())

Source          0
Headlines       0
Category        0
Link            0
Date            0
Author          0
Target_final    0
description     0
dtype: int64


In [85]:
# No missing values were found above, so dropping rows is not needed; the code is kept disabled for reference
#df_cleaned=df_cleaned.dropna()
#print(f"cleaned data has {df_cleaned.shape[0]} rows.")

In [86]:
# Re-check the per-column null counts (same check as the earlier missing-value cell)
df_cleaned.isnull().sum()

Source          0
Headlines       0
Category        0
Link            0
Date            0
Author          0
Target_final    0
description     0
dtype: int64

## 1.3 Convert the date to Datetime Format

In [87]:
# Check the column dtypes; 'Date' is still a plain object (string) column at this point
df_cleaned.dtypes

Source          object
Headlines       object
Category        object
Link            object
Date            object
Author          object
Target_final     int64
description     object
dtype: object

In [88]:
# Parse the 'Date' strings into pandas datetime values and store them back
parsed_dates = pd.to_datetime(df_cleaned['Date'])
df_cleaned['Date'] = parsed_dates

In [89]:
# Confirm that 'Date' now has a datetime dtype
df_cleaned.dtypes

Source                       object
Headlines                    object
Category                     object
Link                         object
Date            datetime64[ns, UTC]
Author                       object
Target_final                  int64
description                  object
dtype: object

# 2. Data preprocessing

In [90]:
 # Download the 'punkt_tab' tokenizer data before word_tokenize is first used below.
 # NOTE(review): presumably required by word_tokenize in recent NLTK releases — confirm.
 import nltk
 nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Familie
[nltk_data]     fatah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [91]:
def preprocess_text(text):
    """Clean a raw text string and return its lemmatized, stop-word-free tokens.

    Steps: strip HTML tags, strip URLs, drop punctuation, lowercase, tokenize
    with ``word_tokenize``, remove English stop words and lemmatize the rest.
    Relies on the notebook-level ``stop_words`` set and ``lemmatizer`` object.

    Parameters
    ----------
    text : str
        Raw text. Missing or non-string values (e.g. ``None`` or ``NaN``) are
        treated as empty text so the function can be applied to a column with gaps.

    Returns
    -------
    list of str
        Cleaned tokens in their original order; ``[]`` when nothing is left.
    """
    # missing / non-text cells would make re.sub raise TypeError
    if not isinstance(text, str):
        return []
    # remove HTML tags first: a URL inside an attribute (href="http...") would
    # otherwise swallow the closing '>' together with the text that follows it.
    # Tags are replaced by a space so the words around them are not glued together.
    text = re.sub(r'<.*?>', ' ', text)
    # remove URLs (scheme matched case-insensitively, e.g. "HTTP://...")
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # lowercase text
    text = text.lower().strip()
    # nothing left to tokenize
    if not text:
        return []
    # tokenize text
    tokens = word_tokenize(text)
    # remove stop words and lemmatize tokens
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return cleaned_tokens

In [92]:
# Tokenize every description and keep the resulting token lists in a new column
description_tokens = df_cleaned['description'].apply(preprocess_text)
df_cleaned['description_cl'] = description_tokens

In [93]:
# Display the original (unprocessed) description column for comparison
df_cleaned['description']

0        mwila ntambi kitwe copperbelt university cbu k...
1        melody mupeta kitwe united party national deve...
2        group charcoal trader cmml area kitwe chimwemw...
3        pecologist mutende simwanza raise alarm detrim...
4        pcopperbelt minister elisha matambo issue mopa...
                               ...                        
10806    ppolice kitwe hold year man sexually abuse dau...
10807    ppresident hakainde hichilema direct copperbel...
10808    pby michael kaluba government direct zambia po...
10809    drug enforcement commission dec arrest people ...
10810    pkitwe central police officer family late rich...
Name: description, Length: 10811, dtype: object

In [94]:
# Show the first description before and after preprocessing
first_original = df_cleaned['description'].iloc[0]
first_tokens = df_cleaned['description_cl'].iloc[0]
print(f"original text\n{first_original}")
print(f"preprocessed text:\n{first_tokens}")

original text
mwila ntambi kitwe copperbelt university cbu kitwe launch construction auditorium million cbu vicechancellor imasiku nyambe building auditorium milestone enhance infrastructure learn environment high learning institution say finished auditorium host academic conference
preprocessed text:
['mwila', 'ntambi', 'kitwe', 'copperbelt', 'university', 'cbu', 'kitwe', 'launch', 'construction', 'auditorium', 'million', 'cbu', 'vicechancellor', 'imasiku', 'nyambe', 'building', 'auditorium', 'milestone', 'enhance', 'infrastructure', 'learn', 'environment', 'high', 'learning', 'institution', 'say', 'finished', 'auditorium', 'host', 'academic', 'conference']


In [95]:
# Save the cleaned data as a CSV file (the row index is not written).
# Note: 'description_cl' holds Python lists, which CSV stores as their text
# representation, so they have to be parsed again after reading the file back.
df_cleaned.to_csv('../data/cleaned_data.csv', index=False)