In [24]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Read the raw CSV (path is relative to the notebook's working directory)
# into the working DataFrame `df` used by every later cell.
df = pd.read_csv(filepath_or_buffer='../data/fake_news_data.csv')

In [25]:
# Data Preprocessing
# Step 1: normalise case so later comparisons on 'title' are case-insensitive.
title_series = df['title']
df['title'] = title_series.str.lower()

In [26]:
# 2. Remove punctuation from the title
# The pattern deletes every character that is neither a word character (\w)
# nor whitespace (\s). It is written as a raw string so the backslashes reach
# the regex engine untouched; a plain '...' literal containing \w / \s is an
# invalid escape sequence and triggers a SyntaxWarning on recent Python.
# NOTE: '_' is a word character for \w, so underscores survive this step.
df['title'] = df['title'].str.replace(r'[^\w\s]', '', regex=True)


In [27]:
# 3. Remove numbers from the title
# Each run of one or more digits is deleted. The pattern is a raw string so
# that '\d' is passed to the regex engine as-is instead of being parsed as an
# (invalid) string escape, which emits a SyntaxWarning on recent Python.
# NOTE: whitespace around a removed number is left in place, so e.g.
# "top 10 tips" becomes "top  tips" (two spaces).
df['title'] = df['title'].str.replace(r'\d+', '', regex=True)

In [28]:
# Step 4: replace missing titles (NaN) with an empty string so that no NaN
# remains in the column; these rows are filled here, not dropped.
titles_filled = df['title'].fillna(value='')
df['title'] = titles_filled

In [29]:
# Step 5: de-duplicate on the cleaned title, keeping the first occurrence of
# each distinct value (rows whose title is '' therefore collapse to one row).
df = df.drop_duplicates(subset=['title'], keep='first')

# Sanity check: count rows still flagged as duplicates of an earlier title.
remaining_dupes = df.duplicated(subset=['title']).sum()
print(f"\nNumber of remaining duplicates in 'title': {remaining_dupes}")


Number of remaining duplicates in 'title': 0


In [30]:
# Step 6 (optional): discard rows whose title is empty or whitespace-only
# once surrounding whitespace is stripped.
stripped_titles = df['title'].str.strip()
has_text = stripped_titles != ''
df = df.loc[has_text]
print(f"\nNumber of rows after removing empty titles: {df.shape[0]}")


Number of rows after removing empty titles: 12237


In [31]:
# Persist the cleaned frame as CSV; index=False keeps the row index out of
# the written file.
df.to_csv(path_or_buf='../data/processed_data.csv', index=False)

In [32]:
# Display processed data information
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would append a stray "None"
# line after the report.
print("\nProcessed Data Info:")
df.info()


Processed Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 12237 entries, 0 to 15336
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   article_id     12237 non-null  int64 
 1   title          12237 non-null  object
 2   description    12210 non-null  object
 3   news_url       12060 non-null  object
 4   source_domain  12060 non-null  object
 5   real           12237 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 669.2+ KB
None


In [33]:
# Final status line telling the reader where the cleaned data was written.
completion_message = "Preprocessing completed and saved as 'processed_data.csv' inside /data folder."
print(completion_message)

Preprocessing completed and saved as 'processed_data.csv' inside /data folder.
