In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("../data/fake_news.csv")

print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray trailing "None".
df.info()


                                               title  \
0  Iraq parliament panel asks government to 'reci...   
1  Obama to press Trump to preserve Cuba detente:...   
2  Joy and relief greet Puerto Rico fuel deliveri...   
3  WATCH: BLACK SPORTS HOST BLASTS White ESPN Spo...   
4  MINORITIES TURN ON OBAMA…BLAST HIS “LEGACY”: “...   

                                                text  label  
0  BAGHDAD (Reuters) - Iraq’s foreign affairs com...      1  
1  WASHINGTON (Reuters) - President Barack Obama ...      1  
2  QUEBRADILLAS, Puerto Rico (Reuters) - When Nes...      1  
3  When will Americans stop being afraid of stand...      0  
4   Barry, the Democratic party has done nothing ...      0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   44898 non-null  object
 1   text    44898 non-null  object
 2   label   44898 non-null  int64 
dtypes: int64(1), object(2)

In [3]:

# Report per-column null counts. The heading is printed on its own so the
# Series repr starts at column 0 (passing both to one print() inserts a
# separator space that misaligns the first row).
print("\nMissing Values:")
print(df.isnull().sum())

# Drop every row that has a null in any column
df = df.dropna()



Missing Values:
 title    0
text     0
label    0
dtype: int64


In [None]:

# Cast both text fields to str before concatenating them
df = df.assign(
    title=df["title"].astype(str),
    text=df["text"].astype(str),
)

# Join headline and body into a single field, separated by one space
df = df.assign(content=df["title"] + " " + df["text"])

# Keep only rows whose combined content is longer than 50 characters,
# and only the two columns needed from here on
long_enough = df["content"].str.len() > 50
df = df.loc[long_enough, ["content", "label"]]

# The combined field takes over the name "text"
df = df.rename(columns={"content": "text"})


In [5]:

# Count how many rows carry each label value
label_counts = df["label"].value_counts()

print("\nClass Distribution:")
print(label_counts)



Class Distribution:
label
0    23473
1    21416
Name: count, dtype: int64


In [6]:

# Shuffle
SEED = 42        # fixed seed so the shuffle and the split are reproducible
TEST_SIZE = 0.2  # fraction of rows held out for testing

df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Train-test split; stratify keeps the label proportions equal in both parts.
# train_test_split is imported with the other imports at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=df["label"],
)

print("\nTraining samples:", len(X_train))
print("Testing samples:", len(X_test))



Training samples: 35911
Testing samples: 8978


In [7]:

# Re-assemble features and labels into one two-column frame per split
train_df = X_train.to_frame(name="text").assign(label=y_train)
test_df = X_test.to_frame(name="text").assign(label=y_test)

# Write both splits into ../data without the index column
for split_df, out_path in (
    (train_df, "../data/train.csv"),
    (test_df, "../data/test.csv"),
):
    split_df.to_csv(out_path, index=False)

print("\n Preprocessing completed and files saved.")



 Preprocessing completed and files saved.
