In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

In [2]:

# --- 1. Load Cleaned Data ---

# Column names referenced by the later cells
TEXT_COL, TITLE_COL, CATEGORY_COL = 'text', 'title', 'category'
FAKE_COL = 'label'  # Already 0 or 1

# **NOTE:** Please update this name to match the second file, the one that contains 'title'.
# NOTE(review): the space before '.csv' is part of the name as written here; the recorded
# run loaded the file successfully with it, so keep it unless the file on disk is renamed.
CLEANED_FILE_NAME = 'fake_news_dataset .csv'

# The file is looked up one directory above the working directory
# (i.e. the main project directory when the notebook runs from 'notebooks/').
FILE_PATH = os.path.join(os.pardir, CLEANED_FILE_NAME)

In [3]:
# --- 2. Load and Prepare Data ---
# Read the CSV at FILE_PATH into `df` and report how many rows were loaded.
#
# If the file is missing, print a hint and re-raise the FileNotFoundError so that
# execution stops at this cell with a visible traceback. The previous handler called
# exit(), which is meant for the interactive shell: it ends the whole session (and
# reports a success status when run as a script) instead of surfacing the failure.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {FILE_PATH}. Please check the file name/path.")
    raise
else:
    # Only reached when read_csv succeeded, so `df` is guaranteed to exist here.
    print(f"File loaded successfully. Initial Rows: {len(df)}")

File loaded successfully. Initial Rows: 20000


In [4]:
# --- 3. Feature Encoding (THIS IS THE CRUCIAL STEP) ---

# A. Convert the multi-class category column into integer codes.
# Values are cast to str before fitting; the fitted encoder stays available as
# `le_category` so its learned classes can be inspected afterwards.
le_category = LabelEncoder()

# This assignment is what creates the 'category_encoded' column on `df`.
df['category_encoded'] = le_category.fit_transform(
    df[CATEGORY_COL].astype(str)
)

# Report completion and how many distinct classes the encoder learned.
print(
    "Category Encoding Complete. 'category_encoded' column created.",
    f"Number of unique categories encoded: {len(le_category.classes_)}",
    sep="\n",
)

Category Encoding Complete. 'category_encoded' column created.
Number of unique categories encoded: 7


In [5]:
# --- 4. Data Splitting (Train: 65% / Validation: 15% / Test: 20%) ---
# Two chained splits, each stratified on the FAKE_COL label and seeded with 42
# so that re-running the cell reproduces the same partition.

# 4.1. First cut: hold out 20% of all rows as the test set;
#      the remaining 80% is carried into step 4.2.
train_val_df, test_df = train_test_split(
    df, stratify=df[FAKE_COL], test_size=0.20, random_state=42
)

# 4.2. Second cut: carve the validation set out of the remaining 80%.
#      15% of the full data expressed as a share of that 80%: 15% / 80% = 0.1875
VAL_RATIO_OF_TRAIN_VAL = 0.1875

train_df, val_df = train_test_split(
    train_val_df,
    stratify=train_val_df[FAKE_COL],
    test_size=VAL_RATIO_OF_TRAIN_VAL,
    random_state=42,
)

In [6]:
# --- 5. Save the Final Numerical DataFrames ---
# Write the train/validation/test splits as CSV files under 'processed_data/'
# and print a short summary of the split sizes.

# Create the 'processed_data/' folder to maintain structure.
# exist_ok=True makes this a no-op when the folder is already there, replacing the
# separate exists-check so re-running the cell is safe without a check-then-create gap.
processed_folder = '../processed_data/'
os.makedirs(processed_folder, exist_ok=True)

# Columns to save in the final processed files
# ('category_encoded' is the column added by the encoding step).
COLUMNS_TO_SAVE = [TITLE_COL, TEXT_COL, FAKE_COL, 'category_encoded']

# One loop instead of three copy-pasted calls: every split is written with the
# same column subset, without the index, as UTF-8.
for split_df, file_name in (
    (train_df, 'train_data_final.csv'),
    (val_df, 'val_data_final.csv'),
    (test_df, 'test_data_final.csv'),
):
    split_df[COLUMNS_TO_SAVE].to_csv(
        os.path.join(processed_folder, file_name), index=False, encoding='utf-8'
    )

print("\n--- DATA SPLITTING SUMMARY ---")
print(f"Total Rows: {len(df)}")
print(f"Training Set (65%): {len(train_df)} rows")
print(f"Validation Set (15%): {len(val_df)} rows")
print(f"Testing Set (20%): {len(test_df)} rows")
print("All final splits saved successfully to 'processed_data/' folder.")


--- DATA SPLITTING SUMMARY ---
Total Rows: 20000
Training Set (65%): 13000 rows
Validation Set (15%): 3000 rows
Testing Set (20%): 4000 rows
All final splits saved successfully to 'processed_data/' folder.
