# ESA Project: Fake or Real: The Impostor Hunt in Texts

This notebook is dedicated to **data preparation**.  
It covers:

- Loading the raw metadata (`train.csv`) and text files for each article.  
- Cleaning and organizing the dataset.  
- Splitting the data by article `id` into training and validation sets (80/20, stratified on `real_text_id`).  
- Saving the processed datasets for downstream modeling.

## Import libraries

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import sys
import os

# Add the src folder to Python path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import config

## Data Preparation

In [2]:
# Raw competition inputs, located through the shared project config
# (presumably config.RAW_DATA_DIR points at ../data/raw; confirm in src/config.py).
TRAIN_FOLDER = config.RAW_DATA_DIR.joinpath("train")    # per-article sub-folders with the text files
TRAIN_CSV = config.RAW_DATA_DIR.joinpath("train.csv")   # metadata table, one row per article

# Metadata only at this point; the article texts are attached further down.
train_csv = pd.read_csv(TRAIN_CSV)

def load_text_files(article_id, base_dir=None):
    """Read the two candidate texts that belong to one article.

    Parameters
    ----------
    article_id : int or int-like
        Value from the ``id`` column. It is cast with ``int()`` and zero-padded
        to four digits to build the folder name (e.g. ``7`` -> ``article_0007``).
    base_dir : path-like, optional
        Directory that contains the ``article_XXXX`` sub-folders. Defaults to
        the module-level ``TRAIN_FOLDER``, so existing one-argument calls
        (including ``Series.apply(load_text_files)``) behave exactly as before.
        Pass another folder to reuse the loader on data laid out the same way.

    Returns
    -------
    tuple of (str, str)
        Contents of ``file_1.txt`` and ``file_2.txt``, decoded as UTF-8.

    Raises
    ------
    FileNotFoundError
        If the article folder or one of the two text files does not exist.
    """
    # Resolve the default at call time rather than in the signature, so the
    # function always uses the current value of TRAIN_FOLDER.
    root = TRAIN_FOLDER if base_dir is None else Path(base_dir)
    article_folder = root / f"article_{int(article_id):04d}"
    file1, file2 = (
        (article_folder / name).read_text(encoding="utf-8")
        for name in ("file_1.txt", "file_2.txt")
    )
    return file1, file2

# Read both candidate texts for every article and attach them as two new columns.
text_pairs = train_csv['id'].apply(load_text_files)
file1_texts, file2_texts = zip(*text_pairs)
train_csv['file1_text'] = file1_texts
train_csv['file2_text'] = file2_texts

# Two articles ship with an empty text: file1 at index 14 and file2 at index 10
# (double checked on the Kaggle website). Keep only rows where both texts are non-blank.
file1_has_text = train_csv['file1_text'].str.strip().ne('')
file2_has_text = train_csv['file2_text'].str.strip().ne('')
train_csv = train_csv.loc[file1_has_text & file2_has_text].reset_index(drop=True)

# 80/20 hold-out split; stratifying on the label keeps both classes balanced
# across the two sets, and the fixed seed makes the split reproducible.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=train_csv['real_text_id'],
)
train_df, val_df = train_test_split(train_csv, **split_options)

# Quick check: sizes first, then the label balance of each split.
report = [
    ("Train shape:", train_df.shape),
    ("Validation shape:", val_df.shape),
    ("Train label distribution:\n", train_df['real_text_id'].value_counts()),
    ("Validation label distribution:\n", val_df['real_text_id'].value_counts()),
]
for label, value in report:
    print(label, value)

# Persist both splits as CSV (without the index) for the downstream notebooks.
PROCESSED_PATH = Path("../data/processed")
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)
for filename, split_df in (("train_cleaned.csv", train_df), ("val_cleaned.csv", val_df)):
    split_df.to_csv(PROCESSED_PATH / filename, index=False)

print("Processed train/validation sets saved in data/processed/")

Train shape: (74, 4)
Validation shape: (19, 4)
Train label distribution:
 real_text_id
2    38
1    36
Name: count, dtype: int64
Validation label distribution:
 real_text_id
2    10
1     9
Name: count, dtype: int64
Processed train/validation sets saved in data/processed/


# End of Data Preparation notebook