# In [None]:

# 1. Import libraries

import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# 2. Load training metadata
# train.csv is read as-is; later steps use its "id" and "real_text_id" columns.

train_path = "/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv"
train_df = pd.read_csv(train_path)

print("Train shape:", train_df.shape)
print(train_df.head())

# 3. Load test dataset (loop through txt files)
# Every sub-directory of test_dir is treated as one article, and every file
# inside it becomes one row: (article_id, file_id, text).

test_dir = "/kaggle/input/fake-or-real-the-impostor-hunt/data/test"

test_data = []
for article_id in os.listdir(test_dir):
    article_path = os.path.join(test_dir, article_id)
    # Skip anything at the top level that is not an article folder.
    if not os.path.isdir(article_path):
        continue
    for file_name in os.listdir(article_path):
        with open(os.path.join(article_path, file_name), "r", encoding="utf-8") as handle:
            test_data.append(
                {
                    "article_id": article_id,
                    "file_id": file_name,
                    "text": handle.read(),
                }
            )

test_df = pd.DataFrame(test_data)
print("Test shape:", test_df.shape)
print(test_df.head())


# 4. Build text dataset for training
#
# Each row of train.csv names an article ("id") and which of its two files
# ("real_text_id", 1 or 2) holds the real text. Expand that into two labelled
# rows per article: the real text (label 1) followed by the other file
# (label 0).
#
# The texts must come from the *training* folder. Looking them up in test_df
# would silently pair the training labels with unrelated test articles,
# because both folders use the same article_XXXX / file_N.txt naming.
# NOTE(review): assumes the training texts sit in a "train" folder next to
# train.csv, mirroring the "test" folder layout - confirm against the dataset.
train_dir = os.path.join(os.path.dirname(train_path), "train")

train_full = []
for raw_id, raw_real_id in zip(train_df["id"], train_df["real_text_id"]):
    article_num = int(raw_id)
    real_id = int(raw_real_id)
    if real_id not in (1, 2):
        raise ValueError(
            f"Unexpected real_text_id {raw_real_id!r} for article {article_num}"
        )

    article_dir = os.path.join(train_dir, f"article_{article_num:04d}")

    # Real file first, then the *other* file (3 - 1 == 2, 3 - 2 == 1).
    for file_no in (real_id, 3 - real_id):
        file_path = os.path.join(article_dir, f"file_{file_no}.txt")
        with open(file_path, "r", encoding="utf-8") as f:
            article_text = f.read()
        train_full.append(
            {
                "id": article_num,
                "text": article_text,
                "label": 1 if file_no == real_id else 0,
            }
        )

train_full = pd.DataFrame(train_full)
print("Expanded train shape:", train_full.shape)
print(train_full.head())

# 5. TF-IDF + Logistic Regression
#
# Fit a TF-IDF vectorizer and a logistic-regression classifier on the
# labelled texts (label 1 = real, 0 = fake) and report hold-out accuracy.

X = train_full["text"].fillna("")
y = train_full["label"]

# train_full holds two rows per article (its real and its fake text). Split by
# article id so both rows of an article stay on the same side; a plain
# row-wise split lets the validation set contain the counterpart of a text the
# model was trained on, so the score no longer reflects unseen articles.
# Each article contributes one row of each label, so both sides stay balanced.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(X, y, groups=train_full["id"]))
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# The vocabulary is learned from the training split only and reused for
# validation (and later for the test texts).
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)

model = LogisticRegression(max_iter=300, random_state=42)
model.fit(X_train_tfidf, y_train)

val_preds = model.predict(X_val_tfidf)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))


# 6. Predict real text for each test article
#
# For every article, score each of its files with the classifier and pick the
# file with the highest predicted probability of being real.

submission_rows = []
for aid, group in test_df.groupby("article_id"):
    candidate_files = list(group["file_id"])
    features = tfidf.transform(list(group["text"]))

    # Column 1 of predict_proba is the probability of label 1 (real).
    real_proba = model.predict_proba(features)[:, 1]
    winner = candidate_files[np.argmax(real_proba)]

    # Convert file_1.txt -> 1, anything else -> 2
    if "file_1" in winner:
        chosen_id = 1
    else:
        chosen_id = 2

    submission_rows.append(
        {
            "id": int(aid.replace("article_", "")),
            "real_text_id": chosen_id,
        }
    )

submission = pd.DataFrame(submission_rows).sort_values("id")
submission.to_csv("submission.csv", index=False)

print(" submission.csv saved")
print(submission.head())
