# Fake News Detection - Training Notebook

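Run this notebook in Google Colab to train a TF-IDF + logistic-regression fake-news classifier. Upload `Fake.csv` and `True.csv` first; the final cell writes `model.pkl` and `vectorizer.pkl` for download.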

In [None]:
# 1. Install Dependencies
!pip install pandas scikit-learn nltk

In [None]:
# 2. Import Libraries
import pandas as pd
import numpy as np
import re
import string
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords

# Download NLTK data
# Fetches the NLTK 'stopwords' resource requested by name below.
# NOTE(review): `stopwords` is imported above but is not referenced in the cells
# visible here - confirm whether this download is still needed.
nltk.download('stopwords')

In [None]:
# 3. Load Dataset
# Ensure you have uploaded 'Fake.csv' and 'True.csv' to the Colab files section.
# Reads both CSV files from the current working directory into `df_fake` and `df_true`.
try:
    df_fake = pd.read_csv("Fake.csv")
    df_true = pd.read_csv("True.csv")
except FileNotFoundError:
    # Print the actionable hint, then re-raise: every later cell needs both frames,
    # so continuing would only fail further down with an unrelated NameError.
    print("Error: 'Fake.csv' and 'True.csv' not found. Please upload them to Colab.")
    raise
else:
    print("Datasets loaded successfully.")

In [None]:
# 4. Preprocessing
# Label each source: 0 = fake, 1 = true.
df_fake["class"] = 0
df_true["class"] = 1

# Hold out the last rows of each source for manual testing (optional).
# The rows are selected by position instead of dropping hard-coded index labels,
# so this works for any dataset size, and `df_fake` / `df_true` are not shrunk in
# place, which lets this cell be re-run without a KeyError.
MANUAL_TEST_ROWS = 10  # must be > 0 (a slice ending at -0 would be empty)
df_fake_manual_testing = df_fake.tail(MANUAL_TEST_ROWS)
df_true_manual_testing = df_true.tail(MANUAL_TEST_ROWS)

# Merge everything except the held-out rows
df_marge = pd.concat(
    [df_fake.iloc[:-MANUAL_TEST_ROWS], df_true.iloc[:-MANUAL_TEST_ROWS]],
    axis=0,
)

# Drop unused columns
df = df_marge.drop(["title", "subject", "date"], axis=1)

# Shuffle with a fixed seed so the row order is reproducible across runs,
# and discard the old index instead of materialising it and dropping it again.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

def wordopt(text):
    """Normalise one document for TF-IDF vectorisation.

    Steps, in order: lower-case; remove ``[...]`` spans; remove URLs
    (``http(s)://...`` and ``www....``); remove ``<...>`` tags; replace every
    non-word character with a space; remove underscores; remove every token
    that contains a digit.

    URL and tag removal run *before* the non-word substitution. If they ran
    after it, the ``:``, ``/``, ``.``, ``<`` and ``>`` characters those
    patterns rely on would already have been turned into spaces, so the
    patterns could never match.

    Args:
        text: The raw document. Non-string values (e.g. ``None`` or a NaN
            from a missing CSV cell) are treated as an empty document.

    Returns:
        The cleaned string. Whitespace is not collapsed, so runs of spaces
        may remain where characters were removed.

    NOTE(review): text passed to the fitted vectorizer at prediction time
    presumably needs this exact same cleaning - confirm against the
    inference code.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here.
    text = re.sub(r'\W', ' ', text)
    # After the previous step the only punctuation character left is '_'.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Apply the text cleaner to every document in the merged frame.
print("Cleaning text...")
df["text"] = df["text"].apply(wordopt)

# Split
# Features are the cleaned text, targets are the 0/1 class labels.
x = df["text"]
y = df["class"]
# 25% of the rows are held out for evaluation. The fixed random_state makes the
# split, and therefore the reported metrics, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)
print("Data preprocessed and split.")

In [None]:
# 5. Vectorization and Training
# The TF-IDF vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
print("Vectorizing...")
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# `fit` returns the fitted estimator, so construction and training are chained.
print("Training Logistic Regression...")
LR = LogisticRegression().fit(xv_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class report.
print("Evaluating...")
pred_lr = LR.predict(xv_test)
test_accuracy = accuracy_score(y_test, pred_lr)
test_report = classification_report(y_test, pred_lr)
print("Accuracy:", test_accuracy)
print(test_report)

In [None]:
# 6. Save Model
# Pickle the trained classifier and the fitted vectorizer into separate files,
# in that order, in the current working directory.
for artifact_path, artifact in (('model.pkl', LR), ('vectorizer.pkl', vectorization)):
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)

print("Done! Download 'model.pkl' and 'vectorizer.pkl' from the files section.")