In [4]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
fake=pd.read_csv('../data/Fake.csv')
real=pd.read_csv('../data/True.csv')

In [8]:
fake['label']='fake'
real['label']='real'

df = pd.concat([fake, real]).sample(frac=1).reset_index(drop=True)

In [9]:
df.count()


title      44898
text       44898
subject    44898
date       44898
label      44898
dtype: int64

In [10]:
import re
import spacy
from nltk.corpus import stopwords
import string

In [None]:
# Load spacy model
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove non-alphabetic characters (keep spaces)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stopwords using nltk
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatization using spacy
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc])
    return text


In [8]:
# Run preprocessing on all samples
df['processed_title'] = df['title'].apply(preprocess_text)
df['processed_text'] = df['text'].apply(preprocess_text)
df['processed_subject'] = df['subject'].apply(preprocess_text)
df['processed_date'] = df['date'].astype(str).apply(preprocess_text)

processed_df = df[['processed_title', 'processed_text', 'processed_subject', 'processed_date', 'label']]

processed_df.head()

Unnamed: 0,processed_title,processed_text,processed_subject,processed_date,label
0,lindsey graham call party say republicans be n...,sc senator lindsey graham shine beacon republi...,news,january,fake
1,trump take second crack pivot next week appren...,washington reuters president donald trump beco...,politicsnew,june,real
2,saudi king say kingdom make progress tackle te...,mecca saudi arabia reuters saudi king salman r...,worldnew,september,real
3,trump support republican tax overhaul bill adv...,washington reuters us president donald trump s...,politicsnew,november,real
4,boom clinton rap pay foundation key nation video,,politic,nov,fake


In [9]:
from nltk.tokenize import word_tokenize

# Tokenize processed columns in the sample dataframe
processed_df['tokens_title'] = processed_df['processed_title'].apply(word_tokenize)
processed_df['tokens_text'] = processed_df['processed_text'].apply(word_tokenize)
processed_df['tokens_subject'] = processed_df['processed_subject'].apply(word_tokenize)
processed_df['tokens_date'] = processed_df['processed_date'].apply(word_tokenize)

# Display tokenized sample
processed_df[['tokens_title', 'tokens_text', 'tokens_subject', 'tokens_date', 'label']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df['tokens_title'] = processed_df['processed_title'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df['tokens_text'] = processed_df['processed_text'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df['tokens_text'] = processed

Unnamed: 0,tokens_title,tokens_text,tokens_subject,tokens_date,label
0,"[lindsey, graham, call, party, say, republican...","[sc, senator, lindsey, graham, shine, beacon, ...",[news],[january],fake
1,"[trump, take, second, crack, pivot, next, week...","[washington, reuters, president, donald, trump...",[politicsnew],[june],real
2,"[saudi, king, say, kingdom, make, progress, ta...","[mecca, saudi, arabia, reuters, saudi, king, s...",[worldnew],[september],real
3,"[trump, support, republican, tax, overhaul, bi...","[washington, reuters, us, president, donald, t...",[politicsnew],[november],real
4,"[boom, clinton, rap, pay, foundation, key, nat...",[],[politic],[nov],fake


In [None]:
import pickle

# Join tokens back to string for vectorization
processed_df['text_for_vectorizer'] = processed_df['tokens_text'].apply(lambda tokens: ' '.join(tokens))

# Vectorize the text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_df['text_for_vectorizer'])
y = processed_df['label'].map({'fake': 0, 'real': 1})  # Encode labels

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
model = LogisticRegression(class_weight='balanced',max_iter=1000)
model.fit(X_train, y_train)

# Save vectorizer and model to disk
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Evaluate model
score = model.score(X_test, y_test)
print(f"Test Accuracy: {score:.4f}")