# Practical 14: Classify fake vs. real news from headlines using NLP methods

In [None]:
import pandas as pd

In [None]:
# Locations of the two input CSVs; each file is read into its own DataFrame.
fake_csv_path = '/content/Fake.csv'
real_csv_path = '/content/True.csv'

fake = pd.read_csv(fake_csv_path)
real = pd.read_csv(real_csv_path)

In [None]:
# Tag every row with its class: 0 = fake, 1 = real.
fake['label'] = 0
real['label'] = 1

# Only the headline text and its label are kept for modelling.
keep_columns = ['title', 'label']
combined = pd.concat([fake[keep_columns], real[keep_columns]], ignore_index=True)

# Drop rows with a missing title/label, then shuffle all rows with a fixed
# seed and renumber the index from 0.
df = (
    combined
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
import re

def clean_text(text):
    """Normalize a headline before it is vectorized.

    Every character that is not an ASCII letter or whitespace is deleted,
    then the result is lower-cased and stripped of surrounding whitespace.

    Args:
        text: The headline. Non-string values are converted with ``str()``
            first so that a numeric (or otherwise non-string) cell does not
            raise ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # Characters are removed, not replaced by a space, so "U.S." becomes "us".
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return text.lower().strip()

# Store the cleaned headline in a new column; the raw `title` column is kept as-is.
df['clean_title'] = df['title'].map(clean_text)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X = vectorizer.fit_transform(df['clean_title'])
y = df['label']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Train a logistic-regression classifier with default settings on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out split: overall accuracy plus the per-class report.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9474387527839644
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      4710
           1       0.93      0.96      0.95      4270

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980



In [None]:
import numpy as np

# Vocabulary terms, positionally aligned with the model's coefficient vector.
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_[0]

# np.argsort orders indices by ascending coefficient. The 10 lowest
# coefficients are reported as fake-news indicators (label 0) and the 10
# highest as real-news indicators (label 1). Because of the ascending order,
# the fake list starts with its strongest word while the real list ends with
# its strongest word.
ranked = np.argsort(coefs)
top_fake = ranked[:10]
top_real = ranked[-10:]

print("Top indicative words for fake news:")
print(list(feature_names[top_fake]))
print("\nTop indicative words for real news:")
print(list(feature_names[top_real]))


Top indicative words for fake news:
['video', 'watch', 'breaking', 'just', 'hillary', 'gop', 'muslim', 'tweets', 'obamas', 'racist']

Top indicative words for real news:
['lawmakers', 'north', 'pm', 'china', 'urges', 'talks', 'senate', 'house', 'factbox', 'says']


In [None]:
import re

# Custom headline(s) to classify, held in a list so they can be vectorized together
test_headline = [
    "NASA discovers signs of ancient microbial life on Mars",
]

# Clean the headline
def clean_text(text):
    """Return `text` reduced to lower-case ASCII letters and whitespace.

    The input is passed through ``str()``, every character other than
    ``a-z``/``A-Z`` or whitespace is deleted, and the result is lower-cased
    and stripped of surrounding whitespace. This is the same normalization
    the training titles went through, so new headlines produce tokens in the
    form the vectorizer expects.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return letters_only.lower().strip()

# Normalize each custom headline exactly as the training titles were normalized.
cleaned_headline = [clean_text(headline) for headline in test_headline]

# Reuse the already-fitted vectorizer: transform() only (no refit), so the
# feature columns line up with the ones the model was trained on.
X_new = vectorizer.transform(cleaned_headline)

# Predict every headline in the list (previously only the first prediction
# was reported). One line is printed per headline, in input order;
# label 1 means real, anything else means fake.
for prediction in model.predict(X_new):
    label = "Real" if prediction == 1 else "Fake"
    print(f"The model predicts this headline is: {label}")

The model predicts this headline is: Real
