In [34]:
import pandas as pd
import numpy as np
import regex as re
from sklearn.svm import LinearSVC
# library for natural processing language
import nltk
# Python's string module (used to remove punctuation like ! . ,)
import string
# gets list of stopwords(like, the, is, and )
from nltk.corpus import stopwords
# portStemmer cuts words to their base (e.g., "running" → "run")
# WordNetlemmatizer converts word to root form (e.g., "better" → "good")
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Split sentence into words
from nltk.tokenize import word_tokenize
# convert text data into numerical feature vectors using TF-IDF (Term Frequency–Inverse Document Frequency).
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import joblib



In [14]:
# Load both CSV files
fake_df = pd.read_csv("data2/Fake.csv")
real_df = pd.read_csv("data2/True.csv")
fake_df.head()

# #  first 5 rows
# print("Fake News:")
# print(fake_df.head())

# print("\nReal News:")
# print(real_df.head())

# # Check shape
# print("Fake shape:", fake_df.shape)
# print("Real shape:", real_df.shape)

# # Check for null values
# print("Fake nulls:\n", fake_df.isnull().sum())
# print("Real nulls:\n", real_df.isnull().sum())

# # Check column names
# print("Fake columns:", fake_df.columns)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [35]:
# Label fake news as 0
fake_df["label"] = 0

# Label real news as 1
real_df["label"] = 1

# combining fake news , real news
data = pd.concat([fake_df,real_df],axis=0)
data = data.sample(frac=1).reset_index(drop=True)

# View sample combined data
print(data.head())

                                               title  \
0  Police say suspicious package at Frankfurt Chr...   
1   Senator Delivers SCATHING Rebuke To His GOP C...   
2  Australian PM says nine foreigners among those...   
3   Donald Trump Just Attacked A Crying Baby At H...   
4   With Just 2 Words, Some Muslim Men GLORIOUSLY...   

                                                text    subject  \
0  FRANKFURT (Reuters) - A suspicious package tha...  worldnews   
1  It s been a tragic, but infuriating day for Am...       News   
2  MELBOURNE (Reuters) - Nearly half the 19 peopl...  worldnews   
3  You know the age-old political trope that poli...       News   
4  Did you know that there are two words Muslims ...       News   

                 date  label  
0  December 21, 2017       1  
1    November 5, 2017      0  
2  December 21, 2017       1  
3      August 2, 2016      0  
4     August 12, 2016      0  


In [36]:

# Save to CSV (optional)
data.to_csv("combined_news.csv", index=False)


In [37]:
data['content'] = data['title'] + " " + data['text']


In [38]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove content inside brackets [like this]
    text = re.sub(r'\[[^]]*\]', '', text)
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Join tokens back to string
    return ' '.join(tokens)

# cleaning the actual data
data['text_cleaned'] = data['content'].apply(clean_text)



In [39]:
data = data[['text_cleaned', 'label']]
data.head()


Unnamed: 0,text_cleaned,label
0,police say suspicious package frankfurt christ...,1
1,senator delivers scathing rebuke gop colleague...,0
2,australian pm say nine foreigner among hurt dr...,1
3,donald trump attacked cry baby virginia rally ...,0
4,2 word muslim men gloriously exposed ‘ christi...,0


In [40]:
# extarct feartures
tfidf = TfidfVectorizer(max_features=5000)

X = tfidf.fit_transform(data['text_cleaned'])

print(tfidf.get_feature_names_out()[:10])  # Shows the first 10 features
print(X.shape)  # Rows = number of documents, columns = number of features


['10' '100' '1000' '10000' '100000' '11' '12' '120' '13' '14']
(44898, 5000)


In [41]:
y = data['label']
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
# Create the model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9878619153674832
Confusion Matrix:
 [[4678   65]
 [  44 4193]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4743
           1       0.98      0.99      0.99      4237

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [46]:
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(tfidf, 'vectorizer.pkl')
print("✅ Model and vectorizer saved successfully.")



✅ Model and vectorizer saved successfully.
